Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 6d4c49a2

History | View | Annotate | Download (111 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7
 *
8
 * This file is part of FFmpeg.
9
 *
10
 * FFmpeg is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU Lesser General Public
12
 * License as published by the Free Software Foundation; either
13
 * version 2.1 of the License, or (at your option) any later version.
14
 *
15
 * FFmpeg is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * Lesser General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU Lesser General Public
21
 * License along with FFmpeg; if not, write to the Free Software
22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
 */
24

    
25
/**
26
 * @file
27
 * DSP utils
28
 */
29

    
30
#include "libavutil/imgutils.h"
31
#include "avcodec.h"
32
#include "dsputil.h"
33
#include "simple_idct.h"
34
#include "faandct.h"
35
#include "faanidct.h"
36
#include "mathops.h"
37
#include "mpegvideo.h"
38
#include "config.h"
39
#include "ac3dec.h"
40
#include "vorbis.h"
41

    
42
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
43
uint32_t ff_squareTbl[512] = {0, };
44

    
45
#define BIT_DEPTH 9
46
#include "dsputil_internal.h"
47
#undef BIT_DEPTH
48

    
49
#define BIT_DEPTH 10
50
#include "dsputil_internal.h"
51
#undef BIT_DEPTH
52

    
53
#define BIT_DEPTH 8
54
#include "dsputil_internal.h"
55

    
56
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
57
#define pb_7f (~0UL/255 * 0x7f)
58
#define pb_80 (~0UL/255 * 0x80)
59

    
60
const uint8_t ff_zigzag_direct[64] = {
61
    0,   1,  8, 16,  9,  2,  3, 10,
62
    17, 24, 32, 25, 18, 11,  4,  5,
63
    12, 19, 26, 33, 40, 48, 41, 34,
64
    27, 20, 13,  6,  7, 14, 21, 28,
65
    35, 42, 49, 56, 57, 50, 43, 36,
66
    29, 22, 15, 23, 30, 37, 44, 51,
67
    58, 59, 52, 45, 38, 31, 39, 46,
68
    53, 60, 61, 54, 47, 55, 62, 63
69
};
70

    
71
/* Specific zigzag scan for 248 idct. NOTE that unlike the
72
   specification, we interleave the fields */
73
const uint8_t ff_zigzag248_direct[64] = {
74
     0,  8,  1,  9, 16, 24,  2, 10,
75
    17, 25, 32, 40, 48, 56, 33, 41,
76
    18, 26,  3, 11,  4, 12, 19, 27,
77
    34, 42, 49, 57, 50, 58, 35, 43,
78
    20, 28,  5, 13,  6, 14, 21, 29,
79
    36, 44, 51, 59, 52, 60, 37, 45,
80
    22, 30,  7, 15, 23, 31, 38, 46,
81
    53, 61, 54, 62, 39, 47, 55, 63,
82
};
83

    
84
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
85
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
86

    
87
const uint8_t ff_alternate_horizontal_scan[64] = {
88
    0,  1,   2,  3,  8,  9, 16, 17,
89
    10, 11,  4,  5,  6,  7, 15, 14,
90
    13, 12, 19, 18, 24, 25, 32, 33,
91
    26, 27, 20, 21, 22, 23, 28, 29,
92
    30, 31, 34, 35, 40, 41, 48, 49,
93
    42, 43, 36, 37, 38, 39, 44, 45,
94
    46, 47, 50, 51, 56, 57, 58, 59,
95
    52, 53, 54, 55, 60, 61, 62, 63,
96
};
97

    
98
const uint8_t ff_alternate_vertical_scan[64] = {
99
    0,  8,  16, 24,  1,  9,  2, 10,
100
    17, 25, 32, 40, 48, 56, 57, 49,
101
    41, 33, 26, 18,  3, 11,  4, 12,
102
    19, 27, 34, 42, 50, 58, 35, 43,
103
    51, 59, 20, 28,  5, 13,  6, 14,
104
    21, 29, 36, 44, 52, 60, 37, 45,
105
    53, 61, 22, 30,  7, 15, 23, 31,
106
    38, 46, 54, 62, 39, 47, 55, 63,
107
};
108

    
109
/* Input permutation for the simple_idct_mmx */
110
static const uint8_t simple_mmx_permutation[64]={
111
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
112
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
113
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
114
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
115
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
116
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
117
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
118
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
119
};
120

    
121
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
122

    
123
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
124
    int i;
125
    int end;
126

    
127
    st->scantable= src_scantable;
128

    
129
    for(i=0; i<64; i++){
130
        int j;
131
        j = src_scantable[i];
132
        st->permutated[i] = permutation[j];
133
#if ARCH_PPC
134
        st->inverse[j] = i;
135
#endif
136
    }
137

    
138
    end=-1;
139
    for(i=0; i<64; i++){
140
        int j;
141
        j = st->permutated[i];
142
        if(j>end) end=j;
143
        st->raster_end[i]= end;
144
    }
145
}
146

    
147
static int pix_sum_c(uint8_t * pix, int line_size)
148
{
149
    int s, i, j;
150

    
151
    s = 0;
152
    for (i = 0; i < 16; i++) {
153
        for (j = 0; j < 16; j += 8) {
154
            s += pix[0];
155
            s += pix[1];
156
            s += pix[2];
157
            s += pix[3];
158
            s += pix[4];
159
            s += pix[5];
160
            s += pix[6];
161
            s += pix[7];
162
            pix += 8;
163
        }
164
        pix += line_size - 16;
165
    }
166
    return s;
167
}
168

    
169
static int pix_norm1_c(uint8_t * pix, int line_size)
170
{
171
    int s, i, j;
172
    uint32_t *sq = ff_squareTbl + 256;
173

    
174
    s = 0;
175
    for (i = 0; i < 16; i++) {
176
        for (j = 0; j < 16; j += 8) {
177
#if 0
178
            s += sq[pix[0]];
179
            s += sq[pix[1]];
180
            s += sq[pix[2]];
181
            s += sq[pix[3]];
182
            s += sq[pix[4]];
183
            s += sq[pix[5]];
184
            s += sq[pix[6]];
185
            s += sq[pix[7]];
186
#else
187
#if LONG_MAX > 2147483647
188
            register uint64_t x=*(uint64_t*)pix;
189
            s += sq[x&0xff];
190
            s += sq[(x>>8)&0xff];
191
            s += sq[(x>>16)&0xff];
192
            s += sq[(x>>24)&0xff];
193
            s += sq[(x>>32)&0xff];
194
            s += sq[(x>>40)&0xff];
195
            s += sq[(x>>48)&0xff];
196
            s += sq[(x>>56)&0xff];
197
#else
198
            register uint32_t x=*(uint32_t*)pix;
199
            s += sq[x&0xff];
200
            s += sq[(x>>8)&0xff];
201
            s += sq[(x>>16)&0xff];
202
            s += sq[(x>>24)&0xff];
203
            x=*(uint32_t*)(pix+4);
204
            s += sq[x&0xff];
205
            s += sq[(x>>8)&0xff];
206
            s += sq[(x>>16)&0xff];
207
            s += sq[(x>>24)&0xff];
208
#endif
209
#endif
210
            pix += 8;
211
        }
212
        pix += line_size - 16;
213
    }
214
    return s;
215
}
216

    
217
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
218
    int i;
219

    
220
    for(i=0; i+8<=w; i+=8){
221
        dst[i+0]= av_bswap32(src[i+0]);
222
        dst[i+1]= av_bswap32(src[i+1]);
223
        dst[i+2]= av_bswap32(src[i+2]);
224
        dst[i+3]= av_bswap32(src[i+3]);
225
        dst[i+4]= av_bswap32(src[i+4]);
226
        dst[i+5]= av_bswap32(src[i+5]);
227
        dst[i+6]= av_bswap32(src[i+6]);
228
        dst[i+7]= av_bswap32(src[i+7]);
229
    }
230
    for(;i<w; i++){
231
        dst[i+0]= av_bswap32(src[i+0]);
232
    }
233
}
234

    
235
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
236
{
237
    while (len--)
238
        *dst++ = av_bswap16(*src++);
239
}
240

    
241
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
242
{
243
    int s, i;
244
    uint32_t *sq = ff_squareTbl + 256;
245

    
246
    s = 0;
247
    for (i = 0; i < h; i++) {
248
        s += sq[pix1[0] - pix2[0]];
249
        s += sq[pix1[1] - pix2[1]];
250
        s += sq[pix1[2] - pix2[2]];
251
        s += sq[pix1[3] - pix2[3]];
252
        pix1 += line_size;
253
        pix2 += line_size;
254
    }
255
    return s;
256
}
257

    
258
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
259
{
260
    int s, i;
261
    uint32_t *sq = ff_squareTbl + 256;
262

    
263
    s = 0;
264
    for (i = 0; i < h; i++) {
265
        s += sq[pix1[0] - pix2[0]];
266
        s += sq[pix1[1] - pix2[1]];
267
        s += sq[pix1[2] - pix2[2]];
268
        s += sq[pix1[3] - pix2[3]];
269
        s += sq[pix1[4] - pix2[4]];
270
        s += sq[pix1[5] - pix2[5]];
271
        s += sq[pix1[6] - pix2[6]];
272
        s += sq[pix1[7] - pix2[7]];
273
        pix1 += line_size;
274
        pix2 += line_size;
275
    }
276
    return s;
277
}
278

    
279
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
280
{
281
    int s, i;
282
    uint32_t *sq = ff_squareTbl + 256;
283

    
284
    s = 0;
285
    for (i = 0; i < h; i++) {
286
        s += sq[pix1[ 0] - pix2[ 0]];
287
        s += sq[pix1[ 1] - pix2[ 1]];
288
        s += sq[pix1[ 2] - pix2[ 2]];
289
        s += sq[pix1[ 3] - pix2[ 3]];
290
        s += sq[pix1[ 4] - pix2[ 4]];
291
        s += sq[pix1[ 5] - pix2[ 5]];
292
        s += sq[pix1[ 6] - pix2[ 6]];
293
        s += sq[pix1[ 7] - pix2[ 7]];
294
        s += sq[pix1[ 8] - pix2[ 8]];
295
        s += sq[pix1[ 9] - pix2[ 9]];
296
        s += sq[pix1[10] - pix2[10]];
297
        s += sq[pix1[11] - pix2[11]];
298
        s += sq[pix1[12] - pix2[12]];
299
        s += sq[pix1[13] - pix2[13]];
300
        s += sq[pix1[14] - pix2[14]];
301
        s += sq[pix1[15] - pix2[15]];
302

    
303
        pix1 += line_size;
304
        pix2 += line_size;
305
    }
306
    return s;
307
}
308

    
309
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
310
{
311
    int i;
312

    
313
    /* read the pixels */
314
    for(i=0;i<8;i++) {
315
        block[0] = pixels[0];
316
        block[1] = pixels[1];
317
        block[2] = pixels[2];
318
        block[3] = pixels[3];
319
        block[4] = pixels[4];
320
        block[5] = pixels[5];
321
        block[6] = pixels[6];
322
        block[7] = pixels[7];
323
        pixels += line_size;
324
        block += 8;
325
    }
326
}
327

    
328
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
329
                          const uint8_t *s2, int stride){
330
    int i;
331

    
332
    /* read the pixels */
333
    for(i=0;i<8;i++) {
334
        block[0] = s1[0] - s2[0];
335
        block[1] = s1[1] - s2[1];
336
        block[2] = s1[2] - s2[2];
337
        block[3] = s1[3] - s2[3];
338
        block[4] = s1[4] - s2[4];
339
        block[5] = s1[5] - s2[5];
340
        block[6] = s1[6] - s2[6];
341
        block[7] = s1[7] - s2[7];
342
        s1 += stride;
343
        s2 += stride;
344
        block += 8;
345
    }
346
}
347

    
348

    
349
void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
350
                             int line_size)
351
{
352
    int i;
353
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
354

    
355
    /* read the pixels */
356
    for(i=0;i<8;i++) {
357
        pixels[0] = cm[block[0]];
358
        pixels[1] = cm[block[1]];
359
        pixels[2] = cm[block[2]];
360
        pixels[3] = cm[block[3]];
361
        pixels[4] = cm[block[4]];
362
        pixels[5] = cm[block[5]];
363
        pixels[6] = cm[block[6]];
364
        pixels[7] = cm[block[7]];
365

    
366
        pixels += line_size;
367
        block += 8;
368
    }
369
}
370

    
371
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
372
                                 int line_size)
373
{
374
    int i;
375
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
376

    
377
    /* read the pixels */
378
    for(i=0;i<4;i++) {
379
        pixels[0] = cm[block[0]];
380
        pixels[1] = cm[block[1]];
381
        pixels[2] = cm[block[2]];
382
        pixels[3] = cm[block[3]];
383

    
384
        pixels += line_size;
385
        block += 8;
386
    }
387
}
388

    
389
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
390
                                 int line_size)
391
{
392
    int i;
393
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
394

    
395
    /* read the pixels */
396
    for(i=0;i<2;i++) {
397
        pixels[0] = cm[block[0]];
398
        pixels[1] = cm[block[1]];
399

    
400
        pixels += line_size;
401
        block += 8;
402
    }
403
}
404

    
405
void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
406
                                    uint8_t *restrict pixels,
407
                                    int line_size)
408
{
409
    int i, j;
410

    
411
    for (i = 0; i < 8; i++) {
412
        for (j = 0; j < 8; j++) {
413
            if (*block < -128)
414
                *pixels = 0;
415
            else if (*block > 127)
416
                *pixels = 255;
417
            else
418
                *pixels = (uint8_t)(*block + 128);
419
            block++;
420
            pixels++;
421
        }
422
        pixels += (line_size - 8);
423
    }
424
}
425

    
426
static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
427
                                    int line_size)
428
{
429
    int i;
430

    
431
    /* read the pixels */
432
    for(i=0;i<8;i++) {
433
        pixels[0] = block[0];
434
        pixels[1] = block[1];
435
        pixels[2] = block[2];
436
        pixels[3] = block[3];
437
        pixels[4] = block[4];
438
        pixels[5] = block[5];
439
        pixels[6] = block[6];
440
        pixels[7] = block[7];
441

    
442
        pixels += line_size;
443
        block += 8;
444
    }
445
}
446

    
447
void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
448
                             int line_size)
449
{
450
    int i;
451
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
452

    
453
    /* read the pixels */
454
    for(i=0;i<8;i++) {
455
        pixels[0] = cm[pixels[0] + block[0]];
456
        pixels[1] = cm[pixels[1] + block[1]];
457
        pixels[2] = cm[pixels[2] + block[2]];
458
        pixels[3] = cm[pixels[3] + block[3]];
459
        pixels[4] = cm[pixels[4] + block[4]];
460
        pixels[5] = cm[pixels[5] + block[5]];
461
        pixels[6] = cm[pixels[6] + block[6]];
462
        pixels[7] = cm[pixels[7] + block[7]];
463
        pixels += line_size;
464
        block += 8;
465
    }
466
}
467

    
468
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
469
                          int line_size)
470
{
471
    int i;
472
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
473

    
474
    /* read the pixels */
475
    for(i=0;i<4;i++) {
476
        pixels[0] = cm[pixels[0] + block[0]];
477
        pixels[1] = cm[pixels[1] + block[1]];
478
        pixels[2] = cm[pixels[2] + block[2]];
479
        pixels[3] = cm[pixels[3] + block[3]];
480
        pixels += line_size;
481
        block += 8;
482
    }
483
}
484

    
485
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
486
                          int line_size)
487
{
488
    int i;
489
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
490

    
491
    /* read the pixels */
492
    for(i=0;i<2;i++) {
493
        pixels[0] = cm[pixels[0] + block[0]];
494
        pixels[1] = cm[pixels[1] + block[1]];
495
        pixels += line_size;
496
        block += 8;
497
    }
498
}
499

    
500
static int sum_abs_dctelem_c(DCTELEM *block)
501
{
502
    int sum=0, i;
503
    for(i=0; i<64; i++)
504
        sum+= FFABS(block[i]);
505
    return sum;
506
}
507

    
508
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
509
{
510
    int i;
511

    
512
    for (i = 0; i < h; i++) {
513
        memset(block, value, 16);
514
        block += line_size;
515
    }
516
}
517

    
518
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
519
{
520
    int i;
521

    
522
    for (i = 0; i < h; i++) {
523
        memset(block, value, 8);
524
        block += line_size;
525
    }
526
}
527

    
528
static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
529
{
530
    int i, j;
531
    uint16_t *dst1 = (uint16_t *) dst;
532
    uint16_t *dst2 = (uint16_t *)(dst + linesize);
533

    
534
    for (j = 0; j < 8; j++) {
535
        for (i = 0; i < 8; i++) {
536
            dst1[i] = dst2[i] = src[i] * 0x0101;
537
        }
538
        src  += 8;
539
        dst1 += linesize;
540
        dst2 += linesize;
541
    }
542
}
543

    
544
#define avg2(a,b) ((a+b+1)>>1)
545
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
546

    
547
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
548
{
549
    const int A=(16-x16)*(16-y16);
550
    const int B=(   x16)*(16-y16);
551
    const int C=(16-x16)*(   y16);
552
    const int D=(   x16)*(   y16);
553
    int i;
554

    
555
    for(i=0; i<h; i++)
556
    {
557
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
558
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
559
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
560
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
561
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
562
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
563
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
564
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
565
        dst+= stride;
566
        src+= stride;
567
    }
568
}
569

    
570
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
571
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
572
{
573
    int y, vx, vy;
574
    const int s= 1<<shift;
575

    
576
    width--;
577
    height--;
578

    
579
    for(y=0; y<h; y++){
580
        int x;
581

    
582
        vx= ox;
583
        vy= oy;
584
        for(x=0; x<8; x++){ //XXX FIXME optimize
585
            int src_x, src_y, frac_x, frac_y, index;
586

    
587
            src_x= vx>>16;
588
            src_y= vy>>16;
589
            frac_x= src_x&(s-1);
590
            frac_y= src_y&(s-1);
591
            src_x>>=shift;
592
            src_y>>=shift;
593

    
594
            if((unsigned)src_x < width){
595
                if((unsigned)src_y < height){
596
                    index= src_x + src_y*stride;
597
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
598
                                           + src[index       +1]*   frac_x )*(s-frac_y)
599
                                        + (  src[index+stride  ]*(s-frac_x)
600
                                           + src[index+stride+1]*   frac_x )*   frac_y
601
                                        + r)>>(shift*2);
602
                }else{
603
                    index= src_x + av_clip(src_y, 0, height)*stride;
604
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
605
                                          + src[index       +1]*   frac_x )*s
606
                                        + r)>>(shift*2);
607
                }
608
            }else{
609
                if((unsigned)src_y < height){
610
                    index= av_clip(src_x, 0, width) + src_y*stride;
611
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
612
                                           + src[index+stride  ]*   frac_y )*s
613
                                        + r)>>(shift*2);
614
                }else{
615
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
616
                    dst[y*stride + x]=    src[index         ];
617
                }
618
            }
619

    
620
            vx+= dxx;
621
            vy+= dyx;
622
        }
623
        ox += dxy;
624
        oy += dyy;
625
    }
626
}
627

    
628
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
629
    switch(width){
630
    case 2: put_pixels2_8_c (dst, src, stride, height); break;
631
    case 4: put_pixels4_8_c (dst, src, stride, height); break;
632
    case 8: put_pixels8_8_c (dst, src, stride, height); break;
633
    case 16:put_pixels16_8_c(dst, src, stride, height); break;
634
    }
635
}
636

    
637
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
638
    int i,j;
639
    for (i=0; i < height; i++) {
640
      for (j=0; j < width; j++) {
641
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
642
      }
643
      src += stride;
644
      dst += stride;
645
    }
646
}
647

    
648
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
649
    int i,j;
650
    for (i=0; i < height; i++) {
651
      for (j=0; j < width; j++) {
652
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
653
      }
654
      src += stride;
655
      dst += stride;
656
    }
657
}
658

    
659
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
660
    int i,j;
661
    for (i=0; i < height; i++) {
662
      for (j=0; j < width; j++) {
663
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
664
      }
665
      src += stride;
666
      dst += stride;
667
    }
668
}
669

    
670
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
671
    int i,j;
672
    for (i=0; i < height; i++) {
673
      for (j=0; j < width; j++) {
674
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
675
      }
676
      src += stride;
677
      dst += stride;
678
    }
679
}
680

    
681
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
682
    int i,j;
683
    for (i=0; i < height; i++) {
684
      for (j=0; j < width; j++) {
685
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
686
      }
687
      src += stride;
688
      dst += stride;
689
    }
690
}
691

    
692
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
693
    int i,j;
694
    for (i=0; i < height; i++) {
695
      for (j=0; j < width; j++) {
696
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
697
      }
698
      src += stride;
699
      dst += stride;
700
    }
701
}
702

    
703
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
704
    int i,j;
705
    for (i=0; i < height; i++) {
706
      for (j=0; j < width; j++) {
707
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
708
      }
709
      src += stride;
710
      dst += stride;
711
    }
712
}
713

    
714
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
715
    int i,j;
716
    for (i=0; i < height; i++) {
717
      for (j=0; j < width; j++) {
718
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
719
      }
720
      src += stride;
721
      dst += stride;
722
    }
723
}
724

    
725
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
726
    switch(width){
727
    case 2: avg_pixels2_8_c (dst, src, stride, height); break;
728
    case 4: avg_pixels4_8_c (dst, src, stride, height); break;
729
    case 8: avg_pixels8_8_c (dst, src, stride, height); break;
730
    case 16:avg_pixels16_8_c(dst, src, stride, height); break;
731
    }
732
}
733

    
734
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
735
    int i,j;
736
    for (i=0; i < height; i++) {
737
      for (j=0; j < width; j++) {
738
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
739
      }
740
      src += stride;
741
      dst += stride;
742
    }
743
}
744

    
745
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
746
    int i,j;
747
    for (i=0; i < height; i++) {
748
      for (j=0; j < width; j++) {
749
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
750
      }
751
      src += stride;
752
      dst += stride;
753
    }
754
}
755

    
756
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
757
    int i,j;
758
    for (i=0; i < height; i++) {
759
      for (j=0; j < width; j++) {
760
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
761
      }
762
      src += stride;
763
      dst += stride;
764
    }
765
}
766

    
767
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
768
    int i,j;
769
    for (i=0; i < height; i++) {
770
      for (j=0; j < width; j++) {
771
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
772
      }
773
      src += stride;
774
      dst += stride;
775
    }
776
}
777

    
778
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
779
    int i,j;
780
    for (i=0; i < height; i++) {
781
      for (j=0; j < width; j++) {
782
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
783
      }
784
      src += stride;
785
      dst += stride;
786
    }
787
}
788

    
789
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
790
    int i,j;
791
    for (i=0; i < height; i++) {
792
      for (j=0; j < width; j++) {
793
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
794
      }
795
      src += stride;
796
      dst += stride;
797
    }
798
}
799

    
800
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
801
    int i,j;
802
    for (i=0; i < height; i++) {
803
      for (j=0; j < width; j++) {
804
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
805
      }
806
      src += stride;
807
      dst += stride;
808
    }
809
}
810

    
811
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
812
    int i,j;
813
    for (i=0; i < height; i++) {
814
      for (j=0; j < width; j++) {
815
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
816
      }
817
      src += stride;
818
      dst += stride;
819
    }
820
}
821
#if 0
822
#define TPEL_WIDTH(width)\
823
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
824
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
825
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
826
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
827
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
828
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
829
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
830
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
831
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
832
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
833
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
834
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
835
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
836
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
837
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
838
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
839
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
840
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
841
#endif
842

    
843
#define QPEL_MC(r, OPNAME, RND, OP) \
844
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
845
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
846
    int i;\
847
    for(i=0; i<h; i++)\
848
    {\
849
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
850
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
851
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
852
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
853
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
854
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
855
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
856
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
857
        dst+=dstStride;\
858
        src+=srcStride;\
859
    }\
860
}\
861
\
862
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
863
    const int w=8;\
864
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
865
    int i;\
866
    for(i=0; i<w; i++)\
867
    {\
868
        const int src0= src[0*srcStride];\
869
        const int src1= src[1*srcStride];\
870
        const int src2= src[2*srcStride];\
871
        const int src3= src[3*srcStride];\
872
        const int src4= src[4*srcStride];\
873
        const int src5= src[5*srcStride];\
874
        const int src6= src[6*srcStride];\
875
        const int src7= src[7*srcStride];\
876
        const int src8= src[8*srcStride];\
877
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
878
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
879
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
880
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
881
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
882
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
883
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
884
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
885
        dst++;\
886
        src++;\
887
    }\
888
}\
889
\
890
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
891
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
892
    int i;\
893
    \
894
    for(i=0; i<h; i++)\
895
    {\
896
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
897
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
898
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
899
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
900
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
901
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
902
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
903
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
904
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
905
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
906
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
907
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
908
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
909
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
910
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
911
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
912
        dst+=dstStride;\
913
        src+=srcStride;\
914
    }\
915
}\
916
\
917
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
918
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
919
    int i;\
920
    const int w=16;\
921
    for(i=0; i<w; i++)\
922
    {\
923
        const int src0= src[0*srcStride];\
924
        const int src1= src[1*srcStride];\
925
        const int src2= src[2*srcStride];\
926
        const int src3= src[3*srcStride];\
927
        const int src4= src[4*srcStride];\
928
        const int src5= src[5*srcStride];\
929
        const int src6= src[6*srcStride];\
930
        const int src7= src[7*srcStride];\
931
        const int src8= src[8*srcStride];\
932
        const int src9= src[9*srcStride];\
933
        const int src10= src[10*srcStride];\
934
        const int src11= src[11*srcStride];\
935
        const int src12= src[12*srcStride];\
936
        const int src13= src[13*srcStride];\
937
        const int src14= src[14*srcStride];\
938
        const int src15= src[15*srcStride];\
939
        const int src16= src[16*srcStride];\
940
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
941
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
942
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
943
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
944
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
945
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
946
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
947
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
948
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
949
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
950
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
951
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
952
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
953
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
954
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
955
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
956
        dst++;\
957
        src++;\
958
    }\
959
}\
960
\
961
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
962
    uint8_t half[64];\
963
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
964
    OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
965
}\
966
\
967
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
968
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
969
}\
970
\
971
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
972
    uint8_t half[64];\
973
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
974
    OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
975
}\
976
\
977
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
978
    uint8_t full[16*9];\
979
    uint8_t half[64];\
980
    copy_block9(full, src, 16, stride, 9);\
981
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
982
    OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
983
}\
984
\
985
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
986
    uint8_t full[16*9];\
987
    copy_block9(full, src, 16, stride, 9);\
988
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
989
}\
990
\
991
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
992
    uint8_t full[16*9];\
993
    uint8_t half[64];\
994
    copy_block9(full, src, 16, stride, 9);\
995
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
996
    OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
997
}\
998
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
999
    uint8_t full[16*9];\
1000
    uint8_t halfH[72];\
1001
    uint8_t halfV[64];\
1002
    uint8_t halfHV[64];\
1003
    copy_block9(full, src, 16, stride, 9);\
1004
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1005
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1006
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1007
    OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1008
}\
1009
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1010
    uint8_t full[16*9];\
1011
    uint8_t halfH[72];\
1012
    uint8_t halfHV[64];\
1013
    copy_block9(full, src, 16, stride, 9);\
1014
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1015
    put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1016
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1017
    OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1018
}\
1019
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1020
    uint8_t full[16*9];\
1021
    uint8_t halfH[72];\
1022
    uint8_t halfV[64];\
1023
    uint8_t halfHV[64];\
1024
    copy_block9(full, src, 16, stride, 9);\
1025
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1026
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1027
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1028
    OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1029
}\
1030
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1031
    uint8_t full[16*9];\
1032
    uint8_t halfH[72];\
1033
    uint8_t halfHV[64];\
1034
    copy_block9(full, src, 16, stride, 9);\
1035
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1036
    put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1037
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1038
    OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1039
}\
1040
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1041
    uint8_t full[16*9];\
1042
    uint8_t halfH[72];\
1043
    uint8_t halfV[64];\
1044
    uint8_t halfHV[64];\
1045
    copy_block9(full, src, 16, stride, 9);\
1046
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1047
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1048
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1049
    OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1050
}\
1051
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1052
    uint8_t full[16*9];\
1053
    uint8_t halfH[72];\
1054
    uint8_t halfHV[64];\
1055
    copy_block9(full, src, 16, stride, 9);\
1056
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1057
    put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1058
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1059
    OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1060
}\
1061
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1062
    uint8_t full[16*9];\
1063
    uint8_t halfH[72];\
1064
    uint8_t halfV[64];\
1065
    uint8_t halfHV[64];\
1066
    copy_block9(full, src, 16, stride, 9);\
1067
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1068
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1069
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1070
    OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1071
}\
1072
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1073
    uint8_t full[16*9];\
1074
    uint8_t halfH[72];\
1075
    uint8_t halfHV[64];\
1076
    copy_block9(full, src, 16, stride, 9);\
1077
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1078
    put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1079
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1080
    OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1081
}\
1082
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1083
    uint8_t halfH[72];\
1084
    uint8_t halfHV[64];\
1085
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1086
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1087
    OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1088
}\
1089
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1090
    uint8_t halfH[72];\
1091
    uint8_t halfHV[64];\
1092
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1093
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1094
    OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1095
}\
1096
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1097
    uint8_t full[16*9];\
1098
    uint8_t halfH[72];\
1099
    uint8_t halfV[64];\
1100
    uint8_t halfHV[64];\
1101
    copy_block9(full, src, 16, stride, 9);\
1102
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1103
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1104
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1105
    OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1106
}\
1107
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1108
    uint8_t full[16*9];\
1109
    uint8_t halfH[72];\
1110
    copy_block9(full, src, 16, stride, 9);\
1111
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1112
    put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1113
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1114
}\
1115
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1116
    uint8_t full[16*9];\
1117
    uint8_t halfH[72];\
1118
    uint8_t halfV[64];\
1119
    uint8_t halfHV[64];\
1120
    copy_block9(full, src, 16, stride, 9);\
1121
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1122
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1123
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1124
    OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1125
}\
1126
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1127
    uint8_t full[16*9];\
1128
    uint8_t halfH[72];\
1129
    copy_block9(full, src, 16, stride, 9);\
1130
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1131
    put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1132
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1133
}\
1134
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1135
    uint8_t halfH[72];\
1136
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1137
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1138
}\
1139
\
1140
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1141
    uint8_t half[256];\
1142
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1143
    OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1144
}\
1145
\
1146
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1147
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1148
}\
1149
\
1150
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1151
    uint8_t half[256];\
1152
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1153
    OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1154
}\
1155
\
1156
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1157
    uint8_t full[24*17];\
1158
    uint8_t half[256];\
1159
    copy_block17(full, src, 24, stride, 17);\
1160
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1161
    OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1162
}\
1163
\
1164
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1165
    uint8_t full[24*17];\
1166
    copy_block17(full, src, 24, stride, 17);\
1167
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1168
}\
1169
\
1170
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1171
    uint8_t full[24*17];\
1172
    uint8_t half[256];\
1173
    copy_block17(full, src, 24, stride, 17);\
1174
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1175
    OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1176
}\
1177
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1178
    uint8_t full[24*17];\
1179
    uint8_t halfH[272];\
1180
    uint8_t halfV[256];\
1181
    uint8_t halfHV[256];\
1182
    copy_block17(full, src, 24, stride, 17);\
1183
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1184
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1185
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1186
    OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1187
}\
1188
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1189
    uint8_t full[24*17];\
1190
    uint8_t halfH[272];\
1191
    uint8_t halfHV[256];\
1192
    copy_block17(full, src, 24, stride, 17);\
1193
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1194
    put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1195
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1196
    OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1197
}\
1198
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1199
    uint8_t full[24*17];\
1200
    uint8_t halfH[272];\
1201
    uint8_t halfV[256];\
1202
    uint8_t halfHV[256];\
1203
    copy_block17(full, src, 24, stride, 17);\
1204
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1205
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1206
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1207
    OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1208
}\
1209
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1210
    uint8_t full[24*17];\
1211
    uint8_t halfH[272];\
1212
    uint8_t halfHV[256];\
1213
    copy_block17(full, src, 24, stride, 17);\
1214
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1215
    put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1216
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1217
    OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1218
}\
1219
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1220
    uint8_t full[24*17];\
1221
    uint8_t halfH[272];\
1222
    uint8_t halfV[256];\
1223
    uint8_t halfHV[256];\
1224
    copy_block17(full, src, 24, stride, 17);\
1225
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1226
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1227
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1228
    OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1229
}\
1230
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1231
    uint8_t full[24*17];\
1232
    uint8_t halfH[272];\
1233
    uint8_t halfHV[256];\
1234
    copy_block17(full, src, 24, stride, 17);\
1235
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1236
    put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1237
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1238
    OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1239
}\
1240
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1241
    uint8_t full[24*17];\
1242
    uint8_t halfH[272];\
1243
    uint8_t halfV[256];\
1244
    uint8_t halfHV[256];\
1245
    copy_block17(full, src, 24, stride, 17);\
1246
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1247
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1248
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1249
    OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1250
}\
1251
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1252
    uint8_t full[24*17];\
1253
    uint8_t halfH[272];\
1254
    uint8_t halfHV[256];\
1255
    copy_block17(full, src, 24, stride, 17);\
1256
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1257
    put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1258
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1259
    OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1260
}\
1261
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1262
    uint8_t halfH[272];\
1263
    uint8_t halfHV[256];\
1264
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1265
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1266
    OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1267
}\
1268
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1269
    uint8_t halfH[272];\
1270
    uint8_t halfHV[256];\
1271
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1272
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1273
    OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1274
}\
1275
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1276
    uint8_t full[24*17];\
1277
    uint8_t halfH[272];\
1278
    uint8_t halfV[256];\
1279
    uint8_t halfHV[256];\
1280
    copy_block17(full, src, 24, stride, 17);\
1281
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1282
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1283
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1284
    OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1285
}\
1286
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1287
    uint8_t full[24*17];\
1288
    uint8_t halfH[272];\
1289
    copy_block17(full, src, 24, stride, 17);\
1290
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1291
    put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1292
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1293
}\
1294
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1295
    uint8_t full[24*17];\
1296
    uint8_t halfH[272];\
1297
    uint8_t halfV[256];\
1298
    uint8_t halfHV[256];\
1299
    copy_block17(full, src, 24, stride, 17);\
1300
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1301
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1302
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1303
    OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1304
}\
1305
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1306
    uint8_t full[24*17];\
1307
    uint8_t halfH[272];\
1308
    copy_block17(full, src, 24, stride, 17);\
1309
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1310
    put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1311
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1312
}\
1313
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1314
    uint8_t halfH[272];\
1315
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1316
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1317
}
1318

    
1319
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1320
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1321
#define op_put(a, b) a = cm[((b) + 16)>>5]
1322
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1323

    
1324
QPEL_MC(0, put_       , _       , op_put)
1325
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1326
QPEL_MC(0, avg_       , _       , op_avg)
1327
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
1328
#undef op_avg
1329
#undef op_avg_no_rnd
1330
#undef op_put
1331
#undef op_put_no_rnd
1332

    
1333
#define put_qpel8_mc00_c  ff_put_pixels8x8_c
1334
#define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
1335
#define put_qpel16_mc00_c ff_put_pixels16x16_c
1336
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1337
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
1338
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
1339

    
1340
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1341
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1342
    int i;
1343

    
1344
    for(i=0; i<h; i++){
1345
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1346
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1347
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1348
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1349
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1350
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1351
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1352
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1353
        dst+=dstStride;
1354
        src+=srcStride;
1355
    }
1356
}
1357

    
1358
#if CONFIG_RV40_DECODER
1359
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1360
    put_pixels16_xy2_8_c(dst, src, stride, 16);
1361
}
1362
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1363
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
1364
}
1365
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1366
    put_pixels8_xy2_8_c(dst, src, stride, 8);
1367
}
1368
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1369
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
1370
}
1371
#endif /* CONFIG_RV40_DECODER */
1372

    
1373
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1374
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1375
    int i;
1376

    
1377
    for(i=0; i<w; i++){
1378
        const int src_1= src[ -srcStride];
1379
        const int src0 = src[0          ];
1380
        const int src1 = src[  srcStride];
1381
        const int src2 = src[2*srcStride];
1382
        const int src3 = src[3*srcStride];
1383
        const int src4 = src[4*srcStride];
1384
        const int src5 = src[5*srcStride];
1385
        const int src6 = src[6*srcStride];
1386
        const int src7 = src[7*srcStride];
1387
        const int src8 = src[8*srcStride];
1388
        const int src9 = src[9*srcStride];
1389
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1390
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1391
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1392
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1393
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1394
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1395
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1396
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1397
        src++;
1398
        dst++;
1399
    }
1400
}
1401

    
1402
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
1403
    uint8_t half[64];
1404
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1405
    put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
1406
}
1407

    
1408
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
1409
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
1410
}
1411

    
1412
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
1413
    uint8_t half[64];
1414
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1415
    put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
1416
}
1417

    
1418
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
1419
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
1420
}
1421

    
1422
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
1423
    uint8_t halfH[88];
1424
    uint8_t halfV[64];
1425
    uint8_t halfHV[64];
1426
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1427
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1428
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1429
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1430
}
1431
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
1432
    uint8_t halfH[88];
1433
    uint8_t halfV[64];
1434
    uint8_t halfHV[64];
1435
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1436
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1437
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1438
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1439
}
1440
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
1441
    uint8_t halfH[88];
1442
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1443
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
1444
}
1445

    
1446
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1447
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1448
    int x;
1449
    const int strength= ff_h263_loop_filter_strength[qscale];
1450

    
1451
    for(x=0; x<8; x++){
1452
        int d1, d2, ad1;
1453
        int p0= src[x-2*stride];
1454
        int p1= src[x-1*stride];
1455
        int p2= src[x+0*stride];
1456
        int p3= src[x+1*stride];
1457
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1458

    
1459
        if     (d<-2*strength) d1= 0;
1460
        else if(d<-  strength) d1=-2*strength - d;
1461
        else if(d<   strength) d1= d;
1462
        else if(d< 2*strength) d1= 2*strength - d;
1463
        else                   d1= 0;
1464

    
1465
        p1 += d1;
1466
        p2 -= d1;
1467
        if(p1&256) p1= ~(p1>>31);
1468
        if(p2&256) p2= ~(p2>>31);
1469

    
1470
        src[x-1*stride] = p1;
1471
        src[x+0*stride] = p2;
1472

    
1473
        ad1= FFABS(d1)>>1;
1474

    
1475
        d2= av_clip((p0-p3)/4, -ad1, ad1);
1476

    
1477
        src[x-2*stride] = p0 - d2;
1478
        src[x+  stride] = p3 + d2;
1479
    }
1480
    }
1481
}
1482

    
1483
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1484
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1485
    int y;
1486
    const int strength= ff_h263_loop_filter_strength[qscale];
1487

    
1488
    for(y=0; y<8; y++){
1489
        int d1, d2, ad1;
1490
        int p0= src[y*stride-2];
1491
        int p1= src[y*stride-1];
1492
        int p2= src[y*stride+0];
1493
        int p3= src[y*stride+1];
1494
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1495

    
1496
        if     (d<-2*strength) d1= 0;
1497
        else if(d<-  strength) d1=-2*strength - d;
1498
        else if(d<   strength) d1= d;
1499
        else if(d< 2*strength) d1= 2*strength - d;
1500
        else                   d1= 0;
1501

    
1502
        p1 += d1;
1503
        p2 -= d1;
1504
        if(p1&256) p1= ~(p1>>31);
1505
        if(p2&256) p2= ~(p2>>31);
1506

    
1507
        src[y*stride-1] = p1;
1508
        src[y*stride+0] = p2;
1509

    
1510
        ad1= FFABS(d1)>>1;
1511

    
1512
        d2= av_clip((p0-p3)/4, -ad1, ad1);
1513

    
1514
        src[y*stride-2] = p0 - d2;
1515
        src[y*stride+1] = p3 + d2;
1516
    }
1517
    }
1518
}
1519

    
1520
static void h261_loop_filter_c(uint8_t *src, int stride){
1521
    int x,y,xy,yz;
1522
    int temp[64];
1523

    
1524
    for(x=0; x<8; x++){
1525
        temp[x      ] = 4*src[x           ];
1526
        temp[x + 7*8] = 4*src[x + 7*stride];
1527
    }
1528
    for(y=1; y<7; y++){
1529
        for(x=0; x<8; x++){
1530
            xy = y * stride + x;
1531
            yz = y * 8 + x;
1532
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
1533
        }
1534
    }
1535

    
1536
    for(y=0; y<8; y++){
1537
        src[  y*stride] = (temp[  y*8] + 2)>>2;
1538
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
1539
        for(x=1; x<7; x++){
1540
            xy = y * stride + x;
1541
            yz = y * 8 + x;
1542
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
1543
        }
1544
    }
1545
}
1546

    
1547
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1548
{
1549
    int s, i;
1550

    
1551
    s = 0;
1552
    for(i=0;i<h;i++) {
1553
        s += abs(pix1[0] - pix2[0]);
1554
        s += abs(pix1[1] - pix2[1]);
1555
        s += abs(pix1[2] - pix2[2]);
1556
        s += abs(pix1[3] - pix2[3]);
1557
        s += abs(pix1[4] - pix2[4]);
1558
        s += abs(pix1[5] - pix2[5]);
1559
        s += abs(pix1[6] - pix2[6]);
1560
        s += abs(pix1[7] - pix2[7]);
1561
        s += abs(pix1[8] - pix2[8]);
1562
        s += abs(pix1[9] - pix2[9]);
1563
        s += abs(pix1[10] - pix2[10]);
1564
        s += abs(pix1[11] - pix2[11]);
1565
        s += abs(pix1[12] - pix2[12]);
1566
        s += abs(pix1[13] - pix2[13]);
1567
        s += abs(pix1[14] - pix2[14]);
1568
        s += abs(pix1[15] - pix2[15]);
1569
        pix1 += line_size;
1570
        pix2 += line_size;
1571
    }
1572
    return s;
1573
}
1574

    
1575
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1576
{
1577
    int s, i;
1578

    
1579
    s = 0;
1580
    for(i=0;i<h;i++) {
1581
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1582
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1583
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1584
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1585
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1586
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1587
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1588
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1589
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1590
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1591
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1592
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1593
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1594
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1595
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1596
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1597
        pix1 += line_size;
1598
        pix2 += line_size;
1599
    }
1600
    return s;
1601
}
1602

    
1603
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1604
{
1605
    int s, i;
1606
    uint8_t *pix3 = pix2 + line_size;
1607

    
1608
    s = 0;
1609
    for(i=0;i<h;i++) {
1610
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1611
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1612
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1613
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1614
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1615
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1616
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1617
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1618
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1619
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1620
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1621
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1622
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1623
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1624
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1625
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1626
        pix1 += line_size;
1627
        pix2 += line_size;
1628
        pix3 += line_size;
1629
    }
1630
    return s;
1631
}
1632

    
1633
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1634
{
1635
    int s, i;
1636
    uint8_t *pix3 = pix2 + line_size;
1637

    
1638
    s = 0;
1639
    for(i=0;i<h;i++) {
1640
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1641
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1642
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1643
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1644
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1645
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1646
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1647
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1648
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1649
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1650
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1651
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1652
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1653
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1654
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1655
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1656
        pix1 += line_size;
1657
        pix2 += line_size;
1658
        pix3 += line_size;
1659
    }
1660
    return s;
1661
}
1662

    
1663
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1664
{
1665
    int s, i;
1666

    
1667
    s = 0;
1668
    for(i=0;i<h;i++) {
1669
        s += abs(pix1[0] - pix2[0]);
1670
        s += abs(pix1[1] - pix2[1]);
1671
        s += abs(pix1[2] - pix2[2]);
1672
        s += abs(pix1[3] - pix2[3]);
1673
        s += abs(pix1[4] - pix2[4]);
1674
        s += abs(pix1[5] - pix2[5]);
1675
        s += abs(pix1[6] - pix2[6]);
1676
        s += abs(pix1[7] - pix2[7]);
1677
        pix1 += line_size;
1678
        pix2 += line_size;
1679
    }
1680
    return s;
1681
}
1682

    
1683
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1684
{
1685
    int s, i;
1686

    
1687
    s = 0;
1688
    for(i=0;i<h;i++) {
1689
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1690
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1691
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1692
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1693
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1694
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1695
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1696
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1697
        pix1 += line_size;
1698
        pix2 += line_size;
1699
    }
1700
    return s;
1701
}
1702

    
1703
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1704
{
1705
    int s, i;
1706
    uint8_t *pix3 = pix2 + line_size;
1707

    
1708
    s = 0;
1709
    for(i=0;i<h;i++) {
1710
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1711
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1712
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1713
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1714
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1715
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1716
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1717
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1718
        pix1 += line_size;
1719
        pix2 += line_size;
1720
        pix3 += line_size;
1721
    }
1722
    return s;
1723
}
1724

    
1725
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1726
{
1727
    int s, i;
1728
    uint8_t *pix3 = pix2 + line_size;
1729

    
1730
    s = 0;
1731
    for(i=0;i<h;i++) {
1732
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1733
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1734
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1735
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1736
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1737
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1738
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1739
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1740
        pix1 += line_size;
1741
        pix2 += line_size;
1742
        pix3 += line_size;
1743
    }
1744
    return s;
1745
}
1746

    
1747
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1748
    MpegEncContext *c = v;
1749
    int score1=0;
1750
    int score2=0;
1751
    int x,y;
1752

    
1753
    for(y=0; y<h; y++){
1754
        for(x=0; x<16; x++){
1755
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1756
        }
1757
        if(y+1<h){
1758
            for(x=0; x<15; x++){
1759
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
1760
                             - s1[x+1] + s1[x+1+stride])
1761
                        -FFABS(  s2[x  ] - s2[x  +stride]
1762
                             - s2[x+1] + s2[x+1+stride]);
1763
            }
1764
        }
1765
        s1+= stride;
1766
        s2+= stride;
1767
    }
1768

    
1769
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1770
    else  return score1 + FFABS(score2)*8;
1771
}
1772

    
1773
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1774
    MpegEncContext *c = v;
1775
    int score1=0;
1776
    int score2=0;
1777
    int x,y;
1778

    
1779
    for(y=0; y<h; y++){
1780
        for(x=0; x<8; x++){
1781
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1782
        }
1783
        if(y+1<h){
1784
            for(x=0; x<7; x++){
1785
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
1786
                             - s1[x+1] + s1[x+1+stride])
1787
                        -FFABS(  s2[x  ] - s2[x  +stride]
1788
                             - s2[x+1] + s2[x+1+stride]);
1789
            }
1790
        }
1791
        s1+= stride;
1792
        s2+= stride;
1793
    }
1794

    
1795
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1796
    else  return score1 + FFABS(score2)*8;
1797
}
1798

    
1799
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1800
    int i;
1801
    unsigned int sum=0;
1802

    
1803
    for(i=0; i<8*8; i++){
1804
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1805
        int w= weight[i];
1806
        b>>= RECON_SHIFT;
1807
        assert(-512<b && b<512);
1808

    
1809
        sum += (w*b)*(w*b)>>4;
1810
    }
1811
    return sum>>2;
1812
}
1813

    
1814
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1815
    int i;
1816

    
1817
    for(i=0; i<8*8; i++){
1818
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1819
    }
1820
}
1821

    
1822
/**
1823
 * permutes an 8x8 block.
1824
 * @param block the block which will be permuted according to the given permutation vector
1825
 * @param permutation the permutation vector
1826
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1827
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1828
 *                  (inverse) permutated to scantable order!
1829
 */
1830
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1831
{
1832
    int i;
1833
    DCTELEM temp[64];
1834

    
1835
    if(last<=0) return;
1836
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
1837

    
1838
    for(i=0; i<=last; i++){
1839
        const int j= scantable[i];
1840
        temp[j]= block[j];
1841
        block[j]=0;
1842
    }
1843

    
1844
    for(i=0; i<=last; i++){
1845
        const int j= scantable[i];
1846
        const int perm_j= permutation[j];
1847
        block[perm_j]= temp[j];
1848
    }
1849
}
1850

    
1851
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
1852
    return 0;
1853
}
1854

    
1855
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1856
    int i;
1857

    
1858
    memset(cmp, 0, sizeof(void*)*6);
1859

    
1860
    for(i=0; i<6; i++){
1861
        switch(type&0xFF){
1862
        case FF_CMP_SAD:
1863
            cmp[i]= c->sad[i];
1864
            break;
1865
        case FF_CMP_SATD:
1866
            cmp[i]= c->hadamard8_diff[i];
1867
            break;
1868
        case FF_CMP_SSE:
1869
            cmp[i]= c->sse[i];
1870
            break;
1871
        case FF_CMP_DCT:
1872
            cmp[i]= c->dct_sad[i];
1873
            break;
1874
        case FF_CMP_DCT264:
1875
            cmp[i]= c->dct264_sad[i];
1876
            break;
1877
        case FF_CMP_DCTMAX:
1878
            cmp[i]= c->dct_max[i];
1879
            break;
1880
        case FF_CMP_PSNR:
1881
            cmp[i]= c->quant_psnr[i];
1882
            break;
1883
        case FF_CMP_BIT:
1884
            cmp[i]= c->bit[i];
1885
            break;
1886
        case FF_CMP_RD:
1887
            cmp[i]= c->rd[i];
1888
            break;
1889
        case FF_CMP_VSAD:
1890
            cmp[i]= c->vsad[i];
1891
            break;
1892
        case FF_CMP_VSSE:
1893
            cmp[i]= c->vsse[i];
1894
            break;
1895
        case FF_CMP_ZERO:
1896
            cmp[i]= zero_cmp;
1897
            break;
1898
        case FF_CMP_NSSE:
1899
            cmp[i]= c->nsse[i];
1900
            break;
1901
#if CONFIG_DWT
1902
        case FF_CMP_W53:
1903
            cmp[i]= c->w53[i];
1904
            break;
1905
        case FF_CMP_W97:
1906
            cmp[i]= c->w97[i];
1907
            break;
1908
#endif
1909
        default:
1910
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1911
        }
1912
    }
1913
}
1914

    
1915
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1916
    long i;
1917
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1918
        long a = *(long*)(src+i);
1919
        long b = *(long*)(dst+i);
1920
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1921
    }
1922
    for(; i<w; i++)
1923
        dst[i+0] += src[i+0];
1924
}
1925

    
1926
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1927
    long i;
1928
#if !HAVE_FAST_UNALIGNED
1929
    if((long)src2 & (sizeof(long)-1)){
1930
        for(i=0; i+7<w; i+=8){
1931
            dst[i+0] = src1[i+0]-src2[i+0];
1932
            dst[i+1] = src1[i+1]-src2[i+1];
1933
            dst[i+2] = src1[i+2]-src2[i+2];
1934
            dst[i+3] = src1[i+3]-src2[i+3];
1935
            dst[i+4] = src1[i+4]-src2[i+4];
1936
            dst[i+5] = src1[i+5]-src2[i+5];
1937
            dst[i+6] = src1[i+6]-src2[i+6];
1938
            dst[i+7] = src1[i+7]-src2[i+7];
1939
        }
1940
    }else
1941
#endif
1942
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1943
        long a = *(long*)(src1+i);
1944
        long b = *(long*)(src2+i);
1945
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1946
    }
1947
    for(; i<w; i++)
1948
        dst[i+0] = src1[i+0]-src2[i+0];
1949
}
1950

    
1951
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1952
    int i;
1953
    uint8_t l, lt;
1954

    
1955
    l= *left;
1956
    lt= *left_top;
1957

    
1958
    for(i=0; i<w; i++){
1959
        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
1960
        lt= src1[i];
1961
        dst[i]= l;
1962
    }
1963

    
1964
    *left= l;
1965
    *left_top= lt;
1966
}
1967

    
1968
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1969
    int i;
1970
    uint8_t l, lt;
1971

    
1972
    l= *left;
1973
    lt= *left_top;
1974

    
1975
    for(i=0; i<w; i++){
1976
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
1977
        lt= src1[i];
1978
        l= src2[i];
1979
        dst[i]= l - pred;
1980
    }
1981

    
1982
    *left= l;
1983
    *left_top= lt;
1984
}
1985

    
1986
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
1987
    int i;
1988

    
1989
    for(i=0; i<w-1; i++){
1990
        acc+= src[i];
1991
        dst[i]= acc;
1992
        i++;
1993
        acc+= src[i];
1994
        dst[i]= acc;
1995
    }
1996

    
1997
    for(; i<w; i++){
1998
        acc+= src[i];
1999
        dst[i]= acc;
2000
    }
2001

    
2002
    return acc;
2003
}
2004

    
2005
#if HAVE_BIGENDIAN
2006
#define B 3
2007
#define G 2
2008
#define R 1
2009
#define A 0
2010
#else
2011
#define B 0
2012
#define G 1
2013
#define R 2
2014
#define A 3
2015
#endif
2016
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
2017
    int i;
2018
    int r,g,b,a;
2019
    r= *red;
2020
    g= *green;
2021
    b= *blue;
2022
    a= *alpha;
2023

    
2024
    for(i=0; i<w; i++){
2025
        b+= src[4*i+B];
2026
        g+= src[4*i+G];
2027
        r+= src[4*i+R];
2028
        a+= src[4*i+A];
2029

    
2030
        dst[4*i+B]= b;
2031
        dst[4*i+G]= g;
2032
        dst[4*i+R]= r;
2033
        dst[4*i+A]= a;
2034
    }
2035

    
2036
    *red= r;
2037
    *green= g;
2038
    *blue= b;
2039
    *alpha= a;
2040
}
2041
#undef B
2042
#undef G
2043
#undef R
2044
#undef A
2045

    
2046
#define BUTTERFLY2(o1,o2,i1,i2) \
2047
o1= (i1)+(i2);\
2048
o2= (i1)-(i2);
2049

    
2050
#define BUTTERFLY1(x,y) \
2051
{\
2052
    int a,b;\
2053
    a= x;\
2054
    b= y;\
2055
    x= a+b;\
2056
    y= a-b;\
2057
}
2058

    
2059
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
2060

    
2061
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2062
    int i;
2063
    int temp[64];
2064
    int sum=0;
2065

    
2066
    assert(h==8);
2067

    
2068
    for(i=0; i<8; i++){
2069
        //FIXME try pointer walks
2070
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2071
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2072
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2073
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
2074

    
2075
        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2076
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2077
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2078
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2079

    
2080
        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2081
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2082
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2083
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2084
    }
2085

    
2086
    for(i=0; i<8; i++){
2087
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2088
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2089
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2090
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2091

    
2092
        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2093
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2094
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2095
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2096

    
2097
        sum +=
2098
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2099
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2100
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2101
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2102
    }
2103
#if 0
2104
static int maxi=0;
2105
if(sum>maxi){
2106
    maxi=sum;
2107
    printf("MAX:%d\n", maxi);
2108
}
2109
#endif
2110
    return sum;
2111
}
2112

    
2113
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2114
    int i;
2115
    int temp[64];
2116
    int sum=0;
2117

    
2118
    assert(h==8);
2119

    
2120
    for(i=0; i<8; i++){
2121
        //FIXME try pointer walks
2122
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2123
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2124
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2125
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
2126

    
2127
        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2128
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2129
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2130
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2131

    
2132
        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2133
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2134
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2135
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2136
    }
2137

    
2138
    for(i=0; i<8; i++){
2139
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2140
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2141
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2142
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2143

    
2144
        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2145
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2146
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2147
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2148

    
2149
        sum +=
2150
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2151
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2152
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2153
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2154
    }
2155

    
2156
    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
2157

    
2158
    return sum;
2159
}
2160

    
2161
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2162
    MpegEncContext * const s= (MpegEncContext *)c;
2163
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2164

    
2165
    assert(h==8);
2166

    
2167
    s->dsp.diff_pixels(temp, src1, src2, stride);
2168
    s->dsp.fdct(temp);
2169
    return s->dsp.sum_abs_dctelem(temp);
2170
}
2171

    
2172
#if CONFIG_GPL
2173
#define DCT8_1D {\
2174
    const int s07 = SRC(0) + SRC(7);\
2175
    const int s16 = SRC(1) + SRC(6);\
2176
    const int s25 = SRC(2) + SRC(5);\
2177
    const int s34 = SRC(3) + SRC(4);\
2178
    const int a0 = s07 + s34;\
2179
    const int a1 = s16 + s25;\
2180
    const int a2 = s07 - s34;\
2181
    const int a3 = s16 - s25;\
2182
    const int d07 = SRC(0) - SRC(7);\
2183
    const int d16 = SRC(1) - SRC(6);\
2184
    const int d25 = SRC(2) - SRC(5);\
2185
    const int d34 = SRC(3) - SRC(4);\
2186
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
2187
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
2188
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
2189
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
2190
    DST(0,  a0 + a1     ) ;\
2191
    DST(1,  a4 + (a7>>2)) ;\
2192
    DST(2,  a2 + (a3>>1)) ;\
2193
    DST(3,  a5 + (a6>>2)) ;\
2194
    DST(4,  a0 - a1     ) ;\
2195
    DST(5,  a6 - (a5>>2)) ;\
2196
    DST(6, (a2>>1) - a3 ) ;\
2197
    DST(7, (a4>>2) - a7 ) ;\
2198
}
2199

    
2200
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2201
    MpegEncContext * const s= (MpegEncContext *)c;
2202
    DCTELEM dct[8][8];
2203
    int i;
2204
    int sum=0;
2205

    
2206
    s->dsp.diff_pixels(dct[0], src1, src2, stride);
2207

    
2208
#define SRC(x) dct[i][x]
2209
#define DST(x,v) dct[i][x]= v
2210
    for( i = 0; i < 8; i++ )
2211
        DCT8_1D
2212
#undef SRC
2213
#undef DST
2214

    
2215
#define SRC(x) dct[x][i]
2216
#define DST(x,v) sum += FFABS(v)
2217
    for( i = 0; i < 8; i++ )
2218
        DCT8_1D
2219
#undef SRC
2220
#undef DST
2221
    return sum;
2222
}
2223
#endif
2224

    
2225
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2226
    MpegEncContext * const s= (MpegEncContext *)c;
2227
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2228
    int sum=0, i;
2229

    
2230
    assert(h==8);
2231

    
2232
    s->dsp.diff_pixels(temp, src1, src2, stride);
2233
    s->dsp.fdct(temp);
2234

    
2235
    for(i=0; i<64; i++)
2236
        sum= FFMAX(sum, FFABS(temp[i]));
2237

    
2238
    return sum;
2239
}
2240

    
2241
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2242
    MpegEncContext * const s= (MpegEncContext *)c;
2243
    LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2244
    DCTELEM * const bak = temp+64;
2245
    int sum=0, i;
2246

    
2247
    assert(h==8);
2248
    s->mb_intra=0;
2249

    
2250
    s->dsp.diff_pixels(temp, src1, src2, stride);
2251

    
2252
    memcpy(bak, temp, 64*sizeof(DCTELEM));
2253

    
2254
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2255
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
2256
    ff_simple_idct(temp); //FIXME
2257

    
2258
    for(i=0; i<64; i++)
2259
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2260

    
2261
    return sum;
2262
}
2263

    
2264
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2265
    MpegEncContext * const s= (MpegEncContext *)c;
2266
    const uint8_t *scantable= s->intra_scantable.permutated;
2267
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2268
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2269
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2270
    int i, last, run, bits, level, distortion, start_i;
2271
    const int esc_length= s->ac_esc_length;
2272
    uint8_t * length;
2273
    uint8_t * last_length;
2274

    
2275
    assert(h==8);
2276

    
2277
    copy_block8(lsrc1, src1, 8, stride, 8);
2278
    copy_block8(lsrc2, src2, 8, stride, 8);
2279

    
2280
    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2281

    
2282
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2283

    
2284
    bits=0;
2285

    
2286
    if (s->mb_intra) {
2287
        start_i = 1;
2288
        length     = s->intra_ac_vlc_length;
2289
        last_length= s->intra_ac_vlc_last_length;
2290
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2291
    } else {
2292
        start_i = 0;
2293
        length     = s->inter_ac_vlc_length;
2294
        last_length= s->inter_ac_vlc_last_length;
2295
    }
2296

    
2297
    if(last>=start_i){
2298
        run=0;
2299
        for(i=start_i; i<last; i++){
2300
            int j= scantable[i];
2301
            level= temp[j];
2302

    
2303
            if(level){
2304
                level+=64;
2305
                if((level&(~127)) == 0){
2306
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
2307
                }else
2308
                    bits+= esc_length;
2309
                run=0;
2310
            }else
2311
                run++;
2312
        }
2313
        i= scantable[last];
2314

    
2315
        level= temp[i] + 64;
2316

    
2317
        assert(level - 64);
2318

    
2319
        if((level&(~127)) == 0){
2320
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2321
        }else
2322
            bits+= esc_length;
2323

    
2324
    }
2325

    
2326
    if(last>=0){
2327
        if(s->mb_intra)
2328
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
2329
        else
2330
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
2331
    }
2332

    
2333
    s->dsp.idct_add(lsrc2, 8, temp);
2334

    
2335
    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2336

    
2337
    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2338
}
2339

    
2340
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2341
    MpegEncContext * const s= (MpegEncContext *)c;
2342
    const uint8_t *scantable= s->intra_scantable.permutated;
2343
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2344
    int i, last, run, bits, level, start_i;
2345
    const int esc_length= s->ac_esc_length;
2346
    uint8_t * length;
2347
    uint8_t * last_length;
2348

    
2349
    assert(h==8);
2350

    
2351
    s->dsp.diff_pixels(temp, src1, src2, stride);
2352

    
2353
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2354

    
2355
    bits=0;
2356

    
2357
    if (s->mb_intra) {
2358
        start_i = 1;
2359
        length     = s->intra_ac_vlc_length;
2360
        last_length= s->intra_ac_vlc_last_length;
2361
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2362
    } else {
2363
        start_i = 0;
2364
        length     = s->inter_ac_vlc_length;
2365
        last_length= s->inter_ac_vlc_last_length;
2366
    }
2367

    
2368
    if(last>=start_i){
2369
        run=0;
2370
        for(i=start_i; i<last; i++){
2371
            int j= scantable[i];
2372
            level= temp[j];
2373

    
2374
            if(level){
2375
                level+=64;
2376
                if((level&(~127)) == 0){
2377
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
2378
                }else
2379
                    bits+= esc_length;
2380
                run=0;
2381
            }else
2382
                run++;
2383
        }
2384
        i= scantable[last];
2385

    
2386
        level= temp[i] + 64;
2387

    
2388
        assert(level - 64);
2389

    
2390
        if((level&(~127)) == 0){
2391
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2392
        }else
2393
            bits+= esc_length;
2394
    }
2395

    
2396
    return bits;
2397
}
2398

    
2399
#define VSAD_INTRA(size) \
2400
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2401
    int score=0;                                                                                            \
2402
    int x,y;                                                                                                \
2403
                                                                                                            \
2404
    for(y=1; y<h; y++){                                                                                     \
2405
        for(x=0; x<size; x+=4){                                                                             \
2406
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
2407
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
2408
        }                                                                                                   \
2409
        s+= stride;                                                                                         \
2410
    }                                                                                                       \
2411
                                                                                                            \
2412
    return score;                                                                                           \
2413
}
2414
VSAD_INTRA(8)
2415
VSAD_INTRA(16)
2416

    
2417
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2418
    int score=0;
2419
    int x,y;
2420

    
2421
    for(y=1; y<h; y++){
2422
        for(x=0; x<16; x++){
2423
            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
2424
        }
2425
        s1+= stride;
2426
        s2+= stride;
2427
    }
2428

    
2429
    return score;
2430
}
2431

    
2432
#define SQ(a) ((a)*(a))
2433
#define VSSE_INTRA(size) \
2434
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2435
    int score=0;                                                                                            \
2436
    int x,y;                                                                                                \
2437
                                                                                                            \
2438
    for(y=1; y<h; y++){                                                                                     \
2439
        for(x=0; x<size; x+=4){                                                                               \
2440
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
2441
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
2442
        }                                                                                                   \
2443
        s+= stride;                                                                                         \
2444
    }                                                                                                       \
2445
                                                                                                            \
2446
    return score;                                                                                           \
2447
}
2448
VSSE_INTRA(8)
2449
VSSE_INTRA(16)
2450

    
2451
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2452
    int score=0;
2453
    int x,y;
2454

    
2455
    for(y=1; y<h; y++){
2456
        for(x=0; x<16; x++){
2457
            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
2458
        }
2459
        s1+= stride;
2460
        s2+= stride;
2461
    }
2462

    
2463
    return score;
2464
}
2465

    
2466
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2467
                               int size){
2468
    int score=0;
2469
    int i;
2470
    for(i=0; i<size; i++)
2471
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
2472
    return score;
2473
}
2474

    
2475
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2476
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2477
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2478
#if CONFIG_GPL
2479
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2480
#endif
2481
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2482
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2483
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2484
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2485

    
2486
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
2487
    int i;
2488
    for(i=0; i<len; i++)
2489
        dst[i] = src0[i] * src1[i];
2490
}
2491

    
2492
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
2493
    int i;
2494
    src1 += len-1;
2495
    for(i=0; i<len; i++)
2496
        dst[i] = src0[i] * src1[-i];
2497
}
2498

    
2499
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
2500
    int i;
2501
    for(i=0; i<len; i++)
2502
        dst[i] = src0[i] * src1[i] + src2[i];
2503
}
2504

    
2505
static void vector_fmul_window_c(float *dst, const float *src0,
2506
                                 const float *src1, const float *win, int len)
2507
{
2508
    int i,j;
2509
    dst += len;
2510
    win += len;
2511
    src0+= len;
2512
    for(i=-len, j=len-1; i<0; i++, j--) {
2513
        float s0 = src0[i];
2514
        float s1 = src1[j];
2515
        float wi = win[i];
2516
        float wj = win[j];
2517
        dst[i] = s0*wj - s1*wi;
2518
        dst[j] = s0*wi + s1*wj;
2519
    }
2520
}
2521

    
2522
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
2523
                                 int len)
2524
{
2525
    int i;
2526
    for (i = 0; i < len; i++)
2527
        dst[i] = src[i] * mul;
2528
}
2529

    
2530
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
2531
                                      const float **sv, float mul, int len)
2532
{
2533
    int i;
2534
    for (i = 0; i < len; i += 2, sv++) {
2535
        dst[i  ] = src[i  ] * sv[0][0] * mul;
2536
        dst[i+1] = src[i+1] * sv[0][1] * mul;
2537
    }
2538
}
2539

    
2540
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
2541
                                      const float **sv, float mul, int len)
2542
{
2543
    int i;
2544
    for (i = 0; i < len; i += 4, sv++) {
2545
        dst[i  ] = src[i  ] * sv[0][0] * mul;
2546
        dst[i+1] = src[i+1] * sv[0][1] * mul;
2547
        dst[i+2] = src[i+2] * sv[0][2] * mul;
2548
        dst[i+3] = src[i+3] * sv[0][3] * mul;
2549
    }
2550
}
2551

    
2552
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
2553
                               int len)
2554
{
2555
    int i;
2556
    for (i = 0; i < len; i += 2, sv++) {
2557
        dst[i  ] = sv[0][0] * mul;
2558
        dst[i+1] = sv[0][1] * mul;
2559
    }
2560
}
2561

    
2562
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
2563
                               int len)
2564
{
2565
    int i;
2566
    for (i = 0; i < len; i += 4, sv++) {
2567
        dst[i  ] = sv[0][0] * mul;
2568
        dst[i+1] = sv[0][1] * mul;
2569
        dst[i+2] = sv[0][2] * mul;
2570
        dst[i+3] = sv[0][3] * mul;
2571
    }
2572
}
2573

    
2574
static void butterflies_float_c(float *restrict v1, float *restrict v2,
2575
                                int len)
2576
{
2577
    int i;
2578
    for (i = 0; i < len; i++) {
2579
        float t = v1[i] - v2[i];
2580
        v1[i] += v2[i];
2581
        v2[i] = t;
2582
    }
2583
}
2584

    
2585
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
2586
{
2587
    float p = 0.0;
2588
    int i;
2589

    
2590
    for (i = 0; i < len; i++)
2591
        p += v1[i] * v2[i];
2592

    
2593
    return p;
2594
}
2595

    
2596
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
2597
                   uint32_t maxi, uint32_t maxisign)
2598
{
2599

    
2600
    if(a > mini) return mini;
2601
    else if((a^(1U<<31)) > maxisign) return maxi;
2602
    else return a;
2603
}
2604

    
2605
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2606
    int i;
2607
    uint32_t mini = *(uint32_t*)min;
2608
    uint32_t maxi = *(uint32_t*)max;
2609
    uint32_t maxisign = maxi ^ (1U<<31);
2610
    uint32_t *dsti = (uint32_t*)dst;
2611
    const uint32_t *srci = (const uint32_t*)src;
2612
    for(i=0; i<len; i+=8) {
2613
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2614
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2615
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2616
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2617
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2618
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2619
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2620
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
2621
    }
2622
}
2623
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
2624
    int i;
2625
    if(min < 0 && max > 0) {
2626
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2627
    } else {
2628
        for(i=0; i < len; i+=8) {
2629
            dst[i    ] = av_clipf(src[i    ], min, max);
2630
            dst[i + 1] = av_clipf(src[i + 1], min, max);
2631
            dst[i + 2] = av_clipf(src[i + 2], min, max);
2632
            dst[i + 3] = av_clipf(src[i + 3], min, max);
2633
            dst[i + 4] = av_clipf(src[i + 4], min, max);
2634
            dst[i + 5] = av_clipf(src[i + 5], min, max);
2635
            dst[i + 6] = av_clipf(src[i + 6], min, max);
2636
            dst[i + 7] = av_clipf(src[i + 7], min, max);
2637
        }
2638
    }
2639
}
2640

    
2641
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
2642
{
2643
    int res = 0;
2644

    
2645
    while (order--)
2646
        res += (*v1++ * *v2++) >> shift;
2647

    
2648
    return res;
2649
}
2650

    
2651
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
2652
{
2653
    int res = 0;
2654
    while (order--) {
2655
        res   += *v1 * *v2++;
2656
        *v1++ += mul * *v3++;
2657
    }
2658
    return res;
2659
}
2660

    
2661
static void apply_window_int16_c(int16_t *output, const int16_t *input,
2662
                                 const int16_t *window, unsigned int len)
2663
{
2664
    int i;
2665
    int len2 = len >> 1;
2666

    
2667
    for (i = 0; i < len2; i++) {
2668
        int16_t w       = window[i];
2669
        output[i]       = (MUL16(input[i],       w) + (1 << 14)) >> 15;
2670
        output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
2671
    }
2672
}
2673

    
2674
#define W0 2048
2675
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
2676
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
2677
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
2678
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
2679
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
2680
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
2681
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
2682

    
2683
static void wmv2_idct_row(short * b)
2684
{
2685
    int s1,s2;
2686
    int a0,a1,a2,a3,a4,a5,a6,a7;
2687
    /*step 1*/
2688
    a1 = W1*b[1]+W7*b[7];
2689
    a7 = W7*b[1]-W1*b[7];
2690
    a5 = W5*b[5]+W3*b[3];
2691
    a3 = W3*b[5]-W5*b[3];
2692
    a2 = W2*b[2]+W6*b[6];
2693
    a6 = W6*b[2]-W2*b[6];
2694
    a0 = W0*b[0]+W0*b[4];
2695
    a4 = W0*b[0]-W0*b[4];
2696
    /*step 2*/
2697
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
2698
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
2699
    /*step 3*/
2700
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
2701
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
2702
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
2703
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
2704
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
2705
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
2706
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
2707
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
2708
}
2709
static void wmv2_idct_col(short * b)
2710
{
2711
    int s1,s2;
2712
    int a0,a1,a2,a3,a4,a5,a6,a7;
2713
    /*step 1, with extended precision*/
2714
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2715
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2716
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2717
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2718
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2719
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2720
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
2721
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
2722
    /*step 2*/
2723
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
2724
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
2725
    /*step 3*/
2726
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2727
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
2728
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
2729
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2730

    
2731
    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2732
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
2733
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
2734
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
2735
}
2736
void ff_wmv2_idct_c(short * block){
2737
    int i;
2738

    
2739
    for(i=0;i<64;i+=8){
2740
        wmv2_idct_row(block+i);
2741
    }
2742
    for(i=0;i<8;i++){
2743
        wmv2_idct_col(block+i);
2744
    }
2745
}
2746
/* XXX: those functions should be suppressed ASAP when all IDCTs are
2747
 converted */
2748
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2749
{
2750
    ff_wmv2_idct_c(block);
2751
    ff_put_pixels_clamped_c(block, dest, line_size);
2752
}
2753
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2754
{
2755
    ff_wmv2_idct_c(block);
2756
    ff_add_pixels_clamped_c(block, dest, line_size);
2757
}
2758
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2759
{
2760
    j_rev_dct (block);
2761
    ff_put_pixels_clamped_c(block, dest, line_size);
2762
}
2763
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2764
{
2765
    j_rev_dct (block);
2766
    ff_add_pixels_clamped_c(block, dest, line_size);
2767
}
2768

    
2769
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
2770
{
2771
    j_rev_dct4 (block);
2772
    put_pixels_clamped4_c(block, dest, line_size);
2773
}
2774
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
2775
{
2776
    j_rev_dct4 (block);
2777
    add_pixels_clamped4_c(block, dest, line_size);
2778
}
2779

    
2780
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
2781
{
2782
    j_rev_dct2 (block);
2783
    put_pixels_clamped2_c(block, dest, line_size);
2784
}
2785
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
2786
{
2787
    j_rev_dct2 (block);
2788
    add_pixels_clamped2_c(block, dest, line_size);
2789
}
2790

    
2791
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2792
{
2793
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2794

    
2795
    dest[0] = cm[(block[0] + 4)>>3];
2796
}
2797
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2798
{
2799
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2800

    
2801
    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
2802
}
2803

    
2804
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2805

    
2806
/* init static data */
2807
av_cold void dsputil_static_init(void)
2808
{
2809
    int i;
2810

    
2811
    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2812
    for(i=0;i<MAX_NEG_CROP;i++) {
2813
        ff_cropTbl[i] = 0;
2814
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2815
    }
2816

    
2817
    for(i=0;i<512;i++) {
2818
        ff_squareTbl[i] = (i - 256) * (i - 256);
2819
    }
2820

    
2821
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2822
}
2823

    
2824
int ff_check_alignment(void){
2825
    static int did_fail=0;
2826
    DECLARE_ALIGNED(16, int, aligned);
2827

    
2828
    if((intptr_t)&aligned & 15){
2829
        if(!did_fail){
2830
#if HAVE_MMX || HAVE_ALTIVEC
2831
            av_log(NULL, AV_LOG_ERROR,
2832
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2833
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
2834
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2835
                "Do not report crashes to FFmpeg developers.\n");
2836
#endif
2837
            did_fail=1;
2838
        }
2839
        return -1;
2840
    }
2841
    return 0;
2842
}
2843

    
2844
av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2845
{
2846
    int i;
2847

    
2848
    ff_check_alignment();
2849

    
2850
#if CONFIG_ENCODERS
2851
    if(avctx->dct_algo==FF_DCT_FASTINT) {
2852
        c->fdct = fdct_ifast;
2853
        c->fdct248 = fdct_ifast248;
2854
    }
2855
    else if(avctx->dct_algo==FF_DCT_FAAN) {
2856
        c->fdct = ff_faandct;
2857
        c->fdct248 = ff_faandct248;
2858
    }
2859
    else {
2860
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
2861
        c->fdct248 = ff_fdct248_islow;
2862
    }
2863
#endif //CONFIG_ENCODERS
2864

    
2865
    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            if (avctx->codec_id != CODEC_ID_H264) {
                c->idct_put= ff_h264_lowres_idct_put_8_c;
                c->idct_add= ff_h264_lowres_idct_add_8_c;
            } else {
                switch (avctx->bits_per_raw_sample) {
                    case 9:
                        c->idct_put= ff_h264_lowres_idct_put_9_c;
                        c->idct_add= ff_h264_lowres_idct_add_9_c;
                        break;
                    case 10:
                        c->idct_put= ff_h264_lowres_idct_put_10_c;
                        c->idct_add= ff_h264_lowres_idct_add_10_c;
                        break;
                    default:
                        c->idct_put= ff_h264_lowres_idct_put_8_c;
                        c->idct_add= ff_h264_lowres_idct_add_8_c;
                }
            }
        }
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
                avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct    = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct    = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
            c->idct_put= ff_faanidct_put;
            c->idct_add= ff_faanidct_add;
            c->idct    = ff_faanidct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
            c->idct_put= ff_ea_idct_put_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
            c->idct     = ff_bink_idct_c;
            c->idct_add = ff_bink_idct_add_c;
            c->idct_put = ff_bink_idct_put_c;
            c->idct_permutation_type = FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= ff_simple_idct_put;
            c->idct_add= ff_simple_idct_add;
            c->idct    = ff_simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }

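    /* Generic block helpers: copy/difference, clamped add/put, global
     * motion compensation, block fills and pixel sums. */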
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = ff_put_pixels_clamped_c;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
    c->put_pixels_nonclamped = put_pixels_nonclamped_c;
    c->add_pixels_clamped = ff_add_pixels_clamped_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->gmc1 = gmc1_c;
    c->gmc = ff_gmc_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    c->fill_block_tab[0] = fill_block16_c;
    c->fill_block_tab[1] = fill_block8_c;
    c->scale_block = scale_block_c;

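    /* SAD functions: pix_abs[size][filter], where size 0 = 16x16 and
     * size 1 = 8x8 blocks; filter 0 is the plain SAD and 1/2/3 compare
     * against the x, y and xy half-pel interpolated reference. */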
    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

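    /* Third-pel motion compensation (used by the SVQ3 decoder); the table
     * index is x + 4*y in third-pel steps, so only offsets 0..2 in each
     * direction are filled in. */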
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

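    /* Quarter-pel MC tables: one function per quarter-pel position,
     * indexed as x + 4*y to match the mcXY suffix of the function names. */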
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

#undef dspfunc

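    /* Codec-specific DSP hooks implemented in other files (MLP/TrueHD,
     * intra X8 for WMV2/VC-1, RealVideo 3/4). */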
#if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
    ff_mlp_init(c, avctx);
#endif
#if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
    ff_intrax8dsp_init(c,avctx);
#endif
#if CONFIG_RV30_DECODER
    ff_rv30dsp_init(c,avctx);
#endif
#if CONFIG_RV40_DECODER
    ff_rv40dsp_init(c,avctx);
    c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
    c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
    c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
    c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
#endif

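    /* Half-pel "mspel" motion compensation used by the WMV2 code. */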
    c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

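    /* Block comparison functions used for motion estimation and
     * macroblock decision: index 0 works on 16x16 blocks, index 1 on 8x8;
     * the higher slots hold intra and small-block variants. */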
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    c->hadamard8_diff[5]= hadamard8_intra8x8_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#if CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsad[5]= vsad_intra8_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->vsse[5]= vsse_intra8_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
#if CONFIG_DWT
    ff_dsputil_init_dwt(c);
#endif

    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

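    /* Byte-wise add/diff and left/median prediction helpers used by
     * lossless codecs such as HuffYUV, plus buffer byte swapping. */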
    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
    c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
    c->bswap_buf= bswap_buf;
    c->bswap16_buf = bswap16_buf;

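    /* In-loop filters for H.263, VP3/Theora (plus the VP3 DC-only IDCT
     * add) and H.261, and the encoder-side 8x8 basis helpers. */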
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    }

    if (CONFIG_VP3_DECODER) {
        c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
        c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
        c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
    }

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

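    /* Audio DSP: codec-specific hooks (Vorbis coupling, AC-3 downmix) and
     * generic float/int16 vector primitives. */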
#if CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
#endif
#if CONFIG_AC3_DECODER
    c->ac3_downmix = ff_ac3_downmix_c;
#endif
    c->vector_fmul = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add = vector_fmul_add_c;
    c->vector_fmul_window = vector_fmul_window_c;
    c->vector_clipf = vector_clipf_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
    c->apply_window_int16 = apply_window_int16_c;
    c->scalarproduct_float = scalarproduct_float_c;
    c->butterflies_float = butterflies_float_c;
    c->vector_fmul_scalar = vector_fmul_scalar_c;

    c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
    c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;

    c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
    c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;

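    /* Plane down-scalers: shrink[n] reduces each dimension by a factor of
     * 2^n; shrink[0] is a plain plane copy. */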
    c->shrink[0]= av_image_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

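    /* prefetch defaults to a no-op; architecture-specific code may install
     * a real prefetch. */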
    c->prefetch= just_return;

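    /* Clear the 2-tap qpel tables so that, after the architecture-specific
     * init below, any entries still unset can be filled with the H.264
     * qpel functions. */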
    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));

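    /* FUNC/FUNCC append the bit depth to the names of the templated
     * functions so that the 8-, 9- and 10-bit variants can be selected at
     * run time. */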
#undef FUNC
#undef FUNCC
#define FUNC(f, depth) f ## _ ## depth
#define FUNCC(f, depth) f ## _ ## depth ## _c

#define dspfunc1(PFX, IDX, NUM, depth)\
    c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM        , depth);\
    c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
    c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
    c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)

#define dspfunc2(PFX, IDX, NUM, depth)\
    c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
    c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
    c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
    c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
    c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
    c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
    c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
    c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
    c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
    c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
    c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
    c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
    c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
    c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
    c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
    c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)


#define BIT_DEPTH_FUNCS(depth)\
    c->draw_edges                    = FUNCC(draw_edges            , depth);\
    c->emulated_edge_mc              = FUNC (ff_emulated_edge_mc   , depth);\
    c->clear_block                   = FUNCC(clear_block           , depth);\
    c->clear_blocks                  = FUNCC(clear_blocks          , depth);\
    c->add_pixels8                   = FUNCC(add_pixels8           , depth);\
    c->add_pixels4                   = FUNCC(add_pixels4           , depth);\
    c->put_no_rnd_pixels_l2[0]       = FUNCC(put_no_rnd_pixels16_l2, depth);\
    c->put_no_rnd_pixels_l2[1]       = FUNCC(put_no_rnd_pixels8_l2 , depth);\
\
    c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8   , depth);\
    c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4   , depth);\
    c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2   , depth);\
    c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8   , depth);\
    c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4   , depth);\
    c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2   , depth);\
\
    dspfunc1(put       , 0, 16, depth);\
    dspfunc1(put       , 1,  8, depth);\
    dspfunc1(put       , 2,  4, depth);\
    dspfunc1(put       , 3,  2, depth);\
    dspfunc1(put_no_rnd, 0, 16, depth);\
    dspfunc1(put_no_rnd, 1,  8, depth);\
    dspfunc1(avg       , 0, 16, depth);\
    dspfunc1(avg       , 1,  8, depth);\
    dspfunc1(avg       , 2,  4, depth);\
    dspfunc1(avg       , 3,  2, depth);\
    dspfunc1(avg_no_rnd, 0, 16, depth);\
    dspfunc1(avg_no_rnd, 1,  8, depth);\
\
    dspfunc2(put_h264_qpel, 0, 16, depth);\
    dspfunc2(put_h264_qpel, 1,  8, depth);\
    dspfunc2(put_h264_qpel, 2,  4, depth);\
    dspfunc2(put_h264_qpel, 3,  2, depth);\
    dspfunc2(avg_h264_qpel, 0, 16, depth);\
    dspfunc2(avg_h264_qpel, 1,  8, depth);\
    dspfunc2(avg_h264_qpel, 2,  4, depth);

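    /* H.264 is currently the only codec wired up for the 9/10-bit
     * variants; all other codecs, and unsupported depths, get the 8-bit
     * functions. */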
    if (avctx->codec_id != CODEC_ID_H264 || avctx->bits_per_raw_sample == 8) {
        BIT_DEPTH_FUNCS(8)
    } else {
        switch (avctx->bits_per_raw_sample) {
            case 9:
                BIT_DEPTH_FUNCS(9)
                break;
            case 10:
                BIT_DEPTH_FUNCS(10)
                break;
            default:
                av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
                BIT_DEPTH_FUNCS(8)
                break;
        }
    }


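    /* Architecture-specific initializers run last so they can override the
     * C defaults set above; the ARCH_, HAVE_ and CONFIG_ macros are always
     * 0 or 1, so disabled branches are discarded at compile time. */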
    if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
    if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
    if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
    if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
    if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
    if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
    if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
    if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
    if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);

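    /* Fall back to the H.264 qpel functions for any 2-tap qpel entries the
     * architecture-specific code did not provide. */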
    for(i=0; i<64; i++){
        if(!c->put_2tap_qpel_pixels_tab[0][i])
            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
        if(!c->avg_2tap_qpel_pixels_tab[0][i])
            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
    }

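    /* The full-pel (mc00) entries of the RV30/RV40 tables reuse the H.264
     * put/avg functions. */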
    c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
    c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
    c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
    c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];

    c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
    c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
    c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
    c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];

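    /* Expand idct_permutation_type into the 64-entry idct_permutation[]
     * table used to reorder coefficients and scantables for the selected
     * IDCT. */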
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    case FF_PARTTRANS_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
        break;
    case FF_SSE2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}