Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 8dbe5856

History | View | Annotate | Download (111 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7
 *
8
 * This file is part of FFmpeg.
9
 *
10
 * FFmpeg is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU Lesser General Public
12
 * License as published by the Free Software Foundation; either
13
 * version 2.1 of the License, or (at your option) any later version.
14
 *
15
 * FFmpeg is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * Lesser General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU Lesser General Public
21
 * License along with FFmpeg; if not, write to the Free Software
22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
 */
24

    
25
/**
26
 * @file
27
 * DSP utils
28
 */
29

    
30
#include "libavutil/imgutils.h"
31
#include "avcodec.h"
32
#include "dsputil.h"
33
#include "simple_idct.h"
34
#include "faandct.h"
35
#include "faanidct.h"
36
#include "mathops.h"
37
#include "mpegvideo.h"
38
#include "config.h"
39
#include "ac3dec.h"
40
#include "vorbis.h"
41
#include "png.h"
42

    
43
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
44
uint32_t ff_squareTbl[512] = {0, };
45

    
46
#define BIT_DEPTH 9
47
#include "dsputil_internal.h"
48
#undef BIT_DEPTH
49

    
50
#define BIT_DEPTH 10
51
#include "dsputil_internal.h"
52
#undef BIT_DEPTH
53

    
54
#define BIT_DEPTH 8
55
#include "dsputil_internal.h"
56

    
57
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
58
#define pb_7f (~0UL/255 * 0x7f)
59
#define pb_80 (~0UL/255 * 0x80)
60

    
61
/* Classic JPEG/MPEG zigzag scan: entry i is the raster-order coefficient
   index read at scan position i (DC first, then increasing frequency). */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
71

    
72
/* Specific zigzag scan for 248 idct. NOTE that unlike the
73
   specification, we interleave the fields */
74
/* Zigzag scan for the 2-4-8 (interlaced) IDCT; unlike the specification
   the two fields are interleaved (see the comment above in the file). */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
84

    
85
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
86
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
87

    
88
/* Alternate horizontal scan order (used e.g. for horizontally-predicted
   blocks): favours low-row coefficients early in the scan. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
98

    
99
/* Alternate vertical scan order (used e.g. for vertically-predicted or
   interlaced blocks): favours low-column coefficients early in the scan. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
109

    
110
/* Input permutation for the simple_idct_mmx */
111
/* Input permutation for simple_idct_mmx: maps each raster coefficient
   position to the layout that IDCT expects (values are 0x00..0x3F indices). */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
121

    
122
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
123

    
124
/**
 * Initialize a ScanTable from a scan order and a CPU-specific permutation.
 * Fills st->permutated (scan order mapped through the permutation) and
 * st->raster_end (for each scan position, the largest permutated index
 * seen so far — i.e. the raster end of the coefficients scanned up to i).
 */
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i, max;

    st->scantable = src_scantable;

    /* apply the CPU-specific coefficient permutation to the scan order */
    for (i = 0; i < 64; i++) {
        int idx = src_scantable[i];
        st->permutated[i] = permutation[idx];
#if ARCH_PPC
        st->inverse[idx] = i;
#endif
    }

    /* running maximum of the permutated indices */
    max = -1;
    for (i = 0; i < 64; i++) {
        if (st->permutated[i] > max)
            max = st->permutated[i];
        st->raster_end[i] = max;
    }
}
147

    
148
/* Sum of all 256 pixels of a 16x16 block; line_size is the row stride. */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
169

    
170
static int pix_norm1_c(uint8_t * pix, int line_size)
171
{
172
    int s, i, j;
173
    uint32_t *sq = ff_squareTbl + 256;
174

    
175
    s = 0;
176
    for (i = 0; i < 16; i++) {
177
        for (j = 0; j < 16; j += 8) {
178
#if 0
179
            s += sq[pix[0]];
180
            s += sq[pix[1]];
181
            s += sq[pix[2]];
182
            s += sq[pix[3]];
183
            s += sq[pix[4]];
184
            s += sq[pix[5]];
185
            s += sq[pix[6]];
186
            s += sq[pix[7]];
187
#else
188
#if LONG_MAX > 2147483647
189
            register uint64_t x=*(uint64_t*)pix;
190
            s += sq[x&0xff];
191
            s += sq[(x>>8)&0xff];
192
            s += sq[(x>>16)&0xff];
193
            s += sq[(x>>24)&0xff];
194
            s += sq[(x>>32)&0xff];
195
            s += sq[(x>>40)&0xff];
196
            s += sq[(x>>48)&0xff];
197
            s += sq[(x>>56)&0xff];
198
#else
199
            register uint32_t x=*(uint32_t*)pix;
200
            s += sq[x&0xff];
201
            s += sq[(x>>8)&0xff];
202
            s += sq[(x>>16)&0xff];
203
            s += sq[(x>>24)&0xff];
204
            x=*(uint32_t*)(pix+4);
205
            s += sq[x&0xff];
206
            s += sq[(x>>8)&0xff];
207
            s += sq[(x>>16)&0xff];
208
            s += sq[(x>>24)&0xff];
209
#endif
210
#endif
211
            pix += 8;
212
        }
213
        pix += line_size - 16;
214
    }
215
    return s;
216
}
217

    
218
/* Byte-swap w 32-bit words from src into dst (may be the same buffer).
   The main loop is unrolled by 8; a scalar loop handles the tail. */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i = 0;

    for (; i + 8 <= w; i += 8) {
        dst[i]     = av_bswap32(src[i]);
        dst[i + 1] = av_bswap32(src[i + 1]);
        dst[i + 2] = av_bswap32(src[i + 2]);
        dst[i + 3] = av_bswap32(src[i + 3]);
        dst[i + 4] = av_bswap32(src[i + 4]);
        dst[i + 5] = av_bswap32(src[i + 5]);
        dst[i + 6] = av_bswap32(src[i + 6]);
        dst[i + 7] = av_bswap32(src[i + 7]);
    }
    for (; i < w; i++)
        dst[i] = av_bswap32(src[i]);
}
235

    
236
/* Byte-swap len 16-bit values from src into dst. */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = av_bswap16(src[i]);
}
241

    
242
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
243
{
244
    int s, i;
245
    uint32_t *sq = ff_squareTbl + 256;
246

    
247
    s = 0;
248
    for (i = 0; i < h; i++) {
249
        s += sq[pix1[0] - pix2[0]];
250
        s += sq[pix1[1] - pix2[1]];
251
        s += sq[pix1[2] - pix2[2]];
252
        s += sq[pix1[3] - pix2[3]];
253
        pix1 += line_size;
254
        pix2 += line_size;
255
    }
256
    return s;
257
}
258

    
259
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
260
{
261
    int s, i;
262
    uint32_t *sq = ff_squareTbl + 256;
263

    
264
    s = 0;
265
    for (i = 0; i < h; i++) {
266
        s += sq[pix1[0] - pix2[0]];
267
        s += sq[pix1[1] - pix2[1]];
268
        s += sq[pix1[2] - pix2[2]];
269
        s += sq[pix1[3] - pix2[3]];
270
        s += sq[pix1[4] - pix2[4]];
271
        s += sq[pix1[5] - pix2[5]];
272
        s += sq[pix1[6] - pix2[6]];
273
        s += sq[pix1[7] - pix2[7]];
274
        pix1 += line_size;
275
        pix2 += line_size;
276
    }
277
    return s;
278
}
279

    
280
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
281
{
282
    int s, i;
283
    uint32_t *sq = ff_squareTbl + 256;
284

    
285
    s = 0;
286
    for (i = 0; i < h; i++) {
287
        s += sq[pix1[ 0] - pix2[ 0]];
288
        s += sq[pix1[ 1] - pix2[ 1]];
289
        s += sq[pix1[ 2] - pix2[ 2]];
290
        s += sq[pix1[ 3] - pix2[ 3]];
291
        s += sq[pix1[ 4] - pix2[ 4]];
292
        s += sq[pix1[ 5] - pix2[ 5]];
293
        s += sq[pix1[ 6] - pix2[ 6]];
294
        s += sq[pix1[ 7] - pix2[ 7]];
295
        s += sq[pix1[ 8] - pix2[ 8]];
296
        s += sq[pix1[ 9] - pix2[ 9]];
297
        s += sq[pix1[10] - pix2[10]];
298
        s += sq[pix1[11] - pix2[11]];
299
        s += sq[pix1[12] - pix2[12]];
300
        s += sq[pix1[13] - pix2[13]];
301
        s += sq[pix1[14] - pix2[14]];
302
        s += sq[pix1[15] - pix2[15]];
303

    
304
        pix1 += line_size;
305
        pix2 += line_size;
306
    }
307
    return s;
308
}
309

    
310
/* Copy an 8x8 block of pixels into a (widened) DCT coefficient array. */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block  += 8;
    }
}
328

    
329
/* Store the 8x8 per-pixel difference s1 - s2 as DCT input. */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
348

    
349

    
350
void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
351
                             int line_size)
352
{
353
    int i;
354
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
355

    
356
    /* read the pixels */
357
    for(i=0;i<8;i++) {
358
        pixels[0] = cm[block[0]];
359
        pixels[1] = cm[block[1]];
360
        pixels[2] = cm[block[2]];
361
        pixels[3] = cm[block[3]];
362
        pixels[4] = cm[block[4]];
363
        pixels[5] = cm[block[5]];
364
        pixels[6] = cm[block[6]];
365
        pixels[7] = cm[block[7]];
366

    
367
        pixels += line_size;
368
        block += 8;
369
    }
370
}
371

    
372
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
373
                                 int line_size)
374
{
375
    int i;
376
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
377

    
378
    /* read the pixels */
379
    for(i=0;i<4;i++) {
380
        pixels[0] = cm[block[0]];
381
        pixels[1] = cm[block[1]];
382
        pixels[2] = cm[block[2]];
383
        pixels[3] = cm[block[3]];
384

    
385
        pixels += line_size;
386
        block += 8;
387
    }
388
}
389

    
390
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
391
                                 int line_size)
392
{
393
    int i;
394
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
395

    
396
    /* read the pixels */
397
    for(i=0;i<2;i++) {
398
        pixels[0] = cm[block[0]];
399
        pixels[1] = cm[block[1]];
400

    
401
        pixels += line_size;
402
        block += 8;
403
    }
404
}
405

    
406
/* Write an 8x8 block of signed DCT output to pixels: add the 128 bias and
   clamp to [0,255] (i.e. input range [-128,127] maps linearly). */
void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
                                    uint8_t *restrict pixels,
                                    int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            int val = block[col] + 128;   /* re-bias to unsigned range */
            if (val < 0)
                val = 0;
            else if (val > 255)
                val = 255;
            pixels[col] = (uint8_t)val;
        }
        block  += 8;
        pixels += line_size;
    }
}
426

    
427
/* Write an 8x8 block of DCT output to pixels without any clamping
   (values are truncated to 8 bits by the assignment). */
static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                    int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] = block[col];
        pixels += line_size;
        block  += 8;
    }
}
447

    
448
void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
449
                             int line_size)
450
{
451
    int i;
452
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
453

    
454
    /* read the pixels */
455
    for(i=0;i<8;i++) {
456
        pixels[0] = cm[pixels[0] + block[0]];
457
        pixels[1] = cm[pixels[1] + block[1]];
458
        pixels[2] = cm[pixels[2] + block[2]];
459
        pixels[3] = cm[pixels[3] + block[3]];
460
        pixels[4] = cm[pixels[4] + block[4]];
461
        pixels[5] = cm[pixels[5] + block[5]];
462
        pixels[6] = cm[pixels[6] + block[6]];
463
        pixels[7] = cm[pixels[7] + block[7]];
464
        pixels += line_size;
465
        block += 8;
466
    }
467
}
468

    
469
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
470
                          int line_size)
471
{
472
    int i;
473
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
474

    
475
    /* read the pixels */
476
    for(i=0;i<4;i++) {
477
        pixels[0] = cm[pixels[0] + block[0]];
478
        pixels[1] = cm[pixels[1] + block[1]];
479
        pixels[2] = cm[pixels[2] + block[2]];
480
        pixels[3] = cm[pixels[3] + block[3]];
481
        pixels += line_size;
482
        block += 8;
483
    }
484
}
485

    
486
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
487
                          int line_size)
488
{
489
    int i;
490
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
491

    
492
    /* read the pixels */
493
    for(i=0;i<2;i++) {
494
        pixels[0] = cm[pixels[0] + block[0]];
495
        pixels[1] = cm[pixels[1] + block[1]];
496
        pixels += line_size;
497
        block += 8;
498
    }
499
}
500

    
501
/* Sum of the absolute values of all 64 DCT coefficients. */
static int sum_abs_dctelem_c(DCTELEM *block)
{
    const DCTELEM *p   = block;
    const DCTELEM *end = block + 64;
    int total = 0;

    while (p < end) {
        int v = *p++;          /* copy out first: FFABS is a macro */
        total += FFABS(v);
    }
    return total;
}
508

    
509
/* Fill h rows of 16 bytes each with the given value; line_size is the
   distance between row starts. */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    while (h--) {
        memset(block, value, 16);
        block += line_size;
    }
}
518

    
519
/* Fill h rows of 8 bytes each with the given value; line_size is the
   distance between row starts. */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    while (h--) {
        memset(block, value, 8);
        block += line_size;
    }
}
528

    
529
/* Upscale an 8x8 block to 16x16: each source pixel becomes a 2x2 square.
   src[i]*0x0101 duplicates the byte into both halves of a uint16_t, which
   is stored into two consecutive output rows (dst must be 2-byte aligned,
   as the original "align 8" annotations imply). */
static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
{
    int x, y;

    for (y = 0; y < 8; y++) {
        uint16_t *row0 = (uint16_t *)(dst +  2 * y      * linesize);
        uint16_t *row1 = (uint16_t *)(dst + (2 * y + 1) * linesize);
        for (x = 0; x < 8; x++) {
            uint16_t pair = src[x] * 0x0101;
            row0[x] = pair;
            row1[x] = pair;
        }
        src += 8;
    }
}
544

    
545
#define avg2(a,b) ((a+b+1)>>1)
546
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
547

    
548
/* One-warp-point GMC: bilinear interpolation of an 8-wide strip at the
   fixed 1/16-pel fractional offset (x16, y16); weights sum to 256, so the
   result is renormalized with >>8 after adding the rounder. */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B =       x16  * (16 - y16);
    const int C = (16 - x16) *       y16;
    const int D =       x16  *       y16;
    int y, x;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = (A * src[x]          + B * src[x + 1] +
                      C * src[stride + x] + D * src[stride + x + 1] +
                      rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
570

    
571
/**
 * Global motion compensation (affine warp) of one 8-pixel-wide strip.
 * (ox,oy) is the source start position and (dxx,dyx)/(dxy,dyy) the
 * per-pixel/per-line coordinate increments, in 16.16 fixed point;
 * 'shift' gives the sub-pel precision, r the rounding constant, and
 * width/height the valid source dimensions.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;  /* interpolation scale: frac_x/frac_y lie in [0,s) */

    /* reduce by one so the unsigned compares below also guarantee that the
       +1 / +stride neighbour reads stay inside the source plane */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            /* split the 16.16 coordinate into integer and fractional parts */
            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* (unsigned) compare handles negative coordinates too */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: bilinear blend of the 2x2 neighbourhood */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* clipped vertically: interpolate horizontally only */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* clipped horizontally: interpolate vertically only */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* clipped in both directions: nearest border pixel */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        /* advance the strip origin to the next line */
        ox += dxy;
        oy += dyy;
    }
}
628

    
629
/* Full-pel third-pel MC: plain copy, dispatched on the block width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_8_c(dst, src, stride, height);
    else if (width == 4)
        put_pixels4_8_c(dst, src, stride, height);
    else if (width == 8)
        put_pixels8_8_c(dst, src, stride, height);
    else if (width == 16)
        put_pixels16_8_c(dst, src, stride, height);
}
637

    
638
/* Third-pel MC, horizontal 1/3 position: dst = round((2a+b)/3), where the
   division by 3 is done as multiplication by 683 with an 11-bit shift. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (2 * src[x] + src[x + 1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
648

    
649
/* Third-pel MC, horizontal 2/3 position: dst = round((a+2b)/3). */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (src[x] + 2 * src[x + 1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
659

    
660
/* Third-pel MC, vertical 1/3 position: dst = round((2a+c)/3), c below a. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (2 * src[x] + src[x + stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
670

    
671
/* Third-pel MC, (1/3,1/3): bilinear blend with weights 4/3/3/2 of the 2x2
   neighbourhood, normalized by 2731/2^15 ≈ 1/12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (4 * src[x] + 3 * src[x + 1] +
                              3 * src[x + stride] + 2 * src[x + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
681

    
682
/* Third-pel MC, (1/3,2/3): bilinear blend with weights 3/2/4/3 of the 2x2
   neighbourhood, normalized by 2731/2^15 ≈ 1/12. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (3 * src[x] + 2 * src[x + 1] +
                              4 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
692

    
693
/* Third-pel MC, vertical 2/3 position: dst = round((a+2c)/3), c below a. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (src[x] + 2 * src[x + stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
703

    
704
/* Third-pel MC, (2/3,1/3): bilinear blend with weights 3/4/2/3 of the 2x2
   neighbourhood, normalized by 2731/2^15 ≈ 1/12. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (3 * src[x] + 4 * src[x + 1] +
                              2 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
714

    
715
/* Third-pel MC, (2/3,2/3): bilinear blend with weights 2/3/3/4 of the 2x2
   neighbourhood, normalized by 2731/2^15 ≈ 1/12. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (2 * src[x] + 3 * src[x + 1] +
                              3 * src[x + stride] + 4 * src[x + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
725

    
726
/* Full-pel third-pel MC with averaging: dispatch on block width. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_8_c(dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_8_c(dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_8_c(dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_8_c(dst, src, stride, height);
}
734

    
735
/* As put_tpel mc10, but rounding-averaged with the existing dst pixels. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            int p = (683 * (2 * src[x] + src[x + 1] + 1)) >> 11;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
745

    
746
/* As put_tpel mc20, but rounding-averaged with the existing dst pixels. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            int p = (683 * (src[x] + 2 * src[x + 1] + 1)) >> 11;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
756

    
757
/* As put_tpel mc01, but rounding-averaged with the existing dst pixels. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            int p = (683 * (2 * src[x] + src[x + stride] + 1)) >> 11;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
767

    
768
/* As put_tpel mc11, but rounding-averaged with the existing dst pixels. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            int p = (2731 * (4 * src[x] + 3 * src[x + 1] +
                             3 * src[x + stride] + 2 * src[x + stride + 1] + 6)) >> 15;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
778

    
779
/* As put_tpel mc12, but rounding-averaged with the existing dst pixels. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            int p = (2731 * (3 * src[x] + 2 * src[x + 1] +
                             4 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
789

    
790
/* As put_tpel mc02, but rounding-averaged with the existing dst pixels. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            int p = (683 * (src[x] + 2 * src[x + stride] + 1)) >> 11;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
800

    
801
/* As put_tpel mc21, but rounding-averaged with the existing dst pixels. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            int p = (2731 * (3 * src[x] + 4 * src[x + 1] +
                             2 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
811

    
812
/* As put_tpel mc22, but rounding-averaged with the existing dst pixels. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            int p = (2731 * (2 * src[x] + 3 * src[x + 1] +
                             3 * src[x + stride] + 4 * src[x + stride + 1] + 6)) >> 15;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
822
#if 0
823
#define TPEL_WIDTH(width)\
824
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
825
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
826
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
827
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
828
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
829
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
830
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
831
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
832
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
833
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
834
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
835
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
836
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
837
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
838
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
839
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
840
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
841
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
842
#endif
843

    
844
#define QPEL_MC(r, OPNAME, RND, OP) \
845
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
846
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
847
    int i;\
848
    for(i=0; i<h; i++)\
849
    {\
850
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
851
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
852
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
853
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
854
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
855
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
856
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
857
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
858
        dst+=dstStride;\
859
        src+=srcStride;\
860
    }\
861
}\
862
\
863
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
864
    const int w=8;\
865
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
866
    int i;\
867
    for(i=0; i<w; i++)\
868
    {\
869
        const int src0= src[0*srcStride];\
870
        const int src1= src[1*srcStride];\
871
        const int src2= src[2*srcStride];\
872
        const int src3= src[3*srcStride];\
873
        const int src4= src[4*srcStride];\
874
        const int src5= src[5*srcStride];\
875
        const int src6= src[6*srcStride];\
876
        const int src7= src[7*srcStride];\
877
        const int src8= src[8*srcStride];\
878
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
879
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
880
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
881
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
882
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
883
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
884
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
885
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
886
        dst++;\
887
        src++;\
888
    }\
889
}\
890
\
891
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
892
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
893
    int i;\
894
    \
895
    for(i=0; i<h; i++)\
896
    {\
897
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
898
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
899
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
900
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
901
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
902
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
903
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
904
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
905
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
906
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
907
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
908
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
909
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
910
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
911
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
912
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
913
        dst+=dstStride;\
914
        src+=srcStride;\
915
    }\
916
}\
917
\
918
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
919
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
920
    int i;\
921
    const int w=16;\
922
    for(i=0; i<w; i++)\
923
    {\
924
        const int src0= src[0*srcStride];\
925
        const int src1= src[1*srcStride];\
926
        const int src2= src[2*srcStride];\
927
        const int src3= src[3*srcStride];\
928
        const int src4= src[4*srcStride];\
929
        const int src5= src[5*srcStride];\
930
        const int src6= src[6*srcStride];\
931
        const int src7= src[7*srcStride];\
932
        const int src8= src[8*srcStride];\
933
        const int src9= src[9*srcStride];\
934
        const int src10= src[10*srcStride];\
935
        const int src11= src[11*srcStride];\
936
        const int src12= src[12*srcStride];\
937
        const int src13= src[13*srcStride];\
938
        const int src14= src[14*srcStride];\
939
        const int src15= src[15*srcStride];\
940
        const int src16= src[16*srcStride];\
941
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
942
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
943
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
944
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
945
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
946
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
947
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
948
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
949
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
950
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
951
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
952
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
953
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
954
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
955
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
956
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
957
        dst++;\
958
        src++;\
959
    }\
960
}\
961
\
962
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
963
    uint8_t half[64];\
964
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
965
    OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
966
}\
967
\
968
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
969
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
970
}\
971
\
972
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
973
    uint8_t half[64];\
974
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
975
    OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
976
}\
977
\
978
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
979
    uint8_t full[16*9];\
980
    uint8_t half[64];\
981
    copy_block9(full, src, 16, stride, 9);\
982
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
983
    OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
984
}\
985
\
986
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
987
    uint8_t full[16*9];\
988
    copy_block9(full, src, 16, stride, 9);\
989
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
990
}\
991
\
992
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
993
    uint8_t full[16*9];\
994
    uint8_t half[64];\
995
    copy_block9(full, src, 16, stride, 9);\
996
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
997
    OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
998
}\
999
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1000
    uint8_t full[16*9];\
1001
    uint8_t halfH[72];\
1002
    uint8_t halfV[64];\
1003
    uint8_t halfHV[64];\
1004
    copy_block9(full, src, 16, stride, 9);\
1005
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1006
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1007
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1008
    OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1009
}\
1010
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1011
    uint8_t full[16*9];\
1012
    uint8_t halfH[72];\
1013
    uint8_t halfHV[64];\
1014
    copy_block9(full, src, 16, stride, 9);\
1015
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1016
    put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1017
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1018
    OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1019
}\
1020
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1021
    uint8_t full[16*9];\
1022
    uint8_t halfH[72];\
1023
    uint8_t halfV[64];\
1024
    uint8_t halfHV[64];\
1025
    copy_block9(full, src, 16, stride, 9);\
1026
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1027
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1028
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1029
    OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1030
}\
1031
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1032
    uint8_t full[16*9];\
1033
    uint8_t halfH[72];\
1034
    uint8_t halfHV[64];\
1035
    copy_block9(full, src, 16, stride, 9);\
1036
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1037
    put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1038
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1039
    OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1040
}\
1041
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1042
    uint8_t full[16*9];\
1043
    uint8_t halfH[72];\
1044
    uint8_t halfV[64];\
1045
    uint8_t halfHV[64];\
1046
    copy_block9(full, src, 16, stride, 9);\
1047
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1048
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1049
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1050
    OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1051
}\
1052
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1053
    uint8_t full[16*9];\
1054
    uint8_t halfH[72];\
1055
    uint8_t halfHV[64];\
1056
    copy_block9(full, src, 16, stride, 9);\
1057
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1058
    put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1059
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1060
    OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1061
}\
1062
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1063
    uint8_t full[16*9];\
1064
    uint8_t halfH[72];\
1065
    uint8_t halfV[64];\
1066
    uint8_t halfHV[64];\
1067
    copy_block9(full, src, 16, stride, 9);\
1068
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1069
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1070
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1071
    OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1072
}\
1073
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1074
    uint8_t full[16*9];\
1075
    uint8_t halfH[72];\
1076
    uint8_t halfHV[64];\
1077
    copy_block9(full, src, 16, stride, 9);\
1078
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1079
    put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1080
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1081
    OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1082
}\
1083
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1084
    uint8_t halfH[72];\
1085
    uint8_t halfHV[64];\
1086
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1087
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1088
    OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1089
}\
1090
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1091
    uint8_t halfH[72];\
1092
    uint8_t halfHV[64];\
1093
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1094
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1095
    OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1096
}\
1097
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1098
    uint8_t full[16*9];\
1099
    uint8_t halfH[72];\
1100
    uint8_t halfV[64];\
1101
    uint8_t halfHV[64];\
1102
    copy_block9(full, src, 16, stride, 9);\
1103
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1104
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1105
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1106
    OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1107
}\
1108
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1109
    uint8_t full[16*9];\
1110
    uint8_t halfH[72];\
1111
    copy_block9(full, src, 16, stride, 9);\
1112
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1113
    put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1114
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1115
}\
1116
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1117
    uint8_t full[16*9];\
1118
    uint8_t halfH[72];\
1119
    uint8_t halfV[64];\
1120
    uint8_t halfHV[64];\
1121
    copy_block9(full, src, 16, stride, 9);\
1122
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1123
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1124
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1125
    OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1126
}\
1127
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1128
    uint8_t full[16*9];\
1129
    uint8_t halfH[72];\
1130
    copy_block9(full, src, 16, stride, 9);\
1131
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1132
    put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1133
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1134
}\
1135
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1136
    uint8_t halfH[72];\
1137
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1138
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1139
}\
1140
\
1141
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1142
    uint8_t half[256];\
1143
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1144
    OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1145
}\
1146
\
1147
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1148
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1149
}\
1150
\
1151
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1152
    uint8_t half[256];\
1153
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1154
    OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1155
}\
1156
\
1157
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1158
    uint8_t full[24*17];\
1159
    uint8_t half[256];\
1160
    copy_block17(full, src, 24, stride, 17);\
1161
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1162
    OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1163
}\
1164
\
1165
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1166
    uint8_t full[24*17];\
1167
    copy_block17(full, src, 24, stride, 17);\
1168
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1169
}\
1170
\
1171
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1172
    uint8_t full[24*17];\
1173
    uint8_t half[256];\
1174
    copy_block17(full, src, 24, stride, 17);\
1175
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1176
    OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1177
}\
1178
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1179
    uint8_t full[24*17];\
1180
    uint8_t halfH[272];\
1181
    uint8_t halfV[256];\
1182
    uint8_t halfHV[256];\
1183
    copy_block17(full, src, 24, stride, 17);\
1184
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1185
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1186
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1187
    OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1188
}\
1189
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1190
    uint8_t full[24*17];\
1191
    uint8_t halfH[272];\
1192
    uint8_t halfHV[256];\
1193
    copy_block17(full, src, 24, stride, 17);\
1194
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1195
    put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1196
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1197
    OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1198
}\
1199
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1200
    uint8_t full[24*17];\
1201
    uint8_t halfH[272];\
1202
    uint8_t halfV[256];\
1203
    uint8_t halfHV[256];\
1204
    copy_block17(full, src, 24, stride, 17);\
1205
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1206
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1207
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1208
    OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1209
}\
1210
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1211
    uint8_t full[24*17];\
1212
    uint8_t halfH[272];\
1213
    uint8_t halfHV[256];\
1214
    copy_block17(full, src, 24, stride, 17);\
1215
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1216
    put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1217
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1218
    OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1219
}\
1220
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1221
    uint8_t full[24*17];\
1222
    uint8_t halfH[272];\
1223
    uint8_t halfV[256];\
1224
    uint8_t halfHV[256];\
1225
    copy_block17(full, src, 24, stride, 17);\
1226
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1227
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1228
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1229
    OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1230
}\
1231
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1232
    uint8_t full[24*17];\
1233
    uint8_t halfH[272];\
1234
    uint8_t halfHV[256];\
1235
    copy_block17(full, src, 24, stride, 17);\
1236
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1237
    put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1238
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1239
    OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1240
}\
1241
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1242
    uint8_t full[24*17];\
1243
    uint8_t halfH[272];\
1244
    uint8_t halfV[256];\
1245
    uint8_t halfHV[256];\
1246
    copy_block17(full, src, 24, stride, 17);\
1247
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1248
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1249
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1250
    OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1251
}\
1252
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1253
    uint8_t full[24*17];\
1254
    uint8_t halfH[272];\
1255
    uint8_t halfHV[256];\
1256
    copy_block17(full, src, 24, stride, 17);\
1257
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1258
    put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1259
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1260
    OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1261
}\
1262
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1263
    uint8_t halfH[272];\
1264
    uint8_t halfHV[256];\
1265
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1266
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1267
    OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1268
}\
1269
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1270
    uint8_t halfH[272];\
1271
    uint8_t halfHV[256];\
1272
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1273
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1274
    OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1275
}\
1276
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1277
    uint8_t full[24*17];\
1278
    uint8_t halfH[272];\
1279
    uint8_t halfV[256];\
1280
    uint8_t halfHV[256];\
1281
    copy_block17(full, src, 24, stride, 17);\
1282
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1283
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1284
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1285
    OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1286
}\
1287
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1288
    uint8_t full[24*17];\
1289
    uint8_t halfH[272];\
1290
    copy_block17(full, src, 24, stride, 17);\
1291
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1292
    put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1293
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1294
}\
1295
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1296
    uint8_t full[24*17];\
1297
    uint8_t halfH[272];\
1298
    uint8_t halfV[256];\
1299
    uint8_t halfHV[256];\
1300
    copy_block17(full, src, 24, stride, 17);\
1301
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1302
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1303
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1304
    OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1305
}\
1306
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1307
    uint8_t full[24*17];\
1308
    uint8_t halfH[272];\
1309
    copy_block17(full, src, 24, stride, 17);\
1310
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1311
    put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1312
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1313
}\
1314
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1315
    uint8_t halfH[272];\
1316
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1317
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1318
}
1319

    
1320
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1321
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1322
#define op_put(a, b) a = cm[((b) + 16)>>5]
1323
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1324

    
1325
QPEL_MC(0, put_       , _       , op_put)
1326
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1327
QPEL_MC(0, avg_       , _       , op_avg)
1328
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
1329
#undef op_avg
1330
#undef op_avg_no_rnd
1331
#undef op_put
1332
#undef op_put_no_rnd
1333

    
1334
#define put_qpel8_mc00_c  ff_put_pixels8x8_c
1335
#define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
1336
#define put_qpel16_mc00_c ff_put_pixels16x16_c
1337
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1338
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
1339
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
1340

    
1341
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1342
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1343
    int i;
1344

    
1345
    for(i=0; i<h; i++){
1346
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1347
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1348
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1349
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1350
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1351
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1352
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1353
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1354
        dst+=dstStride;
1355
        src+=srcStride;
1356
    }
1357
}
1358

    
1359
#if CONFIG_RV40_DECODER
/* RV40 (3/4,3/4) quarter-pel positions: delegate to the plain 8-bit
 * xy half-pel averaging copy helpers instead of the RV40 filter. */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */

static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1375
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1376
    int i;
1377

    
1378
    for(i=0; i<w; i++){
1379
        const int src_1= src[ -srcStride];
1380
        const int src0 = src[0          ];
1381
        const int src1 = src[  srcStride];
1382
        const int src2 = src[2*srcStride];
1383
        const int src3 = src[3*srcStride];
1384
        const int src4 = src[4*srcStride];
1385
        const int src5 = src[5*srcStride];
1386
        const int src6 = src[6*srcStride];
1387
        const int src7 = src[7*srcStride];
1388
        const int src8 = src[8*srcStride];
1389
        const int src9 = src[9*srcStride];
1390
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1391
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1392
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1393
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1394
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1395
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1396
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1397
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1398
        src++;
1399
        dst++;
1400
    }
1401
}
1402

    
1403
/* WMV2 MC, quarter-pel position (1,0): average of the source block and
 * its horizontally half-pel filtered version. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
}

/* WMV2 MC, half-pel position (2,0): horizontal lowpass only. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

/* WMV2 MC, quarter-pel position (3,0): average of the right-shifted
 * source (src+1) and the horizontally half-pel filtered version. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
}

/* WMV2 MC, half-pel position (0,2): vertical lowpass only. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

/* WMV2 MC, position (1,2): average of the vertically filtered source and
 * the horizontally-then-vertically filtered plane.
 * halfH holds 11 rows (one above, two below the block) so the vertical
 * pass applied at halfH+8 has its required neighbourhood. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}

/* WMV2 MC, position (3,2): like mc12 but the vertical-only plane is taken
 * from the right-shifted source (src+1). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}

/* WMV2 MC, half-pel position (2,2): horizontal then vertical lowpass.
 * halfH holds 11 filtered rows; halfH+8 skips the extra top row. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}

static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1448
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1449
    int x;
1450
    const int strength= ff_h263_loop_filter_strength[qscale];
1451

    
1452
    for(x=0; x<8; x++){
1453
        int d1, d2, ad1;
1454
        int p0= src[x-2*stride];
1455
        int p1= src[x-1*stride];
1456
        int p2= src[x+0*stride];
1457
        int p3= src[x+1*stride];
1458
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1459

    
1460
        if     (d<-2*strength) d1= 0;
1461
        else if(d<-  strength) d1=-2*strength - d;
1462
        else if(d<   strength) d1= d;
1463
        else if(d< 2*strength) d1= 2*strength - d;
1464
        else                   d1= 0;
1465

    
1466
        p1 += d1;
1467
        p2 -= d1;
1468
        if(p1&256) p1= ~(p1>>31);
1469
        if(p2&256) p2= ~(p2>>31);
1470

    
1471
        src[x-1*stride] = p1;
1472
        src[x+0*stride] = p2;
1473

    
1474
        ad1= FFABS(d1)>>1;
1475

    
1476
        d2= av_clip((p0-p3)/4, -ad1, ad1);
1477

    
1478
        src[x-2*stride] = p0 - d2;
1479
        src[x+  stride] = p3 + d2;
1480
    }
1481
    }
1482
}
1483

    
1484
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1485
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1486
    int y;
1487
    const int strength= ff_h263_loop_filter_strength[qscale];
1488

    
1489
    for(y=0; y<8; y++){
1490
        int d1, d2, ad1;
1491
        int p0= src[y*stride-2];
1492
        int p1= src[y*stride-1];
1493
        int p2= src[y*stride+0];
1494
        int p3= src[y*stride+1];
1495
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1496

    
1497
        if     (d<-2*strength) d1= 0;
1498
        else if(d<-  strength) d1=-2*strength - d;
1499
        else if(d<   strength) d1= d;
1500
        else if(d< 2*strength) d1= 2*strength - d;
1501
        else                   d1= 0;
1502

    
1503
        p1 += d1;
1504
        p2 -= d1;
1505
        if(p1&256) p1= ~(p1>>31);
1506
        if(p2&256) p2= ~(p2>>31);
1507

    
1508
        src[y*stride-1] = p1;
1509
        src[y*stride+0] = p2;
1510

    
1511
        ad1= FFABS(d1)>>1;
1512

    
1513
        d2= av_clip((p0-p3)/4, -ad1, ad1);
1514

    
1515
        src[y*stride-2] = p0 - d2;
1516
        src[y*stride+1] = p3 + d2;
1517
    }
1518
    }
1519
}
1520

    
1521
/**
 * H.261 in-loop filter: in-place separable [1 2 1] smoothing of one 8x8
 * block. Border rows and columns get identity taps (scaled by 4 in the
 * intermediate and shifted back out), so only the 6x6 interior is
 * smoothed in both directions.
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64]; /* vertically filtered block, values scaled by 4 */

    /* vertical pass: rows 0 and 7 pass through, rows 1..6 get [1 2 1] */
    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    /* horizontal pass back into src, with rounding: columns 0 and 7 are
     * only rescaled, columns 1..6 get [1 2 1] on the temp values */
    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}

/**
 * Sum of absolute differences between two 16-pixel-wide blocks of height h.
 * The context pointer v is unused in the C version.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col;
    int sum = 0;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1575

    
1576
/**
 * SAD of pix1 against pix2 interpolated half a pixel to the right
 * (horizontal half-pel motion), 16 pixels wide, height h.
 * Reads one pixel past column 15 of pix2.
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col;
    int sum = 0;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1603

    
1604
/**
 * SAD of pix1 against pix2 interpolated half a pixel downwards
 * (vertical half-pel motion), 16 pixels wide, height h.
 * Reads one row below the last row of pix2.
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col;
    int sum = 0;
    uint8_t *below = pix2 + line_size;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
1633

    
1634
/**
 * SAD of pix1 against pix2 interpolated half a pixel right and half a pixel
 * down (diagonal half-pel motion), 16 pixels wide, height h.
 * Reads one extra column and one extra row of pix2.
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col;
    int sum = 0;
    uint8_t *below = pix2 + line_size;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                        below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
1663

    
1664
/**
 * Sum of absolute differences between two 8-pixel-wide blocks of height h.
 * The context pointer v is unused in the C version.
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col;
    int sum = 0;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1683

    
1684
/**
 * SAD of pix1 against pix2 interpolated half a pixel to the right,
 * 8 pixels wide, height h. Reads one pixel past column 7 of pix2.
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col;
    int sum = 0;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1703

    
1704
/**
 * SAD of pix1 against pix2 interpolated half a pixel downwards,
 * 8 pixels wide, height h. Reads one row below the last row of pix2.
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col;
    int sum = 0;
    uint8_t *below = pix2 + line_size;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
1725

    
1726
/**
 * SAD of pix1 against pix2 interpolated half a pixel right and half a pixel
 * down, 8 pixels wide, height h. Reads one extra column and row of pix2.
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col;
    int sum = 0;
    uint8_t *below = pix2 + line_size;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                        below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
1747

    
1748
/**
 * Noise-shaped SSE over a 16-wide block: plain SSE (score1) plus a weighted
 * penalty (score2) for how much the local 2x2 gradient magnitudes of s1 and
 * s2 differ. With a NULL context the weight defaults to 8.
 */
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;  /* may be NULL */
    int score1=0;  /* sum of squared errors */
    int score2=0;  /* signed difference of gradient magnitudes */
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                /* |second difference of s1| - |second difference of s2|
                 * over each 2x2 neighbourhood */
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
1773

    
1774
/**
 * 8-wide variant of nsse16_c: SSE plus a weighted penalty for differing
 * local gradient structure. With a NULL context the weight defaults to 8.
 */
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;  /* may be NULL */
    int score1=0;  /* sum of squared errors */
    int score2=0;  /* signed difference of gradient magnitudes */
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
1799

    
1800
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1801
    int i;
1802
    unsigned int sum=0;
1803

    
1804
    for(i=0; i<8*8; i++){
1805
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1806
        int w= weight[i];
1807
        b>>= RECON_SHIFT;
1808
        assert(-512<b && b<512);
1809

    
1810
        sum += (w*b)*(w*b)>>4;
1811
    }
1812
    return sum>>2;
1813
}
1814

    
1815
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1816
    int i;
1817

    
1818
    for(i=0; i<8*8; i++){
1819
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1820
    }
1821
}
1822

    
1823
/**
 * Permute an 8x8 block in place.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector (new position for each raster index)
 * @param scantable the used scantable; only used to speed the permutation up, the block is not
 *                  (inverse) permutated to scantable order!
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];

    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations

    /* gather the coefficients up to 'last' (scan order) and clear them */
    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }

    /* scatter them back at their permuted positions */
    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}
1851

    
1852
/* Comparison function for FF_CMP_ZERO: ignores its inputs and always
 * reports a perfect match. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    (void)s; (void)a; (void)b; (void)stride; (void)h;  /* intentionally unused */
    return 0;
}
1855

    
1856
/**
 * Fill the 6 comparison-function slots in *cmp from the DSPContext according
 * to the FF_CMP_* metric selected by the low byte of type. Unknown values
 * log an error and leave the slots zeroed (they are cleared up front).
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_DWT
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
1915

    
1916
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1917
    long i;
1918
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1919
        long a = *(long*)(src+i);
1920
        long b = *(long*)(dst+i);
1921
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1922
    }
1923
    for(; i<w; i++)
1924
        dst[i+0] += src[i+0];
1925
}
1926

    
1927
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1928
    long i;
1929
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1930
        long a = *(long*)(src1+i);
1931
        long b = *(long*)(src2+i);
1932
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1933
    }
1934
    for(; i<w; i++)
1935
        dst[i] = src1[i]+src2[i];
1936
}
1937

    
1938
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1939
    long i;
1940
#if !HAVE_FAST_UNALIGNED
1941
    if((long)src2 & (sizeof(long)-1)){
1942
        for(i=0; i+7<w; i+=8){
1943
            dst[i+0] = src1[i+0]-src2[i+0];
1944
            dst[i+1] = src1[i+1]-src2[i+1];
1945
            dst[i+2] = src1[i+2]-src2[i+2];
1946
            dst[i+3] = src1[i+3]-src2[i+3];
1947
            dst[i+4] = src1[i+4]-src2[i+4];
1948
            dst[i+5] = src1[i+5]-src2[i+5];
1949
            dst[i+6] = src1[i+6]-src2[i+6];
1950
            dst[i+7] = src1[i+7]-src2[i+7];
1951
        }
1952
    }else
1953
#endif
1954
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1955
        long a = *(long*)(src1+i);
1956
        long b = *(long*)(src2+i);
1957
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1958
    }
1959
    for(; i<w; i++)
1960
        dst[i+0] = src1[i+0]-src2[i+0];
1961
}
1962

    
1963
/**
 * HuffYUV median prediction, decode direction: reconstruct each output byte
 * as median(left, src1[i], left + src1[i] - left_top) plus the stored
 * residual diff[i]. src1 is presumably the previous (top) row — confirm with
 * callers. *left / *left_top carry the running left and top-left samples
 * across calls and are updated on return.
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        /* gradient predictor l + top - topleft, wrapped to a byte, bounded
         * by mid_pred to lie between l and the top sample */
        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
        lt= src1[i];
        dst[i]= l;
    }

    *left= l;
    *left_top= lt;
}
1979

    
1980
/**
 * HuffYUV median prediction, encode direction: emit the residual of src2
 * against median(left, src1[i], left + src1[i] - left_top). src1 is
 * presumably the previous (top) row and src2 the current row — confirm with
 * callers. *left / *left_top carry state across calls and are updated on
 * return.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;  /* residual, truncated to a byte */
    }

    *left= l;
    *left_top= lt;
}
1997

    
1998
/**
 * HuffYUV left prediction: write the running sum of src (seeded with acc)
 * into dst; each output byte is the accumulator truncated to 8 bits.
 * Returns the final accumulator value so rows can be chained.
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    for (i = 0; i < w; i++) {
        acc += src[i];
        dst[i] = acc;
    }

    return acc;
}
2016

    
2017
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * HuffYUV left prediction over packed 32-bit BGRA pixels: each channel keeps
 * its own running sum, written back to dst byte by byte (implicitly
 * truncated to 8 bits). The red/green/blue/alpha in-out pointers carry the
 * channel accumulators across calls.
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int i;
    int r = *red, g = *green, b = *blue, a = *alpha;

    for (i = 0; i < w; i++) {
        const uint8_t *px  = src + 4*i;
        uint8_t       *out = dst + 4*i;

        b += px[B];
        g += px[G];
        r += px[R];
        a += px[A];

        out[B] = b;
        out[G] = g;
        out[R] = r;
        out[A] = a;
    }

    *red   = r;
    *green = g;
    *blue  = b;
    *alpha = a;
}
#undef B
#undef G
#undef R
#undef A
2057

    
2058
/* 2-point butterfly: write sum and difference of i1,i2 into o1,o2 */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* in-place 2-point butterfly on x,y */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* |x+y| + |x-y|: final butterfly stage fused with absolute-value summing */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
2072

    
2073
/**
 * 8x8 SATD: 2-D Hadamard transform of the src-dst difference, returning the
 * sum of absolute transform coefficients. h must be 8.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal 8-point Hadamard per row of the difference block */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical 8-point Hadamard per column; the last stage is fused with the
     * absolute-value accumulation via BUTTERFLYA */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
2124

    
2125
/**
 * Intra 8x8 SATD: 2-D Hadamard transform of src itself (no reference),
 * summing absolute coefficients; the DC term is subtracted at the end so the
 * block mean does not dominate the score. h must be 8.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal 8-point Hadamard per row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass, last stage fused with |.| accumulation */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
    return sum;
}
2172

    
2173
/**
 * DCT-domain SAD: forward-transform the src1-src2 difference and return the
 * sum of absolute DCT coefficients. h must be 8.
 */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}
2183

    
2184
#if CONFIG_GPL
2185
#define DCT8_1D {\
2186
    const int s07 = SRC(0) + SRC(7);\
2187
    const int s16 = SRC(1) + SRC(6);\
2188
    const int s25 = SRC(2) + SRC(5);\
2189
    const int s34 = SRC(3) + SRC(4);\
2190
    const int a0 = s07 + s34;\
2191
    const int a1 = s16 + s25;\
2192
    const int a2 = s07 - s34;\
2193
    const int a3 = s16 - s25;\
2194
    const int d07 = SRC(0) - SRC(7);\
2195
    const int d16 = SRC(1) - SRC(6);\
2196
    const int d25 = SRC(2) - SRC(5);\
2197
    const int d34 = SRC(3) - SRC(4);\
2198
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
2199
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
2200
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
2201
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
2202
    DST(0,  a0 + a1     ) ;\
2203
    DST(1,  a4 + (a7>>2)) ;\
2204
    DST(2,  a2 + (a3>>1)) ;\
2205
    DST(3,  a5 + (a6>>2)) ;\
2206
    DST(4,  a0 - a1     ) ;\
2207
    DST(5,  a6 - (a5>>2)) ;\
2208
    DST(6, (a2>>1) - a3 ) ;\
2209
    DST(7, (a4>>2) - a7 ) ;\
2210
}
2211

    
2212
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2213
    MpegEncContext * const s= (MpegEncContext *)c;
2214
    DCTELEM dct[8][8];
2215
    int i;
2216
    int sum=0;
2217

    
2218
    s->dsp.diff_pixels(dct[0], src1, src2, stride);
2219

    
2220
#define SRC(x) dct[i][x]
2221
#define DST(x,v) dct[i][x]= v
2222
    for( i = 0; i < 8; i++ )
2223
        DCT8_1D
2224
#undef SRC
2225
#undef DST
2226

    
2227
#define SRC(x) dct[x][i]
2228
#define DST(x,v) sum += FFABS(v)
2229
    for( i = 0; i < 8; i++ )
2230
        DCT8_1D
2231
#undef SRC
2232
#undef DST
2233
    return sum;
2234
}
2235
#endif
2236

    
2237
/**
 * DCT-max metric: forward-transform the src1-src2 difference and return the
 * largest absolute DCT coefficient. h must be 8.
 */
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}
2252

    
2253
/**
 * Quantization-noise metric: round-trip the pixel difference through the
 * encoder's quantizer (fast_dct_quantize, presumably including the forward
 * DCT — see mpegvideo), dequantize and inverse-transform it, and return the
 * SSE against the untouched copy. h must be 8.
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
    DCTELEM * const bak = temp+64;  /* pristine copy of the difference */
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;  /* force inter-style quantization */

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
2275

    
2276
/**
 * Rate-distortion metric for an 8x8 block: quantize the difference, count
 * the VLC bits needed to code the coefficients, reconstruct the block, and
 * return distortion + lambda-weighted rate (lambda derived from qscale).
 * h must be 8.
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* work on local copies so the reconstruction does not touch the input */
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* select the VLC length tables; intra codes the DC separately */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* count bits for all (run, level) pairs; levels outside -64..63 take the
     * escape-code length */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;  /* bias so the table index is non-negative */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);  /* the last coefficient must be non-zero */

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* reconstruct and measure the distortion */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
2351

    
2352
/**
 * Rate-only metric for an 8x8 block: quantize the src1-src2 difference and
 * return the number of VLC bits needed to code the coefficients (no
 * reconstruction / distortion term, unlike rd8x8_c). h must be 8.
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* select the VLC length tables; intra codes the DC separately */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* count bits for all (run, level) pairs; levels outside -64..63 take the
     * escape-code length */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;  /* bias so the table index is non-negative */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);  /* the last coefficient must be non-zero */

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
2410

    
2411
/* Intra vertical SAD: sum of absolute differences between vertically
 * adjacent samples over h-1 row pairs of a size-wide block (measures
 * vertical activity of the block itself; dummy/c are unused).
 * Instantiated as vsad_intra8_c and vsad_intra16_c. */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
2428

    
2429
/**
 * Vertical SAD between two blocks: sums |vertical difference of (s1-s2)|
 * over h-1 row pairs, 16 pixels wide — penalizes differing vertical
 * structure rather than plain sample error.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
2443

    
2444
#define SQ(a) ((a)*(a))

/**
 * Intra "vertical SSE": sum of squared differences between each row and
 * the row directly below it, over `size` columns and h-1 row pairs.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int sum = 0;                                                             \
    int row, col;                                                            \
                                                                             \
    for (row = 1; row < h; row++) {                                          \
        for (col = 0; col < size; col += 4) {                                \
            sum += SQ(s[col    ] - s[col     + stride])                      \
                 + SQ(s[col + 1] - s[col + 1 + stride])                      \
                 + SQ(s[col + 2] - s[col + 2 + stride])                      \
                 + SQ(s[col + 3] - s[col + 3 + stride]);                     \
        }                                                                    \
        s += stride;                                                         \
    }                                                                        \
                                                                             \
    return sum;                                                              \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
2462

    
2463
/**
 * Inter "vertical SSE": sum of squared second-order vertical differences
 * of the residual s1 - s2 over a 16-wide block.
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int sum = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += SQ(s1[col] - s2[col] - s1[col + stride] + s2[col + stride]);
        s1 += stride;
        s2 += stride;
    }

    return sum;
}
2477

    
2478
/**
 * Sum of squared differences between an int8 vector and an int16 vector.
 *
 * @param pix1 first operand (8-bit signed)
 * @param pix2 second operand (16-bit signed)
 * @param size number of elements
 * @return sum over i of (pix1[i]-pix2[i])^2
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int sum = 0;
    int n;

    for (n = 0; n < size; n++) {
        const int d = pix1[n] - pix2[n];
        sum += d * d;
    }
    return sum;
}
2486

    
2487
/* Instantiate 16x16 comparison functions from their 8x8 kernels.
 * NOTE(review): WRAPPER8_16_SQ is defined elsewhere in this file;
 * presumably it sums the 8x8 metric over the four 8x8 quadrants of a
 * 16x16 block -- confirm against the macro definition. */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2497

    
2498
/** Element-wise product of two float vectors: dst[i] = src0[i] * src1[i]. */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int n;
    for (n = 0; n < len; n++)
        dst[n] = src0[n] * src1[n];
}
2503

    
2504
/** Element-wise product with src1 traversed backwards:
 *  dst[i] = src0[i] * src1[len-1-i]. */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int n;
    const float *rev = src1 + len - 1;
    for (n = 0; n < len; n++)
        dst[n] = src0[n] * rev[-n];
}
2510

    
2511
/** Fused multiply-add over vectors: dst[i] = src0[i] * src1[i] + src2[i]. */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int n;
    for (n = 0; n < len; n++)
        dst[n] = src0[n] * src1[n] + src2[n];
}
2516

    
2517
/**
 * Overlap-add windowing, as used by MDCT-based codecs: combines len
 * samples from the tail of src0 with len samples of src1 (reversed)
 * through the window win, writing 2*len outputs centred at dst+len.
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int a, b;

    /* re-base pointers so indices run over [-len, len) */
    dst  += len;
    win  += len;
    src0 += len;

    for (a = -len, b = len - 1; a < 0; a++, b--) {
        const float x0 = src0[a];
        const float x1 = src1[b];
        const float wa = win[a];
        const float wb = win[b];
        dst[a] = x0 * wb - x1 * wa;
        dst[b] = x0 * wa + x1 * wb;
    }
}
2533

    
2534
/** Scale a float vector by a scalar: dst[i] = src[i] * mul. */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int n;
    for (n = 0; n < len; n++)
        dst[n] = src[n] * mul;
}
2541

    
2542
/**
 * Multiply src by a scalar and, pairwise, by short vectors of length 2:
 * dst[2k+j] = src[2k+j] * sv[k][j] * mul for j in {0,1}.
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int n;
    for (n = 0; n < len; n += 2, sv++) {
        const float *v = sv[0];
        dst[n    ] = src[n    ] * v[0] * mul;
        dst[n + 1] = src[n + 1] * v[1] * mul;
    }
}
2551

    
2552
/**
 * Multiply src by a scalar and, per group of 4, by short vectors of
 * length 4: dst[4k+j] = src[4k+j] * sv[k][j] * mul for j in 0..3.
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int n;
    for (n = 0; n < len; n += 4, sv++) {
        const float *v = sv[0];
        dst[n    ] = src[n    ] * v[0] * mul;
        dst[n + 1] = src[n + 1] * v[1] * mul;
        dst[n + 2] = src[n + 2] * v[2] * mul;
        dst[n + 3] = src[n + 3] * v[3] * mul;
    }
}
2563

    
2564
/** Expand length-2 short vectors scaled by mul: dst[2k+j] = sv[k][j] * mul. */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int n;
    for (n = 0; n < len; n += 2, sv++) {
        const float *v = sv[0];
        dst[n    ] = v[0] * mul;
        dst[n + 1] = v[1] * mul;
    }
}
2573

    
2574
/** Expand length-4 short vectors scaled by mul: dst[4k+j] = sv[k][j] * mul. */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int n;
    for (n = 0; n < len; n += 4, sv++) {
        const float *v = sv[0];
        dst[n    ] = v[0] * mul;
        dst[n + 1] = v[1] * mul;
        dst[n + 2] = v[2] * mul;
        dst[n + 3] = v[3] * mul;
    }
}
2585

    
2586
/**
 * In-place butterfly: for each i, (v1[i], v2[i]) becomes
 * (v1[i] + v2[i], v1[i] - v2[i]).
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int n;
    for (n = 0; n < len; n++) {
        const float a = v1[n];
        const float b = v2[n];
        v1[n] = a + b;
        v2[n] = a - b;
    }
}
2596

    
2597
/** Dot product of two float vectors. */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    int n;
    float acc = 0.0;

    for (n = 0; n < len; n++)
        acc += v1[n] * v2[n];

    return acc;
}
2607

    
2608
/**
 * Clip one float, given as raw IEEE-754 bits, to [min, max] where
 * min < 0 < max.  In the unsigned-bit domain, any value below min
 * (negative with larger magnitude) compares greater than mini, and any
 * value above max compares greater than maxisign after flipping the
 * sign bit.
 *
 * Fix: the sign-bit mask was written as 1<<31 -- left-shifting 1 into
 * the sign bit of a signed int is undefined behavior in C; use 1U<<31.
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    if(a > mini) return mini;
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}

/**
 * Clip a float vector to [*min, *max]; valid only when *min < 0 < *max
 * (see vector_clipf_c, which checks that before dispatching here).
 * len must be a multiple of 8.
 * NOTE(review): reads the floats through uint32_t pointers, which
 * violates strict aliasing; kept as-is to match the rest of the file --
 * a union or memcpy would be the conforming alternative.
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
2635
/**
 * Clip a float vector to [min, max].  len must be a multiple of 8.
 * When the bounds straddle zero, the faster bit-domain variant is used.
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        int n, k;
        for (n = 0; n < len; n += 8)
            for (k = 0; k < 8; k++)
                dst[n + k] = av_clipf(src[n + k], min, max);
    }
}
2652

    
2653
/**
 * Dot product of two int16 vectors, with each partial product shifted
 * right by `shift` before accumulation.
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int n;
    int acc = 0;

    for (n = 0; n < order; n++)
        acc += (v1[n] * v2[n]) >> shift;

    return acc;
}
2662

    
2663
/**
 * Returns the dot product of v1 and v2, and in the same pass updates
 * v1 in place: v1[i] += mul * v3[i].
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int n;
    int acc = 0;
    for (n = 0; n < order; n++) {
        acc   += v1[n] * v2[n];
        v1[n] += mul * v3[n];
    }
    return acc;
}
2672

    
2673
/**
 * Apply a symmetric half-window to int16 samples with Q15 rounding:
 * window[i] multiplies both input[i] and its mirror input[len-1-i].
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    int n;
    const int half = len >> 1;

    for (n = 0; n < half; n++) {
        const int16_t w = window[n];
        output[n]           = (MUL16(input[n],           w) + (1 << 14)) >> 15;
        output[len - n - 1] = (MUL16(input[len - n - 1], w) + (1 << 14)) >> 15;
    }
}
2685

    
2686
/* Fixed-point cosine constants for the 8-point WMV2 IDCT, scaled by 2048. */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/* In-place 8-point IDCT on one row of the coefficient block.
 * Step 1 forms the odd/even butterfly terms, step 2 the two
 * sqrt(1/2)-rotated terms (181/256 ~= 1/sqrt(2)), step 3 combines
 * them and rounds the result back down by 8 bits. */
static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1*/
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}
/* In-place 8-point IDCT on one column (stride 8 between elements).
 * Same flow as the row pass but step 1 keeps 3 extra bits of
 * precision (rounding shift >>3), compensated by the larger >>14
 * final shift in step 3. */
static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1, with extended precision*/
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;

    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
}
2748
/** Full 2-D WMV2 IDCT of an 8x8 block: row pass, then column pass. */
void ff_wmv2_idct_c(short * block){
    int n;

    for (n = 0; n < 64; n += 8)
        wmv2_idct_row(block + n);
    for (n = 0; n < 8; n++)
        wmv2_idct_col(block + n);
}
2758
/* XXX: those functions should be suppressed ASAP when all IDCTs are
 converted */
/* WMV2 IDCT then store: transform the coefficient block in place and
 * write the clamped pixels to dest. */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    ff_put_pixels_clamped_c(block, dest, line_size);
}
/* WMV2 IDCT then add: transform in place and add the clamped result
 * onto the existing pixels in dest. */
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    ff_add_pixels_clamped_c(block, dest, line_size);
}
2770
/* Reference (jpeg) 8x8 IDCT then store clamped pixels to dest. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    ff_put_pixels_clamped_c(block, dest, line_size);
}
/* Reference (jpeg) 8x8 IDCT then add clamped result onto dest. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    ff_add_pixels_clamped_c(block, dest, line_size);
}
2780

    
2781
/* 4x4 reduced-resolution IDCT + store, used for lowres==1 decoding. */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/* 4x4 reduced-resolution IDCT + add, used for lowres==1 decoding. */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
2791

    
2792
/* 2x2 reduced-resolution IDCT + store, used for lowres==2 decoding. */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/* 2x2 reduced-resolution IDCT + add, used for lowres==2 decoding. */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
2802

    
2803
/* 1x1 "IDCT" (DC only) + store, used for lowres==3 decoding: the DC
 * coefficient is rounded down by 3 bits and clipped to [0,255]. */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
/* 1x1 "IDCT" (DC only) + add, used for lowres==3 decoding. */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
2815

    
2816
/** No-op fallback for DSPContext.prefetch: ignores all arguments. */
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { }
2817

    
2818
/* init static data */
/* One-time initialization of the file-scope lookup tables shared by all
 * DSP contexts.  Must run before any function that reads ff_cropTbl or
 * ff_squareTbl. */
av_cold void dsputil_static_init(void)
{
    int i;

    /* ff_cropTbl: clip-to-[0,255] table indexed by value + MAX_NEG_CROP;
     * the MAX_NEG_CROP margins on both sides saturate out-of-range input */
    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        ff_cropTbl[i] = 0;
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
    }

    /* ff_squareTbl[i] = (i-256)^2, for squared-error lookups on differences
     * in [-256, 255] */
    for(i=0;i<512;i++) {
        ff_squareTbl[i] = (i - 256) * (i - 256);
    }

    /* inverse zigzag scan table, stored with a +1 bias
     * NOTE(review): presumably 0 marks "not yet coded" entries for the
     * encoder -- confirm at the users of inv_zigzag_direct16 */
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
}
2835

    
2836
/* Check that the compiler 16-byte aligns stack variables, as the SIMD
 * code requires.  Returns 0 when alignment is correct, -1 otherwise;
 * the error is logged only on the first failure. */
int ff_check_alignment(void){
    static int did_fail=0;
    DECLARE_ALIGNED(16, int, aligned);

    /* low 4 address bits non-zero => the 16-byte alignment request
     * was not honored */
    if((intptr_t)&aligned & 15){
        if(!did_fail){
#if HAVE_MMX || HAVE_ALTIVEC
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
                "Do not report crashes to FFmpeg developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}
2855

    
2856
av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2857
{
2858
    int i;
2859

    
2860
    ff_check_alignment();
2861

    
2862
#if CONFIG_ENCODERS
2863
    if(avctx->dct_algo==FF_DCT_FASTINT) {
2864
        c->fdct = fdct_ifast;
2865
        c->fdct248 = fdct_ifast248;
2866
    }
2867
    else if(avctx->dct_algo==FF_DCT_FAAN) {
2868
        c->fdct = ff_faandct;
2869
        c->fdct248 = ff_faandct248;
2870
    }
2871
    else {
2872
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
2873
        c->fdct248 = ff_fdct248_islow;
2874
    }
2875
#endif //CONFIG_ENCODERS
2876

    
2877
    if(avctx->lowres==1){
2878
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
2879
            c->idct_put= ff_jref_idct4_put;
2880
            c->idct_add= ff_jref_idct4_add;
2881
        }else{
2882
            if (avctx->codec_id != CODEC_ID_H264) {
2883
                c->idct_put= ff_h264_lowres_idct_put_8_c;
2884
                c->idct_add= ff_h264_lowres_idct_add_8_c;
2885
            } else {
2886
                switch (avctx->bits_per_raw_sample) {
2887
                    case 9:
2888
                        c->idct_put= ff_h264_lowres_idct_put_9_c;
2889
                        c->idct_add= ff_h264_lowres_idct_add_9_c;
2890
                        break;
2891
                    case 10:
2892
                        c->idct_put= ff_h264_lowres_idct_put_10_c;
2893
                        c->idct_add= ff_h264_lowres_idct_add_10_c;
2894
                        break;
2895
                    default:
2896
                        c->idct_put= ff_h264_lowres_idct_put_8_c;
2897
                        c->idct_add= ff_h264_lowres_idct_add_8_c;
2898
                }
2899
            }
2900
        }
2901
        c->idct    = j_rev_dct4;
2902
        c->idct_permutation_type= FF_NO_IDCT_PERM;
2903
    }else if(avctx->lowres==2){
2904
        c->idct_put= ff_jref_idct2_put;
2905
        c->idct_add= ff_jref_idct2_add;
2906
        c->idct    = j_rev_dct2;
2907
        c->idct_permutation_type= FF_NO_IDCT_PERM;
2908
    }else if(avctx->lowres==3){
2909
        c->idct_put= ff_jref_idct1_put;
2910
        c->idct_add= ff_jref_idct1_add;
2911
        c->idct    = j_rev_dct1;
2912
        c->idct_permutation_type= FF_NO_IDCT_PERM;
2913
    }else{
2914
        if(avctx->idct_algo==FF_IDCT_INT){
2915
            c->idct_put= ff_jref_idct_put;
2916
            c->idct_add= ff_jref_idct_add;
2917
            c->idct    = j_rev_dct;
2918
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2919
        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2920
                avctx->idct_algo==FF_IDCT_VP3){
2921
            c->idct_put= ff_vp3_idct_put_c;
2922
            c->idct_add= ff_vp3_idct_add_c;
2923
            c->idct    = ff_vp3_idct_c;
2924
            c->idct_permutation_type= FF_NO_IDCT_PERM;
2925
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
2926
            c->idct_put= ff_wmv2_idct_put_c;
2927
            c->idct_add= ff_wmv2_idct_add_c;
2928
            c->idct    = ff_wmv2_idct_c;
2929
            c->idct_permutation_type= FF_NO_IDCT_PERM;
2930
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
2931
            c->idct_put= ff_faanidct_put;
2932
            c->idct_add= ff_faanidct_add;
2933
            c->idct    = ff_faanidct;
2934
            c->idct_permutation_type= FF_NO_IDCT_PERM;
2935
        }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
2936
            c->idct_put= ff_ea_idct_put_c;
2937
            c->idct_permutation_type= FF_NO_IDCT_PERM;
2938
        }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
2939
            c->idct     = ff_bink_idct_c;
2940
            c->idct_add = ff_bink_idct_add_c;
2941
            c->idct_put = ff_bink_idct_put_c;
2942
            c->idct_permutation_type = FF_NO_IDCT_PERM;
2943
        }else{ //accurate/default
2944
            c->idct_put= ff_simple_idct_put;
2945
            c->idct_add= ff_simple_idct_add;
2946
            c->idct    = ff_simple_idct;
2947
            c->idct_permutation_type= FF_NO_IDCT_PERM;
2948
        }
2949
    }
2950

    
2951
    c->get_pixels = get_pixels_c;
2952
    c->diff_pixels = diff_pixels_c;
2953
    c->put_pixels_clamped = ff_put_pixels_clamped_c;
2954
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2955
    c->put_pixels_nonclamped = put_pixels_nonclamped_c;
2956
    c->add_pixels_clamped = ff_add_pixels_clamped_c;
2957
    c->sum_abs_dctelem = sum_abs_dctelem_c;
2958
    c->gmc1 = gmc1_c;
2959
    c->gmc = ff_gmc_c;
2960
    c->pix_sum = pix_sum_c;
2961
    c->pix_norm1 = pix_norm1_c;
2962

    
2963
    c->fill_block_tab[0] = fill_block16_c;
2964
    c->fill_block_tab[1] = fill_block8_c;
2965
    c->scale_block = scale_block_c;
2966

    
2967
    /* TODO [0] 16  [1] 8 */
2968
    c->pix_abs[0][0] = pix_abs16_c;
2969
    c->pix_abs[0][1] = pix_abs16_x2_c;
2970
    c->pix_abs[0][2] = pix_abs16_y2_c;
2971
    c->pix_abs[0][3] = pix_abs16_xy2_c;
2972
    c->pix_abs[1][0] = pix_abs8_c;
2973
    c->pix_abs[1][1] = pix_abs8_x2_c;
2974
    c->pix_abs[1][2] = pix_abs8_y2_c;
2975
    c->pix_abs[1][3] = pix_abs8_xy2_c;
2976

    
2977
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2978
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2979
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2980
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2981
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2982
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2983
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2984
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2985
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2986

    
2987
    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2988
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2989
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2990
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2991
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2992
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2993
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2994
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2995
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2996

    
2997
#define dspfunc(PFX, IDX, NUM) \
2998
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2999
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3000
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3001
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3002
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3003
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3004
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3005
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3006
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3007
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3008
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3009
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3010
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3011
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3012
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3013
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3014

    
3015
    dspfunc(put_qpel, 0, 16);
3016
    dspfunc(put_no_rnd_qpel, 0, 16);
3017

    
3018
    dspfunc(avg_qpel, 0, 16);
3019
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3020

    
3021
    dspfunc(put_qpel, 1, 8);
3022
    dspfunc(put_no_rnd_qpel, 1, 8);
3023

    
3024
    dspfunc(avg_qpel, 1, 8);
3025
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3026

    
3027
#undef dspfunc
3028

    
3029
#if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
3030
    ff_mlp_init(c, avctx);
3031
#endif
3032
#if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
3033
    ff_intrax8dsp_init(c,avctx);
3034
#endif
3035
#if CONFIG_RV30_DECODER
3036
    ff_rv30dsp_init(c,avctx);
3037
#endif
3038
#if CONFIG_RV40_DECODER
3039
    ff_rv40dsp_init(c,avctx);
3040
    c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
3041
    c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
3042
    c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
3043
    c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
3044
#endif
3045

    
3046
    c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
3047
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3048
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3049
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3050
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3051
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3052
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3053
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3054

    
3055
#define SET_CMP_FUNC(name) \
3056
    c->name[0]= name ## 16_c;\
3057
    c->name[1]= name ## 8x8_c;
3058

    
3059
    SET_CMP_FUNC(hadamard8_diff)
3060
    c->hadamard8_diff[4]= hadamard8_intra16_c;
3061
    c->hadamard8_diff[5]= hadamard8_intra8x8_c;
3062
    SET_CMP_FUNC(dct_sad)
3063
    SET_CMP_FUNC(dct_max)
3064
#if CONFIG_GPL
3065
    SET_CMP_FUNC(dct264_sad)
3066
#endif
3067
    c->sad[0]= pix_abs16_c;
3068
    c->sad[1]= pix_abs8_c;
3069
    c->sse[0]= sse16_c;
3070
    c->sse[1]= sse8_c;
3071
    c->sse[2]= sse4_c;
3072
    SET_CMP_FUNC(quant_psnr)
3073
    SET_CMP_FUNC(rd)
3074
    SET_CMP_FUNC(bit)
3075
    c->vsad[0]= vsad16_c;
3076
    c->vsad[4]= vsad_intra16_c;
3077
    c->vsad[5]= vsad_intra8_c;
3078
    c->vsse[0]= vsse16_c;
3079
    c->vsse[4]= vsse_intra16_c;
3080
    c->vsse[5]= vsse_intra8_c;
3081
    c->nsse[0]= nsse16_c;
3082
    c->nsse[1]= nsse8_c;
3083
#if CONFIG_DWT
3084
    ff_dsputil_init_dwt(c);
3085
#endif
3086

    
3087
    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
3088

    
3089
    c->add_bytes= add_bytes_c;
3090
    c->add_bytes_l2= add_bytes_l2_c;
3091
    c->diff_bytes= diff_bytes_c;
3092
    c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
3093
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3094
    c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
3095
    c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
3096
    c->bswap_buf= bswap_buf;
3097
    c->bswap16_buf = bswap16_buf;
3098
#if CONFIG_PNG_DECODER
3099
    c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
3100
#endif
3101

    
3102
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
3103
        c->h263_h_loop_filter= h263_h_loop_filter_c;
3104
        c->h263_v_loop_filter= h263_v_loop_filter_c;
3105
    }
3106

    
3107
    if (CONFIG_VP3_DECODER) {
3108
        c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
3109
        c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
3110
        c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
3111
    }
3112

    
3113
    c->h261_loop_filter= h261_loop_filter_c;
3114

    
3115
    c->try_8x8basis= try_8x8basis_c;
3116
    c->add_8x8basis= add_8x8basis_c;
3117

    
3118
#if CONFIG_VORBIS_DECODER
3119
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
3120
#endif
3121
#if CONFIG_AC3_DECODER
3122
    c->ac3_downmix = ff_ac3_downmix_c;
3123
#endif
3124
    c->vector_fmul = vector_fmul_c;
3125
    c->vector_fmul_reverse = vector_fmul_reverse_c;
3126
    c->vector_fmul_add = vector_fmul_add_c;
3127
    c->vector_fmul_window = vector_fmul_window_c;
3128
    c->vector_clipf = vector_clipf_c;
3129
    c->scalarproduct_int16 = scalarproduct_int16_c;
3130
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
3131
    c->apply_window_int16 = apply_window_int16_c;
3132
    c->scalarproduct_float = scalarproduct_float_c;
3133
    c->butterflies_float = butterflies_float_c;
3134
    c->vector_fmul_scalar = vector_fmul_scalar_c;
3135

    
3136
    c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
3137
    c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
3138

    
3139
    c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
3140
    c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
3141

    
3142
    c->shrink[0]= av_image_copy_plane;
3143
    c->shrink[1]= ff_shrink22;
3144
    c->shrink[2]= ff_shrink44;
3145
    c->shrink[3]= ff_shrink88;
3146

    
3147
    c->prefetch= just_return;
3148

    
3149
    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
3150
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
3151

    
3152
#undef FUNC
3153
#undef FUNCC
3154
#define FUNC(f, depth) f ## _ ## depth
3155
#define FUNCC(f, depth) f ## _ ## depth ## _c
3156

    
3157
#define dspfunc1(PFX, IDX, NUM, depth)\
3158
    c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM        , depth);\
3159
    c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
3160
    c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
3161
    c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
3162

    
3163
#define dspfunc2(PFX, IDX, NUM, depth)\
3164
    c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
3165
    c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
3166
    c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
3167
    c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
3168
    c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
3169
    c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
3170
    c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
3171
    c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
3172
    c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
3173
    c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
3174
    c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
3175
    c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
3176
    c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
3177
    c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
3178
    c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
3179
    c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
3180

    
3181

    
3182
/* Install every bit-depth-dependent C function pointer into the
 * DSPContext for the given depth (8, 9 or 10): edge handling, block
 * clearing, pixel add, H.264 chroma MC, the whole-/half-pel copy and
 * average tables (via dspfunc1) and the H.264 quarter-pel tables
 * (via dspfunc2). Invoked once from the init code below, after which
 * the per-architecture init functions may override entries. */
#define BIT_DEPTH_FUNCS(depth)\
    c->draw_edges                    = FUNCC(draw_edges            , depth);\
    c->emulated_edge_mc              = FUNC (ff_emulated_edge_mc   , depth);\
    c->clear_block                   = FUNCC(clear_block           , depth);\
    c->clear_blocks                  = FUNCC(clear_blocks          , depth);\
    c->add_pixels8                   = FUNCC(add_pixels8           , depth);\
    c->add_pixels4                   = FUNCC(add_pixels4           , depth);\
    c->put_no_rnd_pixels_l2[0]       = FUNCC(put_no_rnd_pixels16_l2, depth);\
    c->put_no_rnd_pixels_l2[1]       = FUNCC(put_no_rnd_pixels8_l2 , depth);\
\
    c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8   , depth);\
    c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4   , depth);\
    c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2   , depth);\
    c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8   , depth);\
    c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4   , depth);\
    c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2   , depth);\
\
    dspfunc1(put       , 0, 16, depth);\
    dspfunc1(put       , 1,  8, depth);\
    dspfunc1(put       , 2,  4, depth);\
    dspfunc1(put       , 3,  2, depth);\
    dspfunc1(put_no_rnd, 0, 16, depth);\
    dspfunc1(put_no_rnd, 1,  8, depth);\
    dspfunc1(avg       , 0, 16, depth);\
    dspfunc1(avg       , 1,  8, depth);\
    dspfunc1(avg       , 2,  4, depth);\
    dspfunc1(avg       , 3,  2, depth);\
    dspfunc1(avg_no_rnd, 0, 16, depth);\
    dspfunc1(avg_no_rnd, 1,  8, depth);\
\
    dspfunc2(put_h264_qpel, 0, 16, depth);\
    dspfunc2(put_h264_qpel, 1,  8, depth);\
    dspfunc2(put_h264_qpel, 2,  4, depth);\
    dspfunc2(put_h264_qpel, 3,  2, depth);\
    dspfunc2(avg_h264_qpel, 0, 16, depth);\
    dspfunc2(avg_h264_qpel, 1,  8, depth);\
    dspfunc2(avg_h264_qpel, 2,  4, depth);

    
3220
    if (avctx->codec_id != CODEC_ID_H264 || avctx->bits_per_raw_sample == 8) {
3221
        BIT_DEPTH_FUNCS(8)
3222
    } else {
3223
        switch (avctx->bits_per_raw_sample) {
3224
            case 9:
3225
                BIT_DEPTH_FUNCS(9)
3226
                break;
3227
            case 10:
3228
                BIT_DEPTH_FUNCS(10)
3229
                break;
3230
            default:
3231
                av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
3232
                BIT_DEPTH_FUNCS(8)
3233
                break;
3234
        }
3235
    }
3236

    
3237

    
3238
    if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
3239
    if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
3240
    if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
3241
    if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
3242
    if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
3243
    if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
3244
    if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
3245
    if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
3246
    if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
3247

    
3248
    for(i=0; i<64; i++){
3249
        if(!c->put_2tap_qpel_pixels_tab[0][i])
3250
            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
3251
        if(!c->avg_2tap_qpel_pixels_tab[0][i])
3252
            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
3253
    }
3254

    
3255
    c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3256
    c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3257
    c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3258
    c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3259

    
3260
    c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3261
    c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3262
    c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3263
    c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3264

    
3265
    switch(c->idct_permutation_type){
3266
    case FF_NO_IDCT_PERM:
3267
        for(i=0; i<64; i++)
3268
            c->idct_permutation[i]= i;
3269
        break;
3270
    case FF_LIBMPEG2_IDCT_PERM:
3271
        for(i=0; i<64; i++)
3272
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3273
        break;
3274
    case FF_SIMPLE_IDCT_PERM:
3275
        for(i=0; i<64; i++)
3276
            c->idct_permutation[i]= simple_mmx_permutation[i];
3277
        break;
3278
    case FF_TRANSPOSE_IDCT_PERM:
3279
        for(i=0; i<64; i++)
3280
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3281
        break;
3282
    case FF_PARTTRANS_IDCT_PERM:
3283
        for(i=0; i<64; i++)
3284
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
3285
        break;
3286
    case FF_SSE2_IDCT_PERM:
3287
        for(i=0; i<64; i++)
3288
            c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
3289
        break;
3290
    default:
3291
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
3292
    }
3293
}
3294