Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 325eefa2

History | View | Annotate | Download (108 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7
 *
8
 * This file is part of Libav.
9
 *
10
 * Libav is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU Lesser General Public
12
 * License as published by the Free Software Foundation; either
13
 * version 2.1 of the License, or (at your option) any later version.
14
 *
15
 * Libav is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * Lesser General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU Lesser General Public
21
 * License along with Libav; if not, write to the Free Software
22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
 */
24

    
25
/**
26
 * @file
27
 * DSP utils
28
 */
29

    
30
#include "libavutil/imgutils.h"
31
#include "avcodec.h"
32
#include "dsputil.h"
33
#include "simple_idct.h"
34
#include "faandct.h"
35
#include "faanidct.h"
36
#include "mathops.h"
37
#include "mpegvideo.h"
38
#include "config.h"
39
#include "ac3dec.h"
40
#include "vorbis.h"
41
#include "png.h"
42

    
43
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
44
uint32_t ff_squareTbl[512] = {0, };
45

    
46
#include "dsputil_template.c"
47

    
48
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
49
#define pb_7f (~0UL/255 * 0x7f)
50
#define pb_80 (~0UL/255 * 0x80)
51

    
52
const uint8_t ff_zigzag_direct[64] = {
53
    0,   1,  8, 16,  9,  2,  3, 10,
54
    17, 24, 32, 25, 18, 11,  4,  5,
55
    12, 19, 26, 33, 40, 48, 41, 34,
56
    27, 20, 13,  6,  7, 14, 21, 28,
57
    35, 42, 49, 56, 57, 50, 43, 36,
58
    29, 22, 15, 23, 30, 37, 44, 51,
59
    58, 59, 52, 45, 38, 31, 39, 46,
60
    53, 60, 61, 54, 47, 55, 62, 63
61
};
62

    
63
/* Specific zigzag scan for 248 idct. NOTE that unlike the
64
   specification, we interleave the fields */
65
const uint8_t ff_zigzag248_direct[64] = {
66
     0,  8,  1,  9, 16, 24,  2, 10,
67
    17, 25, 32, 40, 48, 56, 33, 41,
68
    18, 26,  3, 11,  4, 12, 19, 27,
69
    34, 42, 49, 57, 50, 58, 35, 43,
70
    20, 28,  5, 13,  6, 14, 21, 29,
71
    36, 44, 51, 59, 52, 60, 37, 45,
72
    22, 30,  7, 15, 23, 31, 38, 46,
73
    53, 61, 54, 62, 39, 47, 55, 63,
74
};
75

    
76
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
77
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
78

    
79
const uint8_t ff_alternate_horizontal_scan[64] = {
80
    0,  1,   2,  3,  8,  9, 16, 17,
81
    10, 11,  4,  5,  6,  7, 15, 14,
82
    13, 12, 19, 18, 24, 25, 32, 33,
83
    26, 27, 20, 21, 22, 23, 28, 29,
84
    30, 31, 34, 35, 40, 41, 48, 49,
85
    42, 43, 36, 37, 38, 39, 44, 45,
86
    46, 47, 50, 51, 56, 57, 58, 59,
87
    52, 53, 54, 55, 60, 61, 62, 63,
88
};
89

    
90
const uint8_t ff_alternate_vertical_scan[64] = {
91
    0,  8,  16, 24,  1,  9,  2, 10,
92
    17, 25, 32, 40, 48, 56, 57, 49,
93
    41, 33, 26, 18,  3, 11,  4, 12,
94
    19, 27, 34, 42, 50, 58, 35, 43,
95
    51, 59, 20, 28,  5, 13,  6, 14,
96
    21, 29, 36, 44, 52, 60, 37, 45,
97
    53, 61, 22, 30,  7, 15, 23, 31,
98
    38, 46, 54, 62, 39, 47, 55, 63,
99
};
100

    
101
/* Input permutation for the simple_idct_mmx */
102
static const uint8_t simple_mmx_permutation[64]={
103
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
104
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
105
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
106
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
107
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
108
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
109
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
110
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
111
};
112

    
113
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
114

    
115
/**
 * Initialize a ScanTable from a raw scan order.
 *
 * @param permutation   IDCT coefficient permutation to apply (64 entries)
 * @param st            table to fill in
 * @param src_scantable raw (unpermutated) scan order; kept by reference
 */
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int idx;
    int max_seen;

    st->scantable = src_scantable;

    /* apply the IDCT permutation to the raw scan order */
    for (idx = 0; idx < 64; idx++) {
        const int raw = src_scantable[idx];
        st->permutated[idx] = permutation[raw];
#if ARCH_PPC
        st->inverse[raw] = idx;
#endif
    }

    /* raster_end[i] = highest permutated index seen up to position i */
    max_seen = -1;
    for (idx = 0; idx < 64; idx++) {
        if (st->permutated[idx] > max_seen)
            max_seen = st->permutated[idx];
        st->raster_end[idx] = max_seen;
    }
}
138

    
139
/**
 * Sum all pixel values of a 16x16 block.
 *
 * @param pix       top-left pixel of the block
 * @param line_size byte distance between vertically adjacent pixels
 * @return sum of the 256 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
160

    
161
static int pix_norm1_c(uint8_t * pix, int line_size)
162
{
163
    int s, i, j;
164
    uint32_t *sq = ff_squareTbl + 256;
165

    
166
    s = 0;
167
    for (i = 0; i < 16; i++) {
168
        for (j = 0; j < 16; j += 8) {
169
#if 0
170
            s += sq[pix[0]];
171
            s += sq[pix[1]];
172
            s += sq[pix[2]];
173
            s += sq[pix[3]];
174
            s += sq[pix[4]];
175
            s += sq[pix[5]];
176
            s += sq[pix[6]];
177
            s += sq[pix[7]];
178
#else
179
#if LONG_MAX > 2147483647
180
            register uint64_t x=*(uint64_t*)pix;
181
            s += sq[x&0xff];
182
            s += sq[(x>>8)&0xff];
183
            s += sq[(x>>16)&0xff];
184
            s += sq[(x>>24)&0xff];
185
            s += sq[(x>>32)&0xff];
186
            s += sq[(x>>40)&0xff];
187
            s += sq[(x>>48)&0xff];
188
            s += sq[(x>>56)&0xff];
189
#else
190
            register uint32_t x=*(uint32_t*)pix;
191
            s += sq[x&0xff];
192
            s += sq[(x>>8)&0xff];
193
            s += sq[(x>>16)&0xff];
194
            s += sq[(x>>24)&0xff];
195
            x=*(uint32_t*)(pix+4);
196
            s += sq[x&0xff];
197
            s += sq[(x>>8)&0xff];
198
            s += sq[(x>>16)&0xff];
199
            s += sq[(x>>24)&0xff];
200
#endif
201
#endif
202
            pix += 8;
203
        }
204
        pix += line_size - 16;
205
    }
206
    return s;
207
}
208

    
209
/**
 * Byte-swap w 32-bit words from src into dst.
 * Processes eight words per outer iteration, then handles the tail.
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i = 0;

    for (; i + 8 <= w; i += 8) {
        int k;
        for (k = 0; k < 8; k++)
            dst[i + k] = av_bswap32(src[i + k]);
    }
    for (; i < w; i++)
        dst[i] = av_bswap32(src[i]);
}
226

    
227
/** Byte-swap len 16-bit values from src into dst. */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    int i;

    for (i = 0; i < len; i++)
        dst[i] = av_bswap16(src[i]);
}
232

    
233
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
234
{
235
    int s, i;
236
    uint32_t *sq = ff_squareTbl + 256;
237

    
238
    s = 0;
239
    for (i = 0; i < h; i++) {
240
        s += sq[pix1[0] - pix2[0]];
241
        s += sq[pix1[1] - pix2[1]];
242
        s += sq[pix1[2] - pix2[2]];
243
        s += sq[pix1[3] - pix2[3]];
244
        pix1 += line_size;
245
        pix2 += line_size;
246
    }
247
    return s;
248
}
249

    
250
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
251
{
252
    int s, i;
253
    uint32_t *sq = ff_squareTbl + 256;
254

    
255
    s = 0;
256
    for (i = 0; i < h; i++) {
257
        s += sq[pix1[0] - pix2[0]];
258
        s += sq[pix1[1] - pix2[1]];
259
        s += sq[pix1[2] - pix2[2]];
260
        s += sq[pix1[3] - pix2[3]];
261
        s += sq[pix1[4] - pix2[4]];
262
        s += sq[pix1[5] - pix2[5]];
263
        s += sq[pix1[6] - pix2[6]];
264
        s += sq[pix1[7] - pix2[7]];
265
        pix1 += line_size;
266
        pix2 += line_size;
267
    }
268
    return s;
269
}
270

    
271
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
272
{
273
    int s, i;
274
    uint32_t *sq = ff_squareTbl + 256;
275

    
276
    s = 0;
277
    for (i = 0; i < h; i++) {
278
        s += sq[pix1[ 0] - pix2[ 0]];
279
        s += sq[pix1[ 1] - pix2[ 1]];
280
        s += sq[pix1[ 2] - pix2[ 2]];
281
        s += sq[pix1[ 3] - pix2[ 3]];
282
        s += sq[pix1[ 4] - pix2[ 4]];
283
        s += sq[pix1[ 5] - pix2[ 5]];
284
        s += sq[pix1[ 6] - pix2[ 6]];
285
        s += sq[pix1[ 7] - pix2[ 7]];
286
        s += sq[pix1[ 8] - pix2[ 8]];
287
        s += sq[pix1[ 9] - pix2[ 9]];
288
        s += sq[pix1[10] - pix2[10]];
289
        s += sq[pix1[11] - pix2[11]];
290
        s += sq[pix1[12] - pix2[12]];
291
        s += sq[pix1[13] - pix2[13]];
292
        s += sq[pix1[14] - pix2[14]];
293
        s += sq[pix1[15] - pix2[15]];
294

    
295
        pix1 += line_size;
296
        pix2 += line_size;
297
    }
298
    return s;
299
}
300

    
301
/**
 * Copy an 8x8 pixel block into a linear 64-entry DCT coefficient block.
 *
 * @param block     destination, written row-major, 8 coefficients per row
 * @param pixels    top-left source pixel
 * @param line_size byte stride of the source
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block += 8;
    }
}
319

    
320
/**
 * Store the per-pixel difference of two 8x8 blocks (s1 - s2) into a
 * linear 64-entry DCT coefficient block.
 *
 * @param block  destination, row-major, 8 coefficients per row
 * @param s1     minuend block
 * @param s2     subtrahend block
 * @param stride byte stride of both source blocks
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
339

    
340

    
341
void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
342
                             int line_size)
343
{
344
    int i;
345
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
346

    
347
    /* read the pixels */
348
    for(i=0;i<8;i++) {
349
        pixels[0] = cm[block[0]];
350
        pixels[1] = cm[block[1]];
351
        pixels[2] = cm[block[2]];
352
        pixels[3] = cm[block[3]];
353
        pixels[4] = cm[block[4]];
354
        pixels[5] = cm[block[5]];
355
        pixels[6] = cm[block[6]];
356
        pixels[7] = cm[block[7]];
357

    
358
        pixels += line_size;
359
        block += 8;
360
    }
361
}
362

    
363
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
364
                                 int line_size)
365
{
366
    int i;
367
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
368

    
369
    /* read the pixels */
370
    for(i=0;i<4;i++) {
371
        pixels[0] = cm[block[0]];
372
        pixels[1] = cm[block[1]];
373
        pixels[2] = cm[block[2]];
374
        pixels[3] = cm[block[3]];
375

    
376
        pixels += line_size;
377
        block += 8;
378
    }
379
}
380

    
381
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
382
                                 int line_size)
383
{
384
    int i;
385
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
386

    
387
    /* read the pixels */
388
    for(i=0;i<2;i++) {
389
        pixels[0] = cm[block[0]];
390
        pixels[1] = cm[block[1]];
391

    
392
        pixels += line_size;
393
        block += 8;
394
    }
395
}
396

    
397
/**
 * Write an 8x8 block of signed coefficients to pixels, shifting by
 * +128 and clamping the result to [0,255].
 *
 * @param block     source coefficients, row-major, 8 per row
 * @param pixels    top-left destination pixel
 * @param line_size byte stride of the destination
 */
void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
                                    uint8_t *restrict pixels,
                                    int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            const int v = block[row * 8 + col] + 128;
            pixels[col] = v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
        }
        pixels += line_size;
    }
}
417

    
418
/**
 * Write an 8x8 coefficient block to pixels without any clamping;
 * values are truncated by the implicit conversion to uint8_t.
 *
 * @param block     source coefficients, row-major, 8 per row
 * @param pixels    top-left destination pixel
 * @param line_size byte stride of the destination
 */
static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                    int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] = block[col];
        pixels += line_size;
        block += 8;
    }
}
438

    
439
void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
440
                             int line_size)
441
{
442
    int i;
443
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
444

    
445
    /* read the pixels */
446
    for(i=0;i<8;i++) {
447
        pixels[0] = cm[pixels[0] + block[0]];
448
        pixels[1] = cm[pixels[1] + block[1]];
449
        pixels[2] = cm[pixels[2] + block[2]];
450
        pixels[3] = cm[pixels[3] + block[3]];
451
        pixels[4] = cm[pixels[4] + block[4]];
452
        pixels[5] = cm[pixels[5] + block[5]];
453
        pixels[6] = cm[pixels[6] + block[6]];
454
        pixels[7] = cm[pixels[7] + block[7]];
455
        pixels += line_size;
456
        block += 8;
457
    }
458
}
459

    
460
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
461
                          int line_size)
462
{
463
    int i;
464
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
465

    
466
    /* read the pixels */
467
    for(i=0;i<4;i++) {
468
        pixels[0] = cm[pixels[0] + block[0]];
469
        pixels[1] = cm[pixels[1] + block[1]];
470
        pixels[2] = cm[pixels[2] + block[2]];
471
        pixels[3] = cm[pixels[3] + block[3]];
472
        pixels += line_size;
473
        block += 8;
474
    }
475
}
476

    
477
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
478
                          int line_size)
479
{
480
    int i;
481
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
482

    
483
    /* read the pixels */
484
    for(i=0;i<2;i++) {
485
        pixels[0] = cm[pixels[0] + block[0]];
486
        pixels[1] = cm[pixels[1] + block[1]];
487
        pixels += line_size;
488
        block += 8;
489
    }
490
}
491

    
492
/** Return the sum of absolute values of the 64 coefficients in block. */
static int sum_abs_dctelem_c(DCTELEM *block)
{
    int total = 0;
    int i;

    for (i = 0; i < 64; i++)
        total += block[i] < 0 ? -block[i] : block[i];
    return total;
}
499

    
500
/**
 * Fill h rows of 16 bytes each with value.
 *
 * @param block     top-left destination byte
 * @param value     fill byte
 * @param line_size byte stride between rows
 * @param h         number of rows
 */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    while (h-- > 0) {
        memset(block, value, 16);
        block += line_size;
    }
}
509

    
510
/**
 * Fill h rows of 8 bytes each with value.
 *
 * @param block     top-left destination byte
 * @param value     fill byte
 * @param line_size byte stride between rows
 * @param h         number of rows
 */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    while (h-- > 0) {
        memset(block, value, 8);
        block += line_size;
    }
}
519

    
520
/**
 * Upscale an 8x8 block to 16x16 by duplicating every source pixel into a
 * 2x2 square of the destination.
 *
 * @param src      8x8 source block, row-major
 * @param dst      top-left of the 16x16 destination
 * @param linesize byte stride of the destination
 */
static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
{
    int row, col;
    uint16_t *top = (uint16_t *) dst;             /* even output rows */
    uint16_t *bot = (uint16_t *)(dst + linesize); /* odd output rows  */

    for (row = 0; row < 8; row++) {
        /* multiplying by 0x0101 duplicates the byte into both halves of
           the 16-bit store, doubling the pixel horizontally */
        for (col = 0; col < 8; col++)
            top[col] = bot[col] = src[col] * 0x0101;
        src += 8;
        top += linesize;  /* linesize uint16_t elements == two output rows */
        bot += linesize;
    }
}
535

    
536
#define avg2(a,b) ((a+b+1)>>1)
537
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
538

    
539
/**
 * 8-pixel-wide bilinear interpolation with 1/16-pel precision (gmc1).
 *
 * @param dst     destination rows
 * @param src     source rows; src[stride + x + 1] is read, so one extra
 *                row and column must be valid
 * @param stride  byte stride of both src and dst
 * @param h       number of rows
 * @param x16     horizontal fraction in 1/16ths
 * @param y16     vertical fraction in 1/16ths
 * @param rounder value added before the final >>8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int row, x;

    /* A+B+C+D == 256, so the >>8 normalizes the weighted sum */
    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            dst[x] = (A * src[x]          + B * src[x + 1] +
                      C * src[stride + x] + D * src[stride + x + 1] +
                      rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
561

    
562
/**
 * Global motion compensation for one 8-pixel-wide strip.
 *
 * For every output pixel the source position (in 1/(1<<shift)-pel units,
 * carried in 16.16 fixed point) advances by (dxx,dyx) per column and the
 * row start by (dxy,dyy) per row.  Positions outside the picture are
 * clipped, replicating the edge samples.
 *
 * @param dst    destination (indexed as dst[y*stride + x])
 * @param src    source picture
 * @param stride byte stride of both src and dst
 * @param h      number of output rows
 * @param ox,oy  start position, 16.16 fixed point
 * @param dxx,dxy,dyx,dyy per-column/per-row position increments
 * @param shift  sub-pel precision in bits
 * @param r      rounding constant added before the final shift
 * @param width,height picture dimensions in pixels
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y;
    const int s = 1 << shift;

    /* turn the dimensions into inclusive maximum coordinates */
    width--;
    height--;

    for (y = 0; y < h; y++) {
        int x;
        int vx = ox;
        int vy = oy;

        for (x = 0; x < 8; x++) { //XXX FIXME optimize
            int idx;
            int src_x = vx >> 16;
            int src_y = vy >> 16;
            /* fractional part is taken before the integer shift */
            const int frac_x = src_x & (s - 1);
            const int frac_y = src_y & (s - 1);

            src_x >>= shift;
            src_y >>= shift;

            /* the unsigned compare also catches negative coordinates */
            if ((unsigned)src_x < width) {
                if ((unsigned)src_y < height) {
                    /* fully inside: bilinear in both directions */
                    idx = src_x + src_y * stride;
                    dst[y * stride + x] =
                        ((src[idx]              * (s - frac_x)
                        + src[idx + 1]          *      frac_x) * (s - frac_y)
                       + (src[idx + stride]     * (s - frac_x)
                        + src[idx + stride + 1] *      frac_x) *      frac_y
                       + r) >> (shift * 2);
                } else {
                    /* clipped vertically: interpolate horizontally only */
                    idx = src_x + av_clip(src_y, 0, height) * stride;
                    dst[y * stride + x] =
                        ((src[idx]     * (s - frac_x)
                        + src[idx + 1] *      frac_x) * s
                       + r) >> (shift * 2);
                }
            } else {
                if ((unsigned)src_y < height) {
                    /* clipped horizontally: interpolate vertically only */
                    idx = av_clip(src_x, 0, width) + src_y * stride;
                    dst[y * stride + x] =
                        ((src[idx]          * (s - frac_y)
                        + src[idx + stride] *      frac_y) * s
                       + r) >> (shift * 2);
                } else {
                    /* clipped in both directions: nearest edge sample */
                    idx = av_clip(src_x, 0, width) + av_clip(src_y, 0, height) * stride;
                    dst[y * stride + x] = src[idx];
                }
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
619

    
620
/** Copy a width x height block without interpolation (full-pel position). */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c(dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c(dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c(dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
628

    
629
/* thirdpel interpolation: dst = (683*(2*a + b + 1)) >> 11, where a is the
 * current pixel and b its right neighbour (683/2048 approximates 1/3). */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (683 * (2 * src[col] + src[col + 1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
639

    
640
/* thirdpel interpolation: dst = (683*(a + 2*b + 1)) >> 11, where a is the
 * current pixel and b its right neighbour. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (683 * (src[col] + 2 * src[col + 1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
650

    
651
/* thirdpel interpolation: dst = (683*(2*a + c + 1)) >> 11, where a is the
 * current pixel and c the pixel one row below. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (683 * (2 * src[col] + src[col + stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
661

    
662
/* thirdpel interpolation of the 2x2 neighbourhood with weights 4/3/3/2
 * (2731/32768 approximates 1/12). */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (2731 * (4 * src[col] + 3 * src[col + 1] +
                                3 * src[col + stride] + 2 * src[col + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
672

    
673
/* thirdpel interpolation of the 2x2 neighbourhood with weights 3/2/4/3. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (2731 * (3 * src[col] + 2 * src[col + 1] +
                                4 * src[col + stride] + 3 * src[col + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
683

    
684
/* thirdpel interpolation: dst = (683*(a + 2*c + 1)) >> 11, where a is the
 * current pixel and c the pixel one row below. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (683 * (src[col] + 2 * src[col + stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
694

    
695
/* thirdpel interpolation of the 2x2 neighbourhood with weights 3/4/2/3. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (2731 * (3 * src[col] + 4 * src[col + 1] +
                                2 * src[col + stride] + 3 * src[col + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
705

    
706
/* thirdpel interpolation of the 2x2 neighbourhood with weights 2/3/3/4. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (2731 * (2 * src[col] + 3 * src[col + 1] +
                                3 * src[col + stride] + 4 * src[col + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
716

    
717
/** Average a width x height block into dst without interpolation. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c(dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c(dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c(dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
725

    
726
/* thirdpel interpolation averaged into dst: the mc10 put result is rounded
 * together with the existing destination pixel. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int interp = (683 * (2 * src[col] + src[col + 1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
736

    
737
/* thirdpel interpolation averaged into dst: the mc20 put result is rounded
 * together with the existing destination pixel. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int interp = (683 * (src[col] + 2 * src[col + 1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
747

    
748
/* thirdpel interpolation averaged into dst: the mc01 put result is rounded
 * together with the existing destination pixel. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int interp = (683 * (2 * src[col] + src[col + stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
758

    
759
/* thirdpel 2x2 interpolation (weights 4/3/3/2) averaged into dst. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int interp = (2731 * (4 * src[col] + 3 * src[col + 1] +
                                        3 * src[col + stride] + 2 * src[col + stride + 1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
769

    
770
/* thirdpel 2x2 interpolation (weights 3/2/4/3) averaged into dst. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int interp = (2731 * (3 * src[col] + 2 * src[col + 1] +
                                        4 * src[col + stride] + 3 * src[col + stride + 1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
780

    
781
/* thirdpel interpolation averaged into dst: the mc02 put result is rounded
 * together with the existing destination pixel. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int interp = (683 * (src[col] + 2 * src[col + stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
791

    
792
/* thirdpel 2x2 interpolation (weights 3/4/2/3) averaged into dst. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int interp = (2731 * (3 * src[col] + 4 * src[col + 1] +
                                        2 * src[col + stride] + 3 * src[col + stride + 1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
802

    
803
/* thirdpel 2x2 interpolation (weights 2/3/3/4) averaged into dst. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int interp = (2731 * (2 * src[col] + 3 * src[col + 1] +
                                        3 * src[col + stride] + 4 * src[col + stride + 1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
813
#if 0
814
#define TPEL_WIDTH(width)\
815
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
816
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
817
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
818
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
819
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
820
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
821
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
822
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
823
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
824
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
825
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
826
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
827
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
828
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
829
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
830
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
831
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
832
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
833
#endif
834

    
835
#define QPEL_MC(r, OPNAME, RND, OP) \
836
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
837
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
838
    int i;\
839
    for(i=0; i<h; i++)\
840
    {\
841
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
842
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
843
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
844
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
845
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
846
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
847
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
848
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
849
        dst+=dstStride;\
850
        src+=srcStride;\
851
    }\
852
}\
853
\
854
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
855
    const int w=8;\
856
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
857
    int i;\
858
    for(i=0; i<w; i++)\
859
    {\
860
        const int src0= src[0*srcStride];\
861
        const int src1= src[1*srcStride];\
862
        const int src2= src[2*srcStride];\
863
        const int src3= src[3*srcStride];\
864
        const int src4= src[4*srcStride];\
865
        const int src5= src[5*srcStride];\
866
        const int src6= src[6*srcStride];\
867
        const int src7= src[7*srcStride];\
868
        const int src8= src[8*srcStride];\
869
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
870
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
871
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
872
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
873
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
874
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
875
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
876
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
877
        dst++;\
878
        src++;\
879
    }\
880
}\
881
\
882
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
883
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
884
    int i;\
885
    \
886
    for(i=0; i<h; i++)\
887
    {\
888
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
889
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
890
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
891
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
892
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
893
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
894
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
895
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
896
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
897
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
898
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
899
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
900
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
901
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
902
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
903
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
904
        dst+=dstStride;\
905
        src+=srcStride;\
906
    }\
907
}\
908
\
909
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
910
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
911
    int i;\
912
    const int w=16;\
913
    for(i=0; i<w; i++)\
914
    {\
915
        const int src0= src[0*srcStride];\
916
        const int src1= src[1*srcStride];\
917
        const int src2= src[2*srcStride];\
918
        const int src3= src[3*srcStride];\
919
        const int src4= src[4*srcStride];\
920
        const int src5= src[5*srcStride];\
921
        const int src6= src[6*srcStride];\
922
        const int src7= src[7*srcStride];\
923
        const int src8= src[8*srcStride];\
924
        const int src9= src[9*srcStride];\
925
        const int src10= src[10*srcStride];\
926
        const int src11= src[11*srcStride];\
927
        const int src12= src[12*srcStride];\
928
        const int src13= src[13*srcStride];\
929
        const int src14= src[14*srcStride];\
930
        const int src15= src[15*srcStride];\
931
        const int src16= src[16*srcStride];\
932
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
933
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
934
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
935
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
936
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
937
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
938
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
939
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
940
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
941
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
942
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
943
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
944
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
945
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
946
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
947
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
948
        dst++;\
949
        src++;\
950
    }\
951
}\
952
\
953
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
954
    uint8_t half[64];\
955
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
956
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
957
}\
958
\
959
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
960
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
961
}\
962
\
963
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
964
    uint8_t half[64];\
965
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
966
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
967
}\
968
\
969
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
970
    uint8_t full[16*9];\
971
    uint8_t half[64];\
972
    copy_block9(full, src, 16, stride, 9);\
973
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
974
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
975
}\
976
\
977
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
978
    uint8_t full[16*9];\
979
    copy_block9(full, src, 16, stride, 9);\
980
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
981
}\
982
\
983
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
984
    uint8_t full[16*9];\
985
    uint8_t half[64];\
986
    copy_block9(full, src, 16, stride, 9);\
987
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
988
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
989
}\
990
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
991
    uint8_t full[16*9];\
992
    uint8_t halfH[72];\
993
    uint8_t halfV[64];\
994
    uint8_t halfHV[64];\
995
    copy_block9(full, src, 16, stride, 9);\
996
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
997
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
998
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
999
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1000
}\
1001
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1002
    uint8_t full[16*9];\
1003
    uint8_t halfH[72];\
1004
    uint8_t halfHV[64];\
1005
    copy_block9(full, src, 16, stride, 9);\
1006
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1007
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1008
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1009
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1010
}\
1011
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1012
    uint8_t full[16*9];\
1013
    uint8_t halfH[72];\
1014
    uint8_t halfV[64];\
1015
    uint8_t halfHV[64];\
1016
    copy_block9(full, src, 16, stride, 9);\
1017
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1018
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1019
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1020
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1021
}\
1022
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1023
    uint8_t full[16*9];\
1024
    uint8_t halfH[72];\
1025
    uint8_t halfHV[64];\
1026
    copy_block9(full, src, 16, stride, 9);\
1027
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1028
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1029
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1030
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1031
}\
1032
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1033
    uint8_t full[16*9];\
1034
    uint8_t halfH[72];\
1035
    uint8_t halfV[64];\
1036
    uint8_t halfHV[64];\
1037
    copy_block9(full, src, 16, stride, 9);\
1038
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1039
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1040
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1041
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1042
}\
1043
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1044
    uint8_t full[16*9];\
1045
    uint8_t halfH[72];\
1046
    uint8_t halfHV[64];\
1047
    copy_block9(full, src, 16, stride, 9);\
1048
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1049
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1050
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1051
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1052
}\
1053
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1054
    uint8_t full[16*9];\
1055
    uint8_t halfH[72];\
1056
    uint8_t halfV[64];\
1057
    uint8_t halfHV[64];\
1058
    copy_block9(full, src, 16, stride, 9);\
1059
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1060
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1061
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1062
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1063
}\
1064
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1065
    uint8_t full[16*9];\
1066
    uint8_t halfH[72];\
1067
    uint8_t halfHV[64];\
1068
    copy_block9(full, src, 16, stride, 9);\
1069
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1070
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1071
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1072
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1073
}\
1074
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1075
    uint8_t halfH[72];\
1076
    uint8_t halfHV[64];\
1077
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1078
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1079
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1080
}\
1081
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1082
    uint8_t halfH[72];\
1083
    uint8_t halfHV[64];\
1084
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1085
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1086
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1087
}\
1088
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1089
    uint8_t full[16*9];\
1090
    uint8_t halfH[72];\
1091
    uint8_t halfV[64];\
1092
    uint8_t halfHV[64];\
1093
    copy_block9(full, src, 16, stride, 9);\
1094
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1095
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1096
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1097
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1098
}\
1099
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1100
    uint8_t full[16*9];\
1101
    uint8_t halfH[72];\
1102
    copy_block9(full, src, 16, stride, 9);\
1103
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1104
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1105
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1106
}\
1107
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1108
    uint8_t full[16*9];\
1109
    uint8_t halfH[72];\
1110
    uint8_t halfV[64];\
1111
    uint8_t halfHV[64];\
1112
    copy_block9(full, src, 16, stride, 9);\
1113
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1114
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1115
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1116
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1117
}\
1118
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1119
    uint8_t full[16*9];\
1120
    uint8_t halfH[72];\
1121
    copy_block9(full, src, 16, stride, 9);\
1122
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1123
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1124
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1125
}\
1126
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1127
    uint8_t halfH[72];\
1128
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1129
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1130
}\
1131
\
1132
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1133
    uint8_t half[256];\
1134
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1135
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1136
}\
1137
\
1138
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1139
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1140
}\
1141
\
1142
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1143
    uint8_t half[256];\
1144
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1145
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1146
}\
1147
\
1148
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1149
    uint8_t full[24*17];\
1150
    uint8_t half[256];\
1151
    copy_block17(full, src, 24, stride, 17);\
1152
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1153
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1154
}\
1155
\
1156
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1157
    uint8_t full[24*17];\
1158
    copy_block17(full, src, 24, stride, 17);\
1159
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1160
}\
1161
\
1162
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1163
    uint8_t full[24*17];\
1164
    uint8_t half[256];\
1165
    copy_block17(full, src, 24, stride, 17);\
1166
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1167
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1168
}\
1169
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1170
    uint8_t full[24*17];\
1171
    uint8_t halfH[272];\
1172
    uint8_t halfV[256];\
1173
    uint8_t halfHV[256];\
1174
    copy_block17(full, src, 24, stride, 17);\
1175
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1176
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1177
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1178
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1179
}\
1180
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1181
    uint8_t full[24*17];\
1182
    uint8_t halfH[272];\
1183
    uint8_t halfHV[256];\
1184
    copy_block17(full, src, 24, stride, 17);\
1185
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1186
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1187
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1188
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1189
}\
1190
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1191
    uint8_t full[24*17];\
1192
    uint8_t halfH[272];\
1193
    uint8_t halfV[256];\
1194
    uint8_t halfHV[256];\
1195
    copy_block17(full, src, 24, stride, 17);\
1196
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1197
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1198
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1199
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1200
}\
1201
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1202
    uint8_t full[24*17];\
1203
    uint8_t halfH[272];\
1204
    uint8_t halfHV[256];\
1205
    copy_block17(full, src, 24, stride, 17);\
1206
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1207
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1208
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1209
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1210
}\
1211
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1212
    uint8_t full[24*17];\
1213
    uint8_t halfH[272];\
1214
    uint8_t halfV[256];\
1215
    uint8_t halfHV[256];\
1216
    copy_block17(full, src, 24, stride, 17);\
1217
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1218
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1219
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1220
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1221
}\
1222
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1223
    uint8_t full[24*17];\
1224
    uint8_t halfH[272];\
1225
    uint8_t halfHV[256];\
1226
    copy_block17(full, src, 24, stride, 17);\
1227
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1228
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1229
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1230
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1231
}\
1232
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1233
    uint8_t full[24*17];\
1234
    uint8_t halfH[272];\
1235
    uint8_t halfV[256];\
1236
    uint8_t halfHV[256];\
1237
    copy_block17(full, src, 24, stride, 17);\
1238
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1239
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1240
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1241
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1242
}\
1243
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1244
    uint8_t full[24*17];\
1245
    uint8_t halfH[272];\
1246
    uint8_t halfHV[256];\
1247
    copy_block17(full, src, 24, stride, 17);\
1248
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1249
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1250
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1251
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1252
}\
1253
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1254
    uint8_t halfH[272];\
1255
    uint8_t halfHV[256];\
1256
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1257
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1258
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1259
}\
1260
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1261
    uint8_t halfH[272];\
1262
    uint8_t halfHV[256];\
1263
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1264
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1265
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1266
}\
1267
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1268
    uint8_t full[24*17];\
1269
    uint8_t halfH[272];\
1270
    uint8_t halfV[256];\
1271
    uint8_t halfHV[256];\
1272
    copy_block17(full, src, 24, stride, 17);\
1273
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1274
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1275
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1276
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1277
}\
1278
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1279
    uint8_t full[24*17];\
1280
    uint8_t halfH[272];\
1281
    copy_block17(full, src, 24, stride, 17);\
1282
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1283
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1284
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1285
}\
1286
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1287
    uint8_t full[24*17];\
1288
    uint8_t halfH[272];\
1289
    uint8_t halfV[256];\
1290
    uint8_t halfHV[256];\
1291
    copy_block17(full, src, 24, stride, 17);\
1292
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1293
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1294
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1295
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1296
}\
1297
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1298
    uint8_t full[24*17];\
1299
    uint8_t halfH[272];\
1300
    copy_block17(full, src, 24, stride, 17);\
1301
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1302
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1303
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1304
}\
1305
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1306
    uint8_t halfH[272];\
1307
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1308
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1309
}
1310

    
1311
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1312
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1313
#define op_put(a, b) a = cm[((b) + 16)>>5]
1314
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1315

    
1316
QPEL_MC(0, put_       , _       , op_put)
1317
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1318
QPEL_MC(0, avg_       , _       , op_avg)
1319
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
1320
#undef op_avg
1321
#undef op_avg_no_rnd
1322
#undef op_put
1323
#undef op_put_no_rnd
1324

    
1325
#define put_qpel8_mc00_c  ff_put_pixels8x8_c
1326
#define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
1327
#define put_qpel16_mc00_c ff_put_pixels16x16_c
1328
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1329
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
1330
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
1331

    
1332
/**
 * WMV2 4-tap (-1, 9, 9, -1)/16 half-pel filter applied horizontally to h rows
 * of 8 pixels; results are clipped to 0..255 through the crop table.
 * Reads src[-1]..src[8], so the caller must provide one pixel of left margin.
 */
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<h; i++){
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }
}

#if CONFIG_RV40_DECODER
/* RV40 (3,3) luma MC is defined as the plain 2x2 bilinear average, so these
 * simply forward to the generic xy2 half-pel helpers. */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */

/**
 * WMV2 4-tap (-1, 9, 9, -1)/16 half-pel filter applied vertically over w
 * columns, 8 output rows each. Reads src[-srcStride]..src[9*srcStride], so
 * the caller must provide one row of top margin.
 */
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<w; i++){
        const int src_1= src[ -srcStride];
        const int src0 = src[0          ];
        const int src1 = src[  srcStride];
        const int src2 = src[2*srcStride];
        const int src3 = src[3*srcStride];
        const int src4 = src[4*srcStride];
        const int src5 = src[5*srcStride];
        const int src6 = src[6*srcStride];
        const int src7 = src[7*srcStride];
        const int src8 = src[8*srcStride];
        const int src9 = src[9*srcStride];
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
        src++;
        dst++;
    }
}

/* WMV2 8x8 half/quarter-pel MC wrappers built from the mspel lowpass filters
 * above; naming follows the qpel mcXY convention (X = horizontal phase,
 * Y = vertical phase). */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}

/**
 * H.263 Annex J deblocking filter across a horizontal block edge: filters the
 * 8 pixel columns around the edge at *src, adjusting two pixels on each side.
 * Filter strength is looked up from the quantizer. Compiled away when no
 * H.263 codec is configured.
 */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear "ramp" response: small gradients pass through,
         * large ones (real edges) are left untouched */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* branchless clip of p1/p2 to 0..255 */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
    }
}

/**
 * H.263 Annex J deblocking filter across a vertical block edge: filters the
 * 8 pixel rows around the edge at *src, adjusting two pixels on each side.
 * Transpose of h263_v_loop_filter_c. Compiled away when no H.263 codec is
 * configured.
 */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear "ramp" response: small gradients pass through,
         * large ones (real edges) are left untouched */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* branchless clip of p1/p2 to 0..255 */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
    }
}

/**
 * H.261 in-loop filter: separable (1,2,1)/4 smoothing applied to an 8x8
 * block in place. Rows are first filtered vertically into temp[] (edge rows
 * are passed through scaled by 4 to keep a common >>2 / >>4 normalization),
 * then horizontally back into src with rounding.
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    /* vertical (1,2,1) pass; top/bottom rows copied with the same 4x gain */
    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    /* horizontal (1,2,1) pass with rounding; left/right columns pass through */
    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}

/**
 * Sum of absolute differences over a 16-pixel-wide block (SAD).
 * @param v unused context pointer (me_cmp_func signature)
 * @param h number of rows to compare
 * @return sum of |pix1[i] - pix2[i]| over 16*h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        /* manually unrolled across the 16 columns of one row */
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

/**
 * SAD of a 16-wide block against a half-pel horizontally interpolated
 * reference (average of each pixel and its right neighbour).
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

/**
 * SAD of a 16-wide block against a half-pel vertically interpolated
 * reference (average of each pixel and the one below it).
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size; /* row below the reference row */

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

/**
 * SAD of a 16-wide block against a half-pel diagonally interpolated
 * reference (4-tap average of the 2x2 pixel neighbourhood).
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size; /* row below the reference row */

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

/**
 * Sum of absolute differences over an 8-pixel-wide block (SAD).
 * @param v unused context pointer (me_cmp_func signature)
 * @param h number of rows to compare
 * @return sum of |pix1[i] - pix2[i]| over 8*h pixels
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        /* manually unrolled across the 8 columns of one row */
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

/**
 * SAD of an 8-wide block against a half-pel horizontally interpolated
 * reference (average of each pixel and its right neighbour).
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

/**
 * SAD of an 8-wide block against a half-pel vertically interpolated
 * reference (average of each pixel and the one below it).
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size; /* row below the reference row */

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

/**
 * SAD of an 8-wide block against a half-pel diagonally interpolated
 * reference (4-tap average of the 2x2 pixel neighbourhood).
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size; /* row below the reference row */

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1740
    MpegEncContext *c = v;
1741
    int score1=0;
1742
    int score2=0;
1743
    int x,y;
1744

    
1745
    for(y=0; y<h; y++){
1746
        for(x=0; x<16; x++){
1747
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1748
        }
1749
        if(y+1<h){
1750
            for(x=0; x<15; x++){
1751
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
1752
                             - s1[x+1] + s1[x+1+stride])
1753
                        -FFABS(  s2[x  ] - s2[x  +stride]
1754
                             - s2[x+1] + s2[x+1+stride]);
1755
            }
1756
        }
1757
        s1+= stride;
1758
        s2+= stride;
1759
    }
1760

    
1761
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1762
    else  return score1 + FFABS(score2)*8;
1763
}
1764

    
1765
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1766
    MpegEncContext *c = v;
1767
    int score1=0;
1768
    int score2=0;
1769
    int x,y;
1770

    
1771
    for(y=0; y<h; y++){
1772
        for(x=0; x<8; x++){
1773
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1774
        }
1775
        if(y+1<h){
1776
            for(x=0; x<7; x++){
1777
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
1778
                             - s1[x+1] + s1[x+1+stride])
1779
                        -FFABS(  s2[x  ] - s2[x  +stride]
1780
                             - s2[x+1] + s2[x+1+stride]);
1781
            }
1782
        }
1783
        s1+= stride;
1784
        s2+= stride;
1785
    }
1786

    
1787
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1788
    else  return score1 + FFABS(score2)*8;
1789
}
1790

    
1791
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1792
    int i;
1793
    unsigned int sum=0;
1794

    
1795
    for(i=0; i<8*8; i++){
1796
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1797
        int w= weight[i];
1798
        b>>= RECON_SHIFT;
1799
        assert(-512<b && b<512);
1800

    
1801
        sum += (w*b)*(w*b)>>4;
1802
    }
1803
    return sum>>2;
1804
}
1805

    
1806
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1807
    int i;
1808

    
1809
    for(i=0; i<8*8; i++){
1810
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1811
    }
1812
}
1813

    
1814
/**
1815
 * permutes an 8x8 block.
1816
 * @param block the block which will be permuted according to the given permutation vector
1817
 * @param permutation the permutation vector
1818
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1819
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1820
 *                  (inverse) permutated to scantable order!
1821
 */
1822
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1823
{
1824
    int i;
1825
    DCTELEM temp[64];
1826

    
1827
    if(last<=0) return;
1828
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
1829

    
1830
    for(i=0; i<=last; i++){
1831
        const int j= scantable[i];
1832
        temp[j]= block[j];
1833
        block[j]=0;
1834
    }
1835

    
1836
    for(i=0; i<=last; i++){
1837
        const int j= scantable[i];
1838
        const int perm_j= permutation[j];
1839
        block[perm_j]= temp[j];
1840
    }
1841
}
1842

    
1843
/* Dummy comparison function for FF_CMP_ZERO: every candidate scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}

/**
 * Fill the 6-entry comparison-function array cmp from the DSPContext,
 * selected by the FF_CMP_* id in the low byte of type.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_DWT
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}

static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1908
    long i;
1909
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1910
        long a = *(long*)(src+i);
1911
        long b = *(long*)(dst+i);
1912
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1913
    }
1914
    for(; i<w; i++)
1915
        dst[i+0] += src[i+0];
1916
}
1917

    
1918
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1919
    long i;
1920
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1921
        long a = *(long*)(src1+i);
1922
        long b = *(long*)(src2+i);
1923
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1924
    }
1925
    for(; i<w; i++)
1926
        dst[i] = src1[i]+src2[i];
1927
}
1928

    
1929
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1930
    long i;
1931
#if !HAVE_FAST_UNALIGNED
1932
    if((long)src2 & (sizeof(long)-1)){
1933
        for(i=0; i+7<w; i+=8){
1934
            dst[i+0] = src1[i+0]-src2[i+0];
1935
            dst[i+1] = src1[i+1]-src2[i+1];
1936
            dst[i+2] = src1[i+2]-src2[i+2];
1937
            dst[i+3] = src1[i+3]-src2[i+3];
1938
            dst[i+4] = src1[i+4]-src2[i+4];
1939
            dst[i+5] = src1[i+5]-src2[i+5];
1940
            dst[i+6] = src1[i+6]-src2[i+6];
1941
            dst[i+7] = src1[i+7]-src2[i+7];
1942
        }
1943
    }else
1944
#endif
1945
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1946
        long a = *(long*)(src1+i);
1947
        long b = *(long*)(src2+i);
1948
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1949
    }
1950
    for(; i<w; i++)
1951
        dst[i+0] = src1[i+0]-src2[i+0];
1952
}
1953

    
1954
/**
 * HuffYUV median-prediction decode: reconstruct dst from the stored
 * residuals diff, predicting each byte as the median of left, top and
 * (left + top - top-left). left/left_top carry state across calls.
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
        lt= src1[i]; /* current top becomes next top-left */
        dst[i]= l;
    }

    *left= l;
    *left_top= lt;
}

/**
 * HuffYUV median-prediction encode: store in dst the residual of src2
 * against the median predictor (inverse of add_hfyu_median_prediction_c).
 * src1 is the row above; left/left_top carry state across calls.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i]; /* current top becomes next top-left */
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}

/**
 * HuffYUV left-prediction decode: running byte-wise prefix sum of src
 * into dst, seeded with acc.
 * @return the final accumulator (left value for the next call)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    /* main loop handles two bytes per iteration */
    for(i=0; i<w-1; i++){
        acc+= src[i];
        dst[i]= acc;
        i++;
        acc+= src[i];
        dst[i]= acc;
    }

    /* at most one remaining byte for odd w */
    for(; i<w; i++){
        acc+= src[i];
        dst[i]= acc;
    }

    return acc;
}

/* Per-channel byte offsets within a packed BGR32 pixel, endian-dependent. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * HuffYUV left-prediction decode for packed BGR32: per-channel running
 * sum of src into dst. red/green/blue/alpha carry state across calls.
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int i;
    int r,g,b,a;
    r= *red;
    g= *green;
    b= *blue;
    a= *alpha;

    for(i=0; i<w; i++){
        b+= src[4*i+B];
        g+= src[4*i+G];
        r+= src[4*i+R];
        a+= src[4*i+A];

        dst[4*i+B]= b;
        dst[4*i+G]= g;
        dst[4*i+R]= r;
        dst[4*i+A]= a;
    }

    *red= r;
    *green= g;
    *blue= b;
    *alpha= a;
}
#undef B
#undef G
#undef R
#undef A

/* Butterfly helpers for the Hadamard transforms below. */

/* (o1, o2) = (i1 + i2, i1 - i2) into separate outputs */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* in-place butterfly: (x, y) = (x + y, x - y) */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* |x + y| + |x - y| — absolute butterfly used for the final sum */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))

/**
 * SATD: sum of absolute values of the 8x8 Hadamard transform of the
 * difference between src and dst (rows first, then columns).
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal 8-point Hadamard on each difference row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass; last stage folded into the absolute sum */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}

/**
 * Intra SATD: Hadamard transform of the source block itself, with the
 * DC term subtracted so the score ignores the block mean.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal 8-point Hadamard on each source row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass; last stage folded into the absolute sum */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}

static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2158
    MpegEncContext * const s= (MpegEncContext *)c;
2159
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2160

    
2161
    assert(h==8);
2162

    
2163
    s->dsp.diff_pixels(temp, src1, src2, stride);
2164
    s->dsp.fdct(temp);
2165
    return s->dsp.sum_abs_dctelem(temp);
2166
}
2167

    
2168
#if CONFIG_GPL
/* One 8-point H.264-style integer DCT stage, parameterized by the
   SRC/DST macros defined at each use site. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * FF_CMP_DCT264 metric: sum of absolute H.264 integer-DCT coefficients
 * of the 8x8 difference block (rows transformed in place, columns
 * summed directly).
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif

static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2222
    MpegEncContext * const s= (MpegEncContext *)c;
2223
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2224
    int sum=0, i;
2225

    
2226
    assert(h==8);
2227

    
2228
    s->dsp.diff_pixels(temp, src1, src2, stride);
2229
    s->dsp.fdct(temp);
2230

    
2231
    for(i=0; i<64; i++)
2232
        sum= FFMAX(sum, FFABS(temp[i]));
2233

    
2234
    return sum;
2235
}
2236

    
2237
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2238
    MpegEncContext * const s= (MpegEncContext *)c;
2239
    LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2240
    DCTELEM * const bak = temp+64;
2241
    int sum=0, i;
2242

    
2243
    assert(h==8);
2244
    s->mb_intra=0;
2245

    
2246
    s->dsp.diff_pixels(temp, src1, src2, stride);
2247

    
2248
    memcpy(bak, temp, 64*sizeof(DCTELEM));
2249

    
2250
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2251
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
2252
    ff_simple_idct(temp); //FIXME
2253

    
2254
    for(i=0; i<64; i++)
2255
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2256

    
2257
    return sum;
2258
}
2259

    
2260
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2261
    MpegEncContext * const s= (MpegEncContext *)c;
2262
    const uint8_t *scantable= s->intra_scantable.permutated;
2263
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2264
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2265
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2266
    int i, last, run, bits, level, distortion, start_i;
2267
    const int esc_length= s->ac_esc_length;
2268
    uint8_t * length;
2269
    uint8_t * last_length;
2270

    
2271
    assert(h==8);
2272

    
2273
    copy_block8(lsrc1, src1, 8, stride, 8);
2274
    copy_block8(lsrc2, src2, 8, stride, 8);
2275

    
2276
    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2277

    
2278
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2279

    
2280
    bits=0;
2281

    
2282
    if (s->mb_intra) {
2283
        start_i = 1;
2284
        length     = s->intra_ac_vlc_length;
2285
        last_length= s->intra_ac_vlc_last_length;
2286
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2287
    } else {
2288
        start_i = 0;
2289
        length     = s->inter_ac_vlc_length;
2290
        last_length= s->inter_ac_vlc_last_length;
2291
    }
2292

    
2293
    if(last>=start_i){
2294
        run=0;
2295
        for(i=start_i; i<last; i++){
2296
            int j= scantable[i];
2297
            level= temp[j];
2298

    
2299
            if(level){
2300
                level+=64;
2301
                if((level&(~127)) == 0){
2302
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
2303
                }else
2304
                    bits+= esc_length;
2305
                run=0;
2306
            }else
2307
                run++;
2308
        }
2309
        i= scantable[last];
2310

    
2311
        level= temp[i] + 64;
2312

    
2313
        assert(level - 64);
2314

    
2315
        if((level&(~127)) == 0){
2316
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2317
        }else
2318
            bits+= esc_length;
2319

    
2320
    }
2321

    
2322
    if(last>=0){
2323
        if(s->mb_intra)
2324
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
2325
        else
2326
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
2327
    }
2328

    
2329
    s->dsp.idct_add(lsrc2, 8, temp);
2330

    
2331
    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2332

    
2333
    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2334
}
2335

    
2336
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2337
    MpegEncContext * const s= (MpegEncContext *)c;
2338
    const uint8_t *scantable= s->intra_scantable.permutated;
2339
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2340
    int i, last, run, bits, level, start_i;
2341
    const int esc_length= s->ac_esc_length;
2342
    uint8_t * length;
2343
    uint8_t * last_length;
2344

    
2345
    assert(h==8);
2346

    
2347
    s->dsp.diff_pixels(temp, src1, src2, stride);
2348

    
2349
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2350

    
2351
    bits=0;
2352

    
2353
    if (s->mb_intra) {
2354
        start_i = 1;
2355
        length     = s->intra_ac_vlc_length;
2356
        last_length= s->intra_ac_vlc_last_length;
2357
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2358
    } else {
2359
        start_i = 0;
2360
        length     = s->inter_ac_vlc_length;
2361
        last_length= s->inter_ac_vlc_last_length;
2362
    }
2363

    
2364
    if(last>=start_i){
2365
        run=0;
2366
        for(i=start_i; i<last; i++){
2367
            int j= scantable[i];
2368
            level= temp[j];
2369

    
2370
            if(level){
2371
                level+=64;
2372
                if((level&(~127)) == 0){
2373
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
2374
                }else
2375
                    bits+= esc_length;
2376
                run=0;
2377
            }else
2378
                run++;
2379
        }
2380
        i= scantable[last];
2381

    
2382
        level= temp[i] + 64;
2383

    
2384
        assert(level - 64);
2385

    
2386
        if((level&(~127)) == 0){
2387
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2388
        }else
2389
            bits+= esc_length;
2390
    }
2391

    
2392
    return bits;
2393
}
2394

    
2395
/* Intra vertical SAD: sum of absolute differences between vertically
   adjacent pixels of the source block itself; instantiated for widths
   8 and 16. The context and second-source arguments are unused. */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)

/**
 * FF_CMP_VSAD metric: SAD of the vertical gradient of the difference
 * between two 16-wide blocks.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}

#define SQ(a) ((a)*(a))

/* Vertical sum of squared differences between adjacent rows of a single
 * block (intra variant), generated for widths 8 and 16.  As with VSAD,
 * y starts at 1 so exactly h-1 row pairs are compared. */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
/**
 * Vertical SSE between two 16-pixel-wide blocks (inter variant):
 * sum of squared vertical gradients of the difference block, over
 * h-1 adjacent row pairs.
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
/**
 * Sum of squared differences between an int8 and an int16 vector of
 * the same length; used to measure quantization error.
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int score=0;
    int i;
    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
    return score;
}
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2472
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2473
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2474
#if CONFIG_GPL
2475
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2476
#endif
2477
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2478
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2479
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2480
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2481

    
2482
/** Element-wise product of two float vectors: dst[i] = src0[i] * src1[i]. */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i];
}
/**
 * Multiply src0 by src1 traversed backwards:
 * dst[i] = src0[i] * src1[len-1-i].
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}
/** Fused multiply-add on float vectors: dst[i] = src0[i]*src1[i] + src2[i]. */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i] + src2[i];
}
/**
 * Overlap-add windowing used by MDCT-based audio codecs.
 * Writes 2*len outputs:
 *   dst[len+i] = src0[len+i]*win[len-1-... ] style butterfly, i.e. for
 *   i in [-len,0) and j = -i-1:
 *     dst[len+i] = src0[len+i]*win[len+j] - src1[j]*win[len+i]
 *     dst[len+j] = src0[len+i]*win[len+i] + src1[j]*win[len+j]
 * src0 and win must provide 2*len elements; src1 provides len elements.
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int i,j;
    dst += len;
    win += len;
    src0+= len;
    for(i=-len, j=len-1; i<0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0*wj - s1*wi;
        dst[j] = s0*wi + s1*wj;
    }
}
/** Scale a float vector by a scalar: dst[i] = src[i] * mul. */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}
/**
 * Multiply src by short (2-element) sub-vectors and a scalar:
 * for each pair i, dst[i+k] = src[i+k] * sv[i/2][k] * mul, k in {0,1}.
 * len must be a multiple of 2.
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
    }
}
/**
 * Multiply src by short (4-element) sub-vectors and a scalar:
 * for each quad i, dst[i+k] = src[i+k] * sv[i/4][k] * mul, k in 0..3.
 * len must be a multiple of 4.
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
        dst[i+2] = src[i+2] * sv[0][2] * mul;
        dst[i+3] = src[i+3] * sv[0][3] * mul;
    }
}
/**
 * Expand 2-element sub-vectors scaled by mul:
 * dst[i+k] = sv[i/2][k] * mul, k in {0,1}.  len must be a multiple of 2.
 */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
    }
}
/**
 * Expand 4-element sub-vectors scaled by mul:
 * dst[i+k] = sv[i/4][k] * mul, k in 0..3.  len must be a multiple of 4.
 */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
        dst[i+2] = sv[0][2] * mul;
        dst[i+3] = sv[0][3] * mul;
    }
}
/**
 * In-place butterfly on two float vectors:
 * (v1[i], v2[i]) <- (v1[i] + v2[i], v1[i] - v2[i]).
 * The restrict qualifiers promise the two vectors do not alias.
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int i;
    for (i = 0; i < len; i++) {
        float t = v1[i] - v2[i];
        v1[i] += v2[i];
        v2[i] = t;
    }
}
/** Dot product of two float vectors: sum of v1[i]*v2[i]. */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float p = 0.0;
    int i;

    for (i = 0; i < len; i++)
        p += v1[i] * v2[i];

    return p;
}
/**
 * Clip one float (passed as its IEEE-754 bit pattern) to [*min, *max]
 * using unsigned integer comparisons.  This works only when min < 0 < max:
 * for negative floats the unsigned bit pattern grows as the value becomes
 * more negative, so "a > mini" detects values below the minimum, and
 * flipping the sign bit makes the same trick detect values above the
 * maximum.
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{

    if(a > mini) return mini;
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}

/* Clip a float vector to [*min, *max] where min < 0 < max, eight elements
 * per iteration (len must be a multiple of 8).
 * NOTE(review): the float<->uint32_t reinterpretation via pointer casts
 * technically violates strict aliasing; kept as-is from the original. */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
/**
 * Clip a float vector to [min, max], eight elements per iteration
 * (len must be a multiple of 8).  When min and max straddle zero the
 * branch-free integer-compare path is used; otherwise av_clipf is applied
 * element-wise.
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;
    if(min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for(i=0; i < len; i+=8) {
            dst[i    ] = av_clipf(src[i    ], min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
        }
    }
}
/**
 * Dot product of two int16 vectors with each partial product arithmetically
 * shifted right by 'shift' before accumulation.
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int res = 0;

    while (order--)
        res += (*v1++ * *v2++) >> shift;

    return res;
}
/**
 * Dot product of v1 and v2 combined with an in-place multiply-accumulate:
 * returns sum(v1[i]*v2[i]) computed with the ORIGINAL v1 values, and then
 * updates v1[i] += mul * v3[i] for each element.
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int res = 0;
    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return res;
}
/**
 * Apply a symmetric Q15 window to an int16 signal with round-to-nearest:
 * output[i] and output[len-1-i] are both scaled by window[i].
 * len must be even; only the first len/2 window coefficients are read.
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    int i;
    int len2 = len >> 1;

    for (i = 0; i < len2; i++) {
        int16_t w       = window[i];
        output[i]       = (MUL16(input[i],       w) + (1 << 14)) >> 15;
        output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
    }
}
/* Fixed-point IDCT coefficients: round(2048 * sqrt(2) * cos(i*pi/16)). */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
static void wmv2_idct_row(short * b)
2680
{
2681
    int s1,s2;
2682
    int a0,a1,a2,a3,a4,a5,a6,a7;
2683
    /*step 1*/
2684
    a1 = W1*b[1]+W7*b[7];
2685
    a7 = W7*b[1]-W1*b[7];
2686
    a5 = W5*b[5]+W3*b[3];
2687
    a3 = W3*b[5]-W5*b[3];
2688
    a2 = W2*b[2]+W6*b[6];
2689
    a6 = W6*b[2]-W2*b[6];
2690
    a0 = W0*b[0]+W0*b[4];
2691
    a4 = W0*b[0]-W0*b[4];
2692
    /*step 2*/
2693
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
2694
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
2695
    /*step 3*/
2696
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
2697
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
2698
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
2699
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
2700
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
2701
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
2702
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
2703
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
2704
}
2705
static void wmv2_idct_col(short * b)
2706
{
2707
    int s1,s2;
2708
    int a0,a1,a2,a3,a4,a5,a6,a7;
2709
    /*step 1, with extended precision*/
2710
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2711
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2712
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2713
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2714
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2715
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2716
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
2717
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
2718
    /*step 2*/
2719
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
2720
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
2721
    /*step 3*/
2722
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2723
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
2724
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
2725
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2726

    
2727
    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2728
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
2729
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
2730
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
2731
}
2732
/** Full 8x8 WMV2 inverse DCT: row pass over all 8 rows, then column pass. */
void ff_wmv2_idct_c(short * block){
    int i;

    for(i=0;i<64;i+=8){
        wmv2_idct_row(block+i);
    }
    for(i=0;i<8;i++){
        wmv2_idct_col(block+i);
    }
}
/* XXX: those functions should be suppressed ASAP when all IDCTs are
 converted */
/* WMV2 IDCT, then store the clamped result into dest. */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    ff_put_pixels_clamped_c(block, dest, line_size);
}
/* WMV2 IDCT, then add the clamped result to dest. */
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    ff_add_pixels_clamped_c(block, dest, line_size);
}
/* JPEG reference 8x8 IDCT, then store the clamped result into dest. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    ff_put_pixels_clamped_c(block, dest, line_size);
}
/* JPEG reference 8x8 IDCT, then add the clamped result to dest. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    ff_add_pixels_clamped_c(block, dest, line_size);
}
/* 4x4 lowres variant of the reference IDCT + clamped store. */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/* 4x4 lowres variant of the reference IDCT + clamped add. */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
/* 2x2 lowres variant of the reference IDCT + clamped store. */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/* 2x2 lowres variant of the reference IDCT + clamped add. */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
/* 1x1 lowres "IDCT": only the DC coefficient survives; store it clamped. */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
/* 1x1 lowres "IDCT": add the rounded DC coefficient to dest, clamped. */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
/* No-op stub used as the default prefetch implementation when no
 * architecture-specific version is installed. */
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
/* init static data */
2803
av_cold void dsputil_static_init(void)
2804
{
2805
    int i;
2806

    
2807
    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2808
    for(i=0;i<MAX_NEG_CROP;i++) {
2809
        ff_cropTbl[i] = 0;
2810
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2811
    }
2812

    
2813
    for(i=0;i<512;i++) {
2814
        ff_squareTbl[i] = (i - 256) * (i - 256);
2815
    }
2816

    
2817
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2818
}
2819

    
2820
int ff_check_alignment(void){
2821
    static int did_fail=0;
2822
    DECLARE_ALIGNED(16, int, aligned);
2823

    
2824
    if((intptr_t)&aligned & 15){
2825
        if(!did_fail){
2826
#if HAVE_MMX || HAVE_ALTIVEC
2827
            av_log(NULL, AV_LOG_ERROR,
2828
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2829
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
2830
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2831
                "Do not report crashes to Libav developers.\n");
2832
#endif
2833
            did_fail=1;
2834
        }
2835
        return -1;
2836
    }
2837
    return 0;
2838
}
2839

    
2840
/**
 * Fill a DSPContext with the C reference implementations, honoring the
 * DCT/IDCT algorithm and lowres settings from avctx, then let each
 * architecture-specific init override individual entries.  Finally the
 * IDCT coefficient permutation table is derived from the chosen IDCT.
 */
av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    ff_check_alignment();

#if CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            c->idct_put= ff_h264_lowres_idct_put_c;
            c->idct_add= ff_h264_lowres_idct_add_c;
        }
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
                avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct    = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct    = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
            c->idct_put= ff_faanidct_put;
            c->idct_add= ff_faanidct_add;
            c->idct    = ff_faanidct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
            c->idct_put= ff_ea_idct_put_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
            c->idct     = ff_bink_idct_c;
            c->idct_add = ff_bink_idct_add_c;
            c->idct_put = ff_bink_idct_put_c;
            c->idct_permutation_type = FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= ff_simple_idct_put;
            c->idct_add= ff_simple_idct_add;
            c->idct    = ff_simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }

    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = ff_put_pixels_clamped_c;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
    c->put_pixels_nonclamped = put_pixels_nonclamped_c;
    c->add_pixels_clamped = ff_add_pixels_clamped_c;
    c->add_pixels8 = add_pixels8_c;
    c->add_pixels4 = add_pixels4_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->emulated_edge_mc = ff_emulated_edge_mc;
    c->gmc1 = gmc1_c;
    c->gmc = ff_gmc_c;
    c->clear_block = clear_block_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    c->fill_block_tab[0] = fill_block16_c;
    c->fill_block_tab[1] = fill_block8_c;
    c->scale_block = scale_block_c;

    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(put_h264_qpel, 3, 2);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;

    c->draw_edges = draw_edges_c;

#if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
    ff_mlp_init(c, avctx);
#endif
#if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
    ff_intrax8dsp_init(c,avctx);
#endif
#if CONFIG_RV30_DECODER
    ff_rv30dsp_init(c,avctx);
#endif
#if CONFIG_RV40_DECODER
    ff_rv40dsp_init(c,avctx);
    c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
    c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
    c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
    c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
#endif

    c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    c->hadamard8_diff[5]= hadamard8_intra8x8_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#if CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsad[5]= vsad_intra8_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->vsse[5]= vsse_intra8_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
#if CONFIG_DWT
    ff_dsputil_init_dwt(c);
#endif

    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

    c->add_bytes= add_bytes_c;
    c->add_bytes_l2= add_bytes_l2_c;
    c->diff_bytes= diff_bytes_c;
    c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
    c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
    c->bswap_buf= bswap_buf;
    c->bswap16_buf = bswap16_buf;
#if CONFIG_PNG_DECODER
    c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
#endif

    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    }

    if (CONFIG_VP3_DECODER) {
        c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
        c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
        c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
    }

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

#if CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
#endif
#if CONFIG_AC3_DECODER
    c->ac3_downmix = ff_ac3_downmix_c;
#endif
    c->vector_fmul = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add = vector_fmul_add_c;
    c->vector_fmul_window = vector_fmul_window_c;
    c->vector_clipf = vector_clipf_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
    c->apply_window_int16 = apply_window_int16_c;
    c->scalarproduct_float = scalarproduct_float_c;
    c->butterflies_float = butterflies_float_c;
    c->vector_fmul_scalar = vector_fmul_scalar_c;

    c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
    c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;

    c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
    c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;

    c->shrink[0]= av_image_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

    c->prefetch= just_return;

    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));

    /* Architecture-specific overrides; each may replace any subset of the
     * C implementations installed above. */
    if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
    if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
    if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
    if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
    if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
    if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
    if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
    if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
    if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);

    /* Fall back to the H.264 qpel functions for any 2-tap entry the
     * arch-specific inits left unset. */
    for(i=0; i<64; i++){
        if(!c->put_2tap_qpel_pixels_tab[0][i])
            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
        if(!c->avg_2tap_qpel_pixels_tab[0][i])
            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
    }

    c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
    c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
    c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
    c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];

    c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
    c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
    c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
    c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];

    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    case FF_PARTTRANS_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
        break;
    case FF_SSE2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}