Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ b5f83deb

History | View | Annotate | Download (157 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7
 *
8
 * This file is part of FFmpeg.
9
 *
10
 * FFmpeg is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU Lesser General Public
12
 * License as published by the Free Software Foundation; either
13
 * version 2.1 of the License, or (at your option) any later version.
14
 *
15
 * FFmpeg is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * Lesser General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU Lesser General Public
21
 * License along with FFmpeg; if not, write to the Free Software
22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
 */
24

    
25
/**
26
 * @file
27
 * DSP utils
28
 */
29

    
30
#include "libavcore/imgutils.h"
31
#include "avcodec.h"
32
#include "dsputil.h"
33
#include "simple_idct.h"
34
#include "faandct.h"
35
#include "faanidct.h"
36
#include "mathops.h"
37
#include "mpegvideo.h"
38
#include "config.h"
39
#include "ac3dec.h"
40
#include "vorbis.h"
41
#include "png.h"
42

    
43
/* Clipping table (filled at init time): index with MAX_NEG_CROP offset to clamp to 0..255. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Square table (filled at init time): ff_squareTbl[256 + x] == x*x for x in -256..255. */
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)

/* Standard JPEG/MPEG zigzag scan order for an 8x8 coefficient block. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];

/* Horizontally-biased alternate scan (used for interlaced content). */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

/* Vertically-biased alternate scan (used for interlaced content). */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

/* Row permutation for the SSE2 IDCT (interleaves even/odd rows). */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
112

    
113
/**
 * Initialize a ScanTable from a raw scan order and a CPU-specific
 * coefficient permutation.  Also fills raster_end[i] with the highest
 * permuted index seen among the first i+1 scan positions.
 */
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int max_seen;

    st->scantable = src_scantable;

    /* Apply the CPU permutation to every scan position. */
    for (i = 0; i < 64; i++) {
        const int pos = src_scantable[i];
        st->permutated[i] = permutation[pos];
#if ARCH_PPC
        st->inverse[pos] = i;
#endif
    }

    /* Running maximum of the permuted indices. */
    max_seen = -1;
    for (i = 0; i < 64; i++) {
        if (st->permutated[i] > max_seen)
            max_seen = st->permutated[i];
        st->raster_end[i] = max_seen;
    }
}
136

    
137
/**
 * Sum all 256 samples of a 16x16 block.
 * @param pix       top-left sample of the block
 * @param line_size byte stride between rows
 * @return sum of the 256 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sum += pix[col];
        pix += line_size;
    }
    return sum;
}
158

    
159
/**
 * Sum of squares of all 256 samples of a 16x16 block (the L2 "norm" used
 * by the motion-estimation comparison functions).
 * ff_squareTbl is biased by 256 so it can also be indexed with differences;
 * here only the non-negative half is used.
 * NOTE(review): the fast path reads 8 pixels at a time through a
 * uint32_t/uint64_t cast — this assumes the platform tolerates such
 * unaligned, type-punned loads (long-standing FFmpeg convention).
 */
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            /* straightforward byte-at-a-time reference version */
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            /* 64-bit host: one 8-byte load, then extract each byte */
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            /* 32-bit host: two 4-byte loads */
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
206

    
207
/**
 * Byte-swap w 32-bit words from src into dst (may be the same buffer).
 * The main loop is unrolled eight words per pass; a scalar loop handles
 * the tail.
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i = 0;

    while (i + 8 <= w) {
        dst[i    ] = av_bswap32(src[i    ]);
        dst[i + 1] = av_bswap32(src[i + 1]);
        dst[i + 2] = av_bswap32(src[i + 2]);
        dst[i + 3] = av_bswap32(src[i + 3]);
        dst[i + 4] = av_bswap32(src[i + 4]);
        dst[i + 5] = av_bswap32(src[i + 5]);
        dst[i + 6] = av_bswap32(src[i + 6]);
        dst[i + 7] = av_bswap32(src[i + 7]);
        i += 8;
    }
    while (i < w) {
        dst[i] = av_bswap32(src[i]);
        i++;
    }
}
224

    
225
/**
 * Sum of squared errors over a 4-pixel-wide block of height h.
 * ff_squareTbl is biased by 256 so it accepts differences in -255..255.
 */
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int score = 0;
    int y, x;
    const uint32_t *sq = ff_squareTbl + 256;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            score += sq[pix1[x] - pix2[x]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return score;
}
241

    
242
/**
 * Sum of squared errors over an 8-pixel-wide block of height h.
 * ff_squareTbl is biased by 256 so it accepts differences in -255..255.
 */
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int score = 0;
    int y, x;
    const uint32_t *sq = ff_squareTbl + 256;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            score += sq[pix1[x] - pix2[x]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return score;
}
262

    
263
/**
 * Sum of squared errors over a 16-pixel-wide block of height h.
 * ff_squareTbl is biased by 256 so it accepts differences in -255..255.
 */
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int score = 0;
    int y, x;
    const uint32_t *sq = ff_squareTbl + 256;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            score += sq[pix1[x] - pix2[x]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return score;
}
292

    
293
/* draw the edges of width 'w' of an image of size width, height */
294
//FIXME check that this is ok for mpeg4 interlaced
295
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    /* replicate the first row upward and the last row downward, w times */
    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners: fill each w x w corner with the nearest image corner sample */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}
321

    
322
/**
323
 * Copy a rectangular area of samples to a temporary buffer and replicate the border samples.
324
 * @param buf destination buffer
325
 * @param src source buffer
326
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
327
 * @param block_w width of block
328
 * @param block_h height of block
329
 * @param src_x x coordinate of the top left sample of the block in the source buffer
330
 * @param src_y y coordinate of the top left sample of the block in the source buffer
331
 * @param w width of the source buffer
332
 * @param h height of the source buffer
333
 */
334
void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    /* If the block lies entirely outside the source vertically, move src
       (and src_y) so that exactly one source row overlaps; the replication
       below then extends that row over the whole block. */
    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    /* Same for the horizontal direction. */
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    /* Sub-rectangle of the block that is actually covered by the source. */
    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top: replicate the first valid row upward
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom: replicate the last valid row downward
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    /* left/right: replicate the first/last valid column across the full
       block height (rows already filled by the top/bottom passes). */
    for(y=0; y<block_h; y++){
       //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

       //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
392

    
393
/**
 * Load an 8x8 block of pixels into a DCT coefficient buffer
 * (widening each sample from uint8_t to DCTELEM).
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block  += 8;
    }
}
411

    
412
/**
 * Store the sample-wise difference s1 - s2 of two 8x8 pixel blocks into
 * a DCT coefficient buffer.
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
431

    
432

    
433
/**
 * Store an 8x8 coefficient block as pixels, clamping each value to 0..255
 * via the ff_cropTbl lookup.
 */
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int row, col;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] = cm[block[col]];
        pixels += line_size;
        block  += 8;
    }
}
454

    
455
/**
 * Store a 4x4 coefficient block as clamped pixels.  The coefficient buffer
 * still uses 8-element rows; only the first 4 entries of each are used.
 */
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int row, col;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    for (row = 0; row < 4; row++) {
        for (col = 0; col < 4; col++)
            pixels[col] = cm[block[col]];
        pixels += line_size;
        block  += 8;   /* rows in the coefficient buffer are 8 wide */
    }
}
472

    
473
/**
 * Store a 2x2 coefficient block as clamped pixels.  The coefficient buffer
 * still uses 8-element rows; only the first 2 entries of each are used.
 */
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int row, col;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    for (row = 0; row < 2; row++) {
        for (col = 0; col < 2; col++)
            pixels[col] = cm[block[col]];
        pixels += line_size;
        block  += 8;   /* rows in the coefficient buffer are 8 wide */
    }
}
488

    
489
static void put_signed_pixels_clamped_c(const DCTELEM *block,
490
                                        uint8_t *restrict pixels,
491
                                        int line_size)
492
{
493
    int i, j;
494

    
495
    for (i = 0; i < 8; i++) {
496
        for (j = 0; j < 8; j++) {
497
            if (*block < -128)
498
                *pixels = 0;
499
            else if (*block > 127)
500
                *pixels = 255;
501
            else
502
                *pixels = (uint8_t)(*block + 128);
503
            block++;
504
            pixels++;
505
        }
506
        pixels += (line_size - 8);
507
    }
508
}
509

    
510
/**
 * Store an 8x8 coefficient block as pixels without any clamping
 * (caller guarantees values already fit in a byte).
 */
static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                    int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] = block[col];
        pixels += line_size;
        block  += 8;
    }
}
530

    
531
/**
 * Add an 8x8 coefficient block to existing pixels, clamping each result
 * to 0..255 via the ff_cropTbl lookup.
 */
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int row, col;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] = cm[pixels[col] + block[col]];
        pixels += line_size;
        block  += 8;
    }
}
551

    
552
/**
 * Add a 4x4 coefficient block to existing pixels with clamping.  The
 * coefficient buffer rows are 8 wide; only the first 4 entries are used.
 */
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int row, col;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    for (row = 0; row < 4; row++) {
        for (col = 0; col < 4; col++)
            pixels[col] = cm[pixels[col] + block[col]];
        pixels += line_size;
        block  += 8;   /* rows in the coefficient buffer are 8 wide */
    }
}
568

    
569
/**
 * Add a 2x2 coefficient block to existing pixels with clamping.  The
 * coefficient buffer rows are 8 wide; only the first 2 entries are used.
 */
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int row, col;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    for (row = 0; row < 2; row++) {
        for (col = 0; col < 2; col++)
            pixels[col] = cm[pixels[col] + block[col]];
        pixels += line_size;
        block  += 8;   /* rows in the coefficient buffer are 8 wide */
    }
}
583

    
584
/**
 * Add an 8x8 coefficient block to pixels without clamping
 * (byte arithmetic wraps; caller relies on values staying in range).
 */
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block  += 8;
    }
}
600

    
601
/**
 * Add a 4x4 coefficient block to pixels without clamping.  Unlike the
 * clamped 4x4 variant, this coefficient buffer is packed 4 wide.
 */
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 4; row++) {
        for (col = 0; col < 4; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block  += 4;   /* packed 4-wide rows */
    }
}
613

    
614
/** Sum of absolute values of all 64 coefficients of a block. */
static int sum_abs_dctelem_c(DCTELEM *block)
{
    int total = 0;
    int n;

    for (n = 64; n > 0; n--, block++)
        total += FFABS(block[0]);
    return total;
}
621

    
622
/** Fill h rows of 16 bytes each with a constant value. */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    while (h-- > 0) {
        memset(block, value, 16);
        block += line_size;
    }
}
631

    
632
/** Fill h rows of 8 bytes each with a constant value. */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    while (h-- > 0) {
        memset(block, value, 8);
        block += line_size;
    }
}
641

    
642
/**
 * 2x upscale of an 8x8 block: every source sample is written as a 2x2
 * square of identical bytes in dst (each uint16_t store duplicates the
 * byte via the *0x0101 trick, and each row is written twice).
 */
static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        uint16_t *upper = (uint16_t *)(dst + (2 * row    ) * linesize);
        uint16_t *lower = (uint16_t *)(dst + (2 * row + 1) * linesize);
        for (col = 0; col < 8; col++) {
            const uint16_t doubled = src[row * 8 + col] * 0x0101;
            upper[col] = doubled;
            lower[col] = doubled;
        }
    }
}
657

    
658
#if 0
659

660
#define PIXOP2(OPNAME, OP) \
661
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
662
{\
663
    int i;\
664
    for(i=0; i<h; i++){\
665
        OP(*((uint64_t*)block), AV_RN64(pixels));\
666
        pixels+=line_size;\
667
        block +=line_size;\
668
    }\
669
}\
670
\
671
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
672
{\
673
    int i;\
674
    for(i=0; i<h; i++){\
675
        const uint64_t a= AV_RN64(pixels  );\
676
        const uint64_t b= AV_RN64(pixels+1);\
677
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
678
        pixels+=line_size;\
679
        block +=line_size;\
680
    }\
681
}\
682
\
683
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
684
{\
685
    int i;\
686
    for(i=0; i<h; i++){\
687
        const uint64_t a= AV_RN64(pixels  );\
688
        const uint64_t b= AV_RN64(pixels+1);\
689
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
690
        pixels+=line_size;\
691
        block +=line_size;\
692
    }\
693
}\
694
\
695
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
696
{\
697
    int i;\
698
    for(i=0; i<h; i++){\
699
        const uint64_t a= AV_RN64(pixels          );\
700
        const uint64_t b= AV_RN64(pixels+line_size);\
701
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
702
        pixels+=line_size;\
703
        block +=line_size;\
704
    }\
705
}\
706
\
707
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
708
{\
709
    int i;\
710
    for(i=0; i<h; i++){\
711
        const uint64_t a= AV_RN64(pixels          );\
712
        const uint64_t b= AV_RN64(pixels+line_size);\
713
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
714
        pixels+=line_size;\
715
        block +=line_size;\
716
    }\
717
}\
718
\
719
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
720
{\
721
        int i;\
722
        const uint64_t a= AV_RN64(pixels  );\
723
        const uint64_t b= AV_RN64(pixels+1);\
724
        uint64_t l0=  (a&0x0303030303030303ULL)\
725
                    + (b&0x0303030303030303ULL)\
726
                    + 0x0202020202020202ULL;\
727
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
728
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
729
        uint64_t l1,h1;\
730
\
731
        pixels+=line_size;\
732
        for(i=0; i<h; i+=2){\
733
            uint64_t a= AV_RN64(pixels  );\
734
            uint64_t b= AV_RN64(pixels+1);\
735
            l1=  (a&0x0303030303030303ULL)\
736
               + (b&0x0303030303030303ULL);\
737
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
738
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
739
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
740
            pixels+=line_size;\
741
            block +=line_size;\
742
            a= AV_RN64(pixels  );\
743
            b= AV_RN64(pixels+1);\
744
            l0=  (a&0x0303030303030303ULL)\
745
               + (b&0x0303030303030303ULL)\
746
               + 0x0202020202020202ULL;\
747
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
748
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
749
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
750
            pixels+=line_size;\
751
            block +=line_size;\
752
        }\
753
}\
754
\
755
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
756
{\
757
        int i;\
758
        const uint64_t a= AV_RN64(pixels  );\
759
        const uint64_t b= AV_RN64(pixels+1);\
760
        uint64_t l0=  (a&0x0303030303030303ULL)\
761
                    + (b&0x0303030303030303ULL)\
762
                    + 0x0101010101010101ULL;\
763
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
764
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
765
        uint64_t l1,h1;\
766
\
767
        pixels+=line_size;\
768
        for(i=0; i<h; i+=2){\
769
            uint64_t a= AV_RN64(pixels  );\
770
            uint64_t b= AV_RN64(pixels+1);\
771
            l1=  (a&0x0303030303030303ULL)\
772
               + (b&0x0303030303030303ULL);\
773
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
774
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
775
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
776
            pixels+=line_size;\
777
            block +=line_size;\
778
            a= AV_RN64(pixels  );\
779
            b= AV_RN64(pixels+1);\
780
            l0=  (a&0x0303030303030303ULL)\
781
               + (b&0x0303030303030303ULL)\
782
               + 0x0101010101010101ULL;\
783
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
784
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
785
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
786
            pixels+=line_size;\
787
            block +=line_size;\
788
        }\
789
}\
790
\
791
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
792
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
793
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
794
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
795
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
796
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
797
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
798

799
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
800
#else // 64 bit variant
801

    
802
#define PIXOP2(OPNAME, OP) \
803
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
804
    int i;\
805
    for(i=0; i<h; i++){\
806
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
807
        pixels+=line_size;\
808
        block +=line_size;\
809
    }\
810
}\
811
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
812
    int i;\
813
    for(i=0; i<h; i++){\
814
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
815
        pixels+=line_size;\
816
        block +=line_size;\
817
    }\
818
}\
819
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
820
    int i;\
821
    for(i=0; i<h; i++){\
822
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
823
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
824
        pixels+=line_size;\
825
        block +=line_size;\
826
    }\
827
}\
828
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
829
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
830
}\
831
\
832
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
833
                                                int src_stride1, int src_stride2, int h){\
834
    int i;\
835
    for(i=0; i<h; i++){\
836
        uint32_t a,b;\
837
        a= AV_RN32(&src1[i*src_stride1  ]);\
838
        b= AV_RN32(&src2[i*src_stride2  ]);\
839
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
840
        a= AV_RN32(&src1[i*src_stride1+4]);\
841
        b= AV_RN32(&src2[i*src_stride2+4]);\
842
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
843
    }\
844
}\
845
\
846
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
847
                                                int src_stride1, int src_stride2, int h){\
848
    int i;\
849
    for(i=0; i<h; i++){\
850
        uint32_t a,b;\
851
        a= AV_RN32(&src1[i*src_stride1  ]);\
852
        b= AV_RN32(&src2[i*src_stride2  ]);\
853
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
854
        a= AV_RN32(&src1[i*src_stride1+4]);\
855
        b= AV_RN32(&src2[i*src_stride2+4]);\
856
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
857
    }\
858
}\
859
\
860
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
861
                                                int src_stride1, int src_stride2, int h){\
862
    int i;\
863
    for(i=0; i<h; i++){\
864
        uint32_t a,b;\
865
        a= AV_RN32(&src1[i*src_stride1  ]);\
866
        b= AV_RN32(&src2[i*src_stride2  ]);\
867
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
868
    }\
869
}\
870
\
871
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
872
                                                int src_stride1, int src_stride2, int h){\
873
    int i;\
874
    for(i=0; i<h; i++){\
875
        uint32_t a,b;\
876
        a= AV_RN16(&src1[i*src_stride1  ]);\
877
        b= AV_RN16(&src2[i*src_stride2  ]);\
878
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
879
    }\
880
}\
881
\
882
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
883
                                                int src_stride1, int src_stride2, int h){\
884
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
885
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
886
}\
887
\
888
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
889
                                                int src_stride1, int src_stride2, int h){\
890
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
891
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
892
}\
893
\
894
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
895
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
896
}\
897
\
898
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
899
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
900
}\
901
\
902
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
903
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
904
}\
905
\
906
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
907
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
908
}\
909
\
910
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
911
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
912
    int i;\
913
    for(i=0; i<h; i++){\
914
        uint32_t a, b, c, d, l0, l1, h0, h1;\
915
        a= AV_RN32(&src1[i*src_stride1]);\
916
        b= AV_RN32(&src2[i*src_stride2]);\
917
        c= AV_RN32(&src3[i*src_stride3]);\
918
        d= AV_RN32(&src4[i*src_stride4]);\
919
        l0=  (a&0x03030303UL)\
920
           + (b&0x03030303UL)\
921
           + 0x02020202UL;\
922
        h0= ((a&0xFCFCFCFCUL)>>2)\
923
          + ((b&0xFCFCFCFCUL)>>2);\
924
        l1=  (c&0x03030303UL)\
925
           + (d&0x03030303UL);\
926
        h1= ((c&0xFCFCFCFCUL)>>2)\
927
          + ((d&0xFCFCFCFCUL)>>2);\
928
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
929
        a= AV_RN32(&src1[i*src_stride1+4]);\
930
        b= AV_RN32(&src2[i*src_stride2+4]);\
931
        c= AV_RN32(&src3[i*src_stride3+4]);\
932
        d= AV_RN32(&src4[i*src_stride4+4]);\
933
        l0=  (a&0x03030303UL)\
934
           + (b&0x03030303UL)\
935
           + 0x02020202UL;\
936
        h0= ((a&0xFCFCFCFCUL)>>2)\
937
          + ((b&0xFCFCFCFCUL)>>2);\
938
        l1=  (c&0x03030303UL)\
939
           + (d&0x03030303UL);\
940
        h1= ((c&0xFCFCFCFCUL)>>2)\
941
          + ((d&0xFCFCFCFCUL)>>2);\
942
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
943
    }\
944
}\
945
\
946
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
947
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
948
}\
949
\
950
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
951
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
952
}\
953
\
954
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
955
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
956
}\
957
\
958
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
959
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
960
}\
961
\
962
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
963
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
964
    int i;\
965
    for(i=0; i<h; i++){\
966
        uint32_t a, b, c, d, l0, l1, h0, h1;\
967
        a= AV_RN32(&src1[i*src_stride1]);\
968
        b= AV_RN32(&src2[i*src_stride2]);\
969
        c= AV_RN32(&src3[i*src_stride3]);\
970
        d= AV_RN32(&src4[i*src_stride4]);\
971
        l0=  (a&0x03030303UL)\
972
           + (b&0x03030303UL)\
973
           + 0x01010101UL;\
974
        h0= ((a&0xFCFCFCFCUL)>>2)\
975
          + ((b&0xFCFCFCFCUL)>>2);\
976
        l1=  (c&0x03030303UL)\
977
           + (d&0x03030303UL);\
978
        h1= ((c&0xFCFCFCFCUL)>>2)\
979
          + ((d&0xFCFCFCFCUL)>>2);\
980
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
981
        a= AV_RN32(&src1[i*src_stride1+4]);\
982
        b= AV_RN32(&src2[i*src_stride2+4]);\
983
        c= AV_RN32(&src3[i*src_stride3+4]);\
984
        d= AV_RN32(&src4[i*src_stride4+4]);\
985
        l0=  (a&0x03030303UL)\
986
           + (b&0x03030303UL)\
987
           + 0x01010101UL;\
988
        h0= ((a&0xFCFCFCFCUL)>>2)\
989
          + ((b&0xFCFCFCFCUL)>>2);\
990
        l1=  (c&0x03030303UL)\
991
           + (d&0x03030303UL);\
992
        h1= ((c&0xFCFCFCFCUL)>>2)\
993
          + ((d&0xFCFCFCFCUL)>>2);\
994
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
995
    }\
996
}\
997
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
998
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
999
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1000
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1001
}\
1002
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1003
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1004
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1005
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1006
}\
1007
\
1008
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1009
{\
1010
        int i, a0, b0, a1, b1;\
1011
        a0= pixels[0];\
1012
        b0= pixels[1] + 2;\
1013
        a0 += b0;\
1014
        b0 += pixels[2];\
1015
\
1016
        pixels+=line_size;\
1017
        for(i=0; i<h; i+=2){\
1018
            a1= pixels[0];\
1019
            b1= pixels[1];\
1020
            a1 += b1;\
1021
            b1 += pixels[2];\
1022
\
1023
            block[0]= (a1+a0)>>2; /* FIXME non put */\
1024
            block[1]= (b1+b0)>>2;\
1025
\
1026
            pixels+=line_size;\
1027
            block +=line_size;\
1028
\
1029
            a0= pixels[0];\
1030
            b0= pixels[1] + 2;\
1031
            a0 += b0;\
1032
            b0 += pixels[2];\
1033
\
1034
            block[0]= (a1+a0)>>2;\
1035
            block[1]= (b1+b0)>>2;\
1036
            pixels+=line_size;\
1037
            block +=line_size;\
1038
        }\
1039
}\
1040
\
1041
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1042
{\
1043
        int i;\
1044
        const uint32_t a= AV_RN32(pixels  );\
1045
        const uint32_t b= AV_RN32(pixels+1);\
1046
        uint32_t l0=  (a&0x03030303UL)\
1047
                    + (b&0x03030303UL)\
1048
                    + 0x02020202UL;\
1049
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1050
                   + ((b&0xFCFCFCFCUL)>>2);\
1051
        uint32_t l1,h1;\
1052
\
1053
        pixels+=line_size;\
1054
        for(i=0; i<h; i+=2){\
1055
            uint32_t a= AV_RN32(pixels  );\
1056
            uint32_t b= AV_RN32(pixels+1);\
1057
            l1=  (a&0x03030303UL)\
1058
               + (b&0x03030303UL);\
1059
            h1= ((a&0xFCFCFCFCUL)>>2)\
1060
              + ((b&0xFCFCFCFCUL)>>2);\
1061
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1062
            pixels+=line_size;\
1063
            block +=line_size;\
1064
            a= AV_RN32(pixels  );\
1065
            b= AV_RN32(pixels+1);\
1066
            l0=  (a&0x03030303UL)\
1067
               + (b&0x03030303UL)\
1068
               + 0x02020202UL;\
1069
            h0= ((a&0xFCFCFCFCUL)>>2)\
1070
              + ((b&0xFCFCFCFCUL)>>2);\
1071
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1072
            pixels+=line_size;\
1073
            block +=line_size;\
1074
        }\
1075
}\
1076
\
1077
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1078
{\
1079
    int j;\
1080
    for(j=0; j<2; j++){\
1081
        int i;\
1082
        const uint32_t a= AV_RN32(pixels  );\
1083
        const uint32_t b= AV_RN32(pixels+1);\
1084
        uint32_t l0=  (a&0x03030303UL)\
1085
                    + (b&0x03030303UL)\
1086
                    + 0x02020202UL;\
1087
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1088
                   + ((b&0xFCFCFCFCUL)>>2);\
1089
        uint32_t l1,h1;\
1090
\
1091
        pixels+=line_size;\
1092
        for(i=0; i<h; i+=2){\
1093
            uint32_t a= AV_RN32(pixels  );\
1094
            uint32_t b= AV_RN32(pixels+1);\
1095
            l1=  (a&0x03030303UL)\
1096
               + (b&0x03030303UL);\
1097
            h1= ((a&0xFCFCFCFCUL)>>2)\
1098
              + ((b&0xFCFCFCFCUL)>>2);\
1099
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1100
            pixels+=line_size;\
1101
            block +=line_size;\
1102
            a= AV_RN32(pixels  );\
1103
            b= AV_RN32(pixels+1);\
1104
            l0=  (a&0x03030303UL)\
1105
               + (b&0x03030303UL)\
1106
               + 0x02020202UL;\
1107
            h0= ((a&0xFCFCFCFCUL)>>2)\
1108
              + ((b&0xFCFCFCFCUL)>>2);\
1109
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1110
            pixels+=line_size;\
1111
            block +=line_size;\
1112
        }\
1113
        pixels+=4-line_size*(h+1);\
1114
        block +=4-line_size*h;\
1115
    }\
1116
}\
1117
\
1118
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1119
{\
1120
    int j;\
1121
    for(j=0; j<2; j++){\
1122
        int i;\
1123
        const uint32_t a= AV_RN32(pixels  );\
1124
        const uint32_t b= AV_RN32(pixels+1);\
1125
        uint32_t l0=  (a&0x03030303UL)\
1126
                    + (b&0x03030303UL)\
1127
                    + 0x01010101UL;\
1128
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1129
                   + ((b&0xFCFCFCFCUL)>>2);\
1130
        uint32_t l1,h1;\
1131
\
1132
        pixels+=line_size;\
1133
        for(i=0; i<h; i+=2){\
1134
            uint32_t a= AV_RN32(pixels  );\
1135
            uint32_t b= AV_RN32(pixels+1);\
1136
            l1=  (a&0x03030303UL)\
1137
               + (b&0x03030303UL);\
1138
            h1= ((a&0xFCFCFCFCUL)>>2)\
1139
              + ((b&0xFCFCFCFCUL)>>2);\
1140
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1141
            pixels+=line_size;\
1142
            block +=line_size;\
1143
            a= AV_RN32(pixels  );\
1144
            b= AV_RN32(pixels+1);\
1145
            l0=  (a&0x03030303UL)\
1146
               + (b&0x03030303UL)\
1147
               + 0x01010101UL;\
1148
            h0= ((a&0xFCFCFCFCUL)>>2)\
1149
              + ((b&0xFCFCFCFCUL)>>2);\
1150
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1151
            pixels+=line_size;\
1152
            block +=line_size;\
1153
        }\
1154
        pixels+=4-line_size*(h+1);\
1155
        block +=4-line_size*h;\
1156
    }\
1157
}\
1158
\
1159
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1160
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1161
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1162
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1163
av_unused CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1164
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1165
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1166
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1167

    
1168
#define op_avg(a, b) a = rnd_avg32(a, b)
1169
#endif
1170
#define op_put(a, b) a = b
1171

    
1172
PIXOP2(avg, op_avg)
1173
PIXOP2(put, op_put)
1174
#undef op_avg
1175
#undef op_put
1176

    
1177
#define put_no_rnd_pixels8_c  put_pixels8_c
1178
#define put_no_rnd_pixels16_c put_pixels16_c
1179

    
1180
#define avg2(a,b) ((a+b+1)>>1)
1181
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1182

    
1183
/* Single-stride convenience wrapper around the macro-generated
 * no-rounding 16-wide two-source averager. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1186

    
1187
/* Single-stride convenience wrapper around the macro-generated
 * no-rounding 8-wide two-source averager. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1190

    
1191
/**
 * 1/16-pel bilinear interpolation of an 8xh block (MPEG-4 GMC, one-MV case).
 * x16/y16 are the fractional offsets in 1/16ths of a pel; A..D are the four
 * bilinear weights (they sum to 256, hence the >>8 after adding 'rounder').
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            dst[col] = (A * src[col           ] + B * src[col + 1         ] +
                        C * src[col + stride  ] + D * src[col + stride + 1] +
                        rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
1213

    
1214
/**
 * Affine global motion compensation (MPEG-4 GMC, 2/3 motion-vector case) for
 * an 8-column by h-row block.
 *
 * (ox,oy) is the 16.16 fixed-point source position of the first pixel of each
 * row; (dxx,dyx) step it per column and (dxy,dyy) per row.  'shift' selects
 * the sub-pel precision (s = 1<<shift) and 'r' is the bilinear rounding
 * constant.  Samples outside [0,width)x[0,height) are edge-replicated via
 * av_clip().
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s = 1 << shift;
    int y;

    /* from here on, width/height are the largest valid coordinates */
    width--;
    height--;

    for (y = 0; y < h; y++) {
        int x, vx = ox, vy = oy;

        for (x = 0; x < 8; x++) { //XXX FIXME optimize
            int sx = vx >> 16;
            int sy = vy >> 16;
            /* sub-pel fraction = low 'shift' bits of the 16.16 integer part */
            const int fx = sx & (s - 1);
            const int fy = sy & (s - 1);
            int idx;

            sx >>= shift;
            sy >>= shift;

            if ((unsigned)sx < width && (unsigned)sy < height) {
                /* fully inside the picture: full 2-D bilinear filter */
                idx = sx + sy * stride;
                dst[y*stride + x] = ((  src[idx             ] * (s - fx)
                                      + src[idx + 1         ] *      fx ) * (s - fy)
                                    + (  src[idx + stride    ] * (s - fx)
                                       + src[idx + stride + 1] *      fx ) *      fy
                                    + r) >> (shift * 2);
            } else if ((unsigned)sx < width) {
                /* row out of range: clamp vertically, filter horizontally only */
                idx = sx + av_clip(sy, 0, height) * stride;
                dst[y*stride + x] = ((  src[idx    ] * (s - fx)
                                      + src[idx + 1] *      fx ) * s
                                    + r) >> (shift * 2);
            } else if ((unsigned)sy < height) {
                /* column out of range: clamp horizontally, filter vertically only */
                idx = av_clip(sx, 0, width) + sy * stride;
                dst[y*stride + x] = ((  src[idx         ] * (s - fy)
                                      + src[idx + stride] *      fy ) * s
                                    + r) >> (shift * 2);
            } else {
                /* both out of range: plain edge replication */
                idx = av_clip(sx, 0, width) + av_clip(sy, 0, height) * stride;
                dst[y*stride + x] = src[idx];
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1271

    
1272
/* Whole-pel "put" dispatch by block width (2/4/8/16); other widths do nothing. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if      (width ==  2) put_pixels2_c (dst, src, stride, height);
    else if (width ==  4) put_pixels4_c (dst, src, stride, height);
    else if (width ==  8) put_pixels8_c (dst, src, stride, height);
    else if (width == 16) put_pixels16_c(dst, src, stride, height);
}
1280

    
1281
/* SVQ3 thirdpel "put" interpolators.  683 ~= 2^11/3 and 2731 ~= 2^13/3, so
 * each output is the rounded bilinear average for the given 1/3-pel offset. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++)
        for (x = 0; x < width; x++) {
            const int p = y*stride + x;
            dst[p] = (683*(2*src[p] + src[p+1] + 1)) >> 11;
        }
}

static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++)
        for (x = 0; x < width; x++) {
            const int p = y*stride + x;
            dst[p] = (683*(src[p] + 2*src[p+1] + 1)) >> 11;
        }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++)
        for (x = 0; x < width; x++) {
            const int p = y*stride + x;
            dst[p] = (683*(2*src[p] + src[p+stride] + 1)) >> 11;
        }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++)
        for (x = 0; x < width; x++) {
            const int p = y*stride + x;
            dst[p] = (2731*(4*src[p] + 3*src[p+1] + 3*src[p+stride] + 2*src[p+stride+1] + 6)) >> 15;
        }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++)
        for (x = 0; x < width; x++) {
            const int p = y*stride + x;
            dst[p] = (2731*(3*src[p] + 2*src[p+1] + 4*src[p+stride] + 3*src[p+stride+1] + 6)) >> 15;
        }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++)
        for (x = 0; x < width; x++) {
            const int p = y*stride + x;
            dst[p] = (683*(src[p] + 2*src[p+stride] + 1)) >> 11;
        }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++)
        for (x = 0; x < width; x++) {
            const int p = y*stride + x;
            dst[p] = (2731*(3*src[p] + 4*src[p+1] + 2*src[p+stride] + 3*src[p+stride+1] + 6)) >> 15;
        }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++)
        for (x = 0; x < width; x++) {
            const int p = y*stride + x;
            dst[p] = (2731*(2*src[p] + 3*src[p+1] + 3*src[p+stride] + 4*src[p+stride+1] + 6)) >> 15;
        }
}
1368

    
1369
/* Whole-pel "avg" dispatch by block width (2/4/8/16); other widths do nothing. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if      (width ==  2) avg_pixels2_c (dst, src, stride, height);
    else if (width ==  4) avg_pixels4_c (dst, src, stride, height);
    else if (width ==  8) avg_pixels8_c (dst, src, stride, height);
    else if (width == 16) avg_pixels16_c(dst, src, stride, height);
}
1377

    
1378
/* SVQ3 thirdpel "avg" interpolators: compute the same rounded thirdpel value
 * as the put_ variants, then average it into the existing destination pixel
 * with (dst + val + 1) >> 1. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++)
        for (x = 0; x < width; x++) {
            const int p = y*stride + x;
            dst[p] = (dst[p] + ((683*(2*src[p] + src[p+1] + 1)) >> 11) + 1) >> 1;
        }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++)
        for (x = 0; x < width; x++) {
            const int p = y*stride + x;
            dst[p] = (dst[p] + ((683*(src[p] + 2*src[p+1] + 1)) >> 11) + 1) >> 1;
        }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++)
        for (x = 0; x < width; x++) {
            const int p = y*stride + x;
            dst[p] = (dst[p] + ((683*(2*src[p] + src[p+stride] + 1)) >> 11) + 1) >> 1;
        }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++)
        for (x = 0; x < width; x++) {
            const int p = y*stride + x;
            dst[p] = (dst[p] + ((2731*(4*src[p] + 3*src[p+1] + 3*src[p+stride] + 2*src[p+stride+1] + 6)) >> 15) + 1) >> 1;
        }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++)
        for (x = 0; x < width; x++) {
            const int p = y*stride + x;
            dst[p] = (dst[p] + ((2731*(3*src[p] + 2*src[p+1] + 4*src[p+stride] + 3*src[p+stride+1] + 6)) >> 15) + 1) >> 1;
        }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++)
        for (x = 0; x < width; x++) {
            const int p = y*stride + x;
            dst[p] = (dst[p] + ((683*(src[p] + 2*src[p+stride] + 1)) >> 11) + 1) >> 1;
        }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++)
        for (x = 0; x < width; x++) {
            const int p = y*stride + x;
            dst[p] = (dst[p] + ((2731*(3*src[p] + 4*src[p+1] + 2*src[p+stride] + 3*src[p+stride+1] + 6)) >> 15) + 1) >> 1;
        }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++)
        for (x = 0; x < width; x++) {
            const int p = y*stride + x;
            dst[p] = (dst[p] + ((2731*(2*src[p] + 3*src[p+1] + 3*src[p+stride] + 4*src[p+stride+1] + 6)) >> 15) + 1) >> 1;
        }
}
1465
#if 0 /* disabled generator for fixed-width thirdpel wrappers.
       * NOTE(review): the stray 'void' before each forwarded call would not
       * compile if this block were ever enabled -- fix before re-enabling. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1486

    
1487
/**
 * Generator for the C H.264 chroma MC functions
 * (OPNAME##h264_chroma_mc{2,4,8}_c).  A..D are the four 2-D bilinear weights
 * for the 1/8-pel position (x,y); when D == 0 the filter degenerates to two
 * taps along one axis ('step' selects vertical vs horizontal).  OP is the
 * store/average operation, which also applies the (b+32)>>6 rounding.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(row=0; row<h; row++){\
            for(col=0; col<2; col++)\
                OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(row=0; row<h; row++){\
            for(col=0; col<2; col++)\
                OP(dst[col], (A*src[col] + E*src[step+col]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(row=0; row<h; row++){\
            for(col=0; col<4; col++)\
                OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(row=0; row<h; row++){\
            for(col=0; col<4; col++)\
                OP(dst[col], (A*src[col] + E*src[step+col]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(row=0; row<h; row++){\
            for(col=0; col<8; col++)\
                OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(row=0; row<h; row++){\
            for(col=0; col<8; col++)\
                OP(dst[col], (A*src[col] + E*src[step+col]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}
1587

    
1588
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1589
#define op_put(a, b) a = (((b) + 32)>>6)
1590

    
1591
H264_CHROMA_MC(put_       , op_put)
1592
H264_CHROMA_MC(avg_       , op_avg)
1593
#undef op_avg
1594
#undef op_put
1595

    
1596
/* VC-1 8xh chroma bilinear "put" with VC-1's reduced rounding bias
 * (32-4 instead of 32 before the >>6). */
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(row=0; row<h; row++){
        for(col=0; col<8; col++)
            dst[col] = (A*src[col] + B*src[col+1] +
                        C*src[stride+col] + D*src[stride+col+1] + 32 - 4) >> 6;
        dst += stride;
        src += stride;
    }
}
1619

    
1620
/* VC-1 8xh chroma bilinear "avg": same reduced-rounding filter as the put_
 * variant, then averaged into the destination with avg2(). */
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(row=0; row<h; row++){
        for(col=0; col<8; col++){
            const int t = (A*src[col] + B*src[col+1] +
                           C*src[stride+col] + D*src[stride+col+1] + 32 - 4) >> 6;
            dst[col] = avg2(dst[col], t);
        }
        dst += stride;
        src += stride;
    }
}
1643

    
1644
/* QPEL_MC(r, OPNAME, RND, OP) instantiates the full set of MPEG-4
 * quarter-pel motion-compensation C routines for one output operation:
 *  - OPNAME##mpeg4_qpel{8,16}_{h,v}_lowpass: half-pel lowpass filters with
 *    taps 20/-6/3/-1 (sum 32); the result is rescaled and clipped through
 *    cm[] by OP.  Note the asymmetric last taps near the block edge (e.g.
 *    src[8] reused several times in the 8-wide filter): this mirroring is
 *    deliberate and part of the MPEG-4 Part 2 filter definition.
 *  - OPNAME##qpel{8,16}_mcXY_c: one function per fractional position
 *    (X,Y in quarter pels, (0,0) excluded); they combine lowpass outputs
 *    with the pixelsN_l2 2-way averages.
 *  - ff_##OPNAME##qpel{8,16}_mcXY_old_c: older variants of the diagonal
 *    positions based on 4-way averaging (pixelsN_l4), kept for reference.
 * Parameters: OPNAME prefixes every generated name, RND selects the
 * rounding flavour of the helper calls, OP(a,b) stores one output pixel
 * (put/avg, rounded/non-rounded), r is unused here. */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
/* 8x8 fractional positions: mcXY = (X,Y) in quarter pels. */\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
\
/* 16x16 fractional positions: same structure as the 8x8 set above. */\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
2119

    
2120
/* Per-pixel output operations used to instantiate QPEL_MC above.
 * "b" is the raw filter accumulator (filter taps sum to 32); adding 16
 * (rounding) or 15 (no-rounding) before ">>5" rescales it, and cm[]
 * (the cropping table) clips the result to the 0..255 pixel range.
 * The *avg* variants additionally average with the existing dst pixel. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Instantiate the put, put_no_rnd and avg families of qpel functions. */
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd

/* The (0,0) fractional position needs no filtering: it is a plain block
 * copy (put) or average (avg), so alias it to the pixel-copy helpers. */
#define put_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
#define put_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
2140

    
2141
#if 1
2142
#define H264_LOWPASS(OPNAME, OP, OP2) \
2143
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2144
    const int h=2;\
2145
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2146
    int i;\
2147
    for(i=0; i<h; i++)\
2148
    {\
2149
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2150
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2151
        dst+=dstStride;\
2152
        src+=srcStride;\
2153
    }\
2154
}\
2155
\
2156
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2157
    const int w=2;\
2158
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2159
    int i;\
2160
    for(i=0; i<w; i++)\
2161
    {\
2162
        const int srcB= src[-2*srcStride];\
2163
        const int srcA= src[-1*srcStride];\
2164
        const int src0= src[0 *srcStride];\
2165
        const int src1= src[1 *srcStride];\
2166
        const int src2= src[2 *srcStride];\
2167
        const int src3= src[3 *srcStride];\
2168
        const int src4= src[4 *srcStride];\
2169
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2170
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2171
        dst++;\
2172
        src++;\
2173
    }\
2174
}\
2175
\
2176
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2177
    const int h=2;\
2178
    const int w=2;\
2179
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2180
    int i;\
2181
    src -= 2*srcStride;\
2182
    for(i=0; i<h+5; i++)\
2183
    {\
2184
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2185
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2186
        tmp+=tmpStride;\
2187
        src+=srcStride;\
2188
    }\
2189
    tmp -= tmpStride*(h+5-2);\
2190
    for(i=0; i<w; i++)\
2191
    {\
2192
        const int tmpB= tmp[-2*tmpStride];\
2193
        const int tmpA= tmp[-1*tmpStride];\
2194
        const int tmp0= tmp[0 *tmpStride];\
2195
        const int tmp1= tmp[1 *tmpStride];\
2196
        const int tmp2= tmp[2 *tmpStride];\
2197
        const int tmp3= tmp[3 *tmpStride];\
2198
        const int tmp4= tmp[4 *tmpStride];\
2199
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2200
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2201
        dst++;\
2202
        tmp++;\
2203
    }\
2204
}\
2205
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2206
    const int h=4;\
2207
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2208
    int i;\
2209
    for(i=0; i<h; i++)\
2210
    {\
2211
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2212
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2213
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2214
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2215
        dst+=dstStride;\
2216
        src+=srcStride;\
2217
    }\
2218
}\
2219
\
2220
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2221
    const int w=4;\
2222
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2223
    int i;\
2224
    for(i=0; i<w; i++)\
2225
    {\
2226
        const int srcB= src[-2*srcStride];\
2227
        const int srcA= src[-1*srcStride];\
2228
        const int src0= src[0 *srcStride];\
2229
        const int src1= src[1 *srcStride];\
2230
        const int src2= src[2 *srcStride];\
2231
        const int src3= src[3 *srcStride];\
2232
        const int src4= src[4 *srcStride];\
2233
        const int src5= src[5 *srcStride];\
2234
        const int src6= src[6 *srcStride];\
2235
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2236
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2237
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2238
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2239
        dst++;\
2240
        src++;\
2241
    }\
2242
}\
2243
\
2244
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2245
    const int h=4;\
2246
    const int w=4;\
2247
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2248
    int i;\
2249
    src -= 2*srcStride;\
2250
    for(i=0; i<h+5; i++)\
2251
    {\
2252
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2253
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2254
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2255
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2256
        tmp+=tmpStride;\
2257
        src+=srcStride;\
2258
    }\
2259
    tmp -= tmpStride*(h+5-2);\
2260
    for(i=0; i<w; i++)\
2261
    {\
2262
        const int tmpB= tmp[-2*tmpStride];\
2263
        const int tmpA= tmp[-1*tmpStride];\
2264
        const int tmp0= tmp[0 *tmpStride];\
2265
        const int tmp1= tmp[1 *tmpStride];\
2266
        const int tmp2= tmp[2 *tmpStride];\
2267
        const int tmp3= tmp[3 *tmpStride];\
2268
        const int tmp4= tmp[4 *tmpStride];\
2269
        const int tmp5= tmp[5 *tmpStride];\
2270
        const int tmp6= tmp[6 *tmpStride];\
2271
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2272
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2273
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2274
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2275
        dst++;\
2276
        tmp++;\
2277
    }\
2278
}\
2279
\
2280
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2281
    const int h=8;\
2282
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2283
    int i;\
2284
    for(i=0; i<h; i++)\
2285
    {\
2286
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2287
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2288
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2289
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2290
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2291
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2292
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2293
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2294
        dst+=dstStride;\
2295
        src+=srcStride;\
2296
    }\
2297
}\
2298
\
2299
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2300
    const int w=8;\
2301
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2302
    int i;\
2303
    for(i=0; i<w; i++)\
2304
    {\
2305
        const int srcB= src[-2*srcStride];\
2306
        const int srcA= src[-1*srcStride];\
2307
        const int src0= src[0 *srcStride];\
2308
        const int src1= src[1 *srcStride];\
2309
        const int src2= src[2 *srcStride];\
2310
        const int src3= src[3 *srcStride];\
2311
        const int src4= src[4 *srcStride];\
2312
        const int src5= src[5 *srcStride];\
2313
        const int src6= src[6 *srcStride];\
2314
        const int src7= src[7 *srcStride];\
2315
        const int src8= src[8 *srcStride];\
2316
        const int src9= src[9 *srcStride];\
2317
        const int src10=src[10*srcStride];\
2318
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2319
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2320
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2321
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2322
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2323
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2324
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2325
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2326
        dst++;\
2327
        src++;\
2328
    }\
2329
}\
2330
\
2331
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2332
    const int h=8;\
2333
    const int w=8;\
2334
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2335
    int i;\
2336
    src -= 2*srcStride;\
2337
    for(i=0; i<h+5; i++)\
2338
    {\
2339
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2340
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2341
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2342
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2343
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2344
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2345
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2346
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2347
        tmp+=tmpStride;\
2348
        src+=srcStride;\
2349
    }\
2350
    tmp -= tmpStride*(h+5-2);\
2351
    for(i=0; i<w; i++)\
2352
    {\
2353
        const int tmpB= tmp[-2*tmpStride];\
2354
        const int tmpA= tmp[-1*tmpStride];\
2355
        const int tmp0= tmp[0 *tmpStride];\
2356
        const int tmp1= tmp[1 *tmpStride];\
2357
        const int tmp2= tmp[2 *tmpStride];\
2358
        const int tmp3= tmp[3 *tmpStride];\
2359
        const int tmp4= tmp[4 *tmpStride];\
2360
        const int tmp5= tmp[5 *tmpStride];\
2361
        const int tmp6= tmp[6 *tmpStride];\
2362
        const int tmp7= tmp[7 *tmpStride];\
2363
        const int tmp8= tmp[8 *tmpStride];\
2364
        const int tmp9= tmp[9 *tmpStride];\
2365
        const int tmp10=tmp[10*tmpStride];\
2366
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2367
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2368
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2369
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2370
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2371
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2372
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2373
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2374
        dst++;\
2375
        tmp++;\
2376
    }\
2377
}\
2378
\
2379
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2380
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2381
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2382
    src += 8*srcStride;\
2383
    dst += 8*dstStride;\
2384
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2385
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2386
}\
2387
\
2388
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2389
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2390
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2391
    src += 8*srcStride;\
2392
    dst += 8*dstStride;\
2393
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2394
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2395
}\
2396
\
2397
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2398
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2399
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2400
    src += 8*srcStride;\
2401
    dst += 8*dstStride;\
2402
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2403
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2404
}\
2405

    
2406
/**
 * H264_MC(OPNAME, SIZE) instantiates the 16 scalar quarter-pel motion
 * compensation functions (mc00..mc33) for one block size.  The two digits
 * name the fractional x/y position in quarter-pel units.  Each position is
 * assembled from the *_h/_v/_hv 6-tap lowpass helpers and, for the
 * quarter-pel positions, an averaging *_l2 merge of two prediction sources.
 * copy_block##SIZE is used to gather the (SIZE+5)-row source patch that the
 * vertical filter needs (2 rows above, 3 below).
 */
#define H264_MC(OPNAME, SIZE) \
static av_unused void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}
/* Final rounding/clipping step for the H.264 6-tap filter chains.  cm points
 * into ff_cropTbl, providing the 0..255 clip:
 *   op_put/op_avg   - single filter pass, round with +16 then >>5
 *   op2_put/op2_avg - two cascaded passes (hv), round with +512 then >>10
 * The avg variants average the rounded sample with the existing dst pixel,
 * rounding the average up (+1 before >>1). */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
H264_LOWPASS(put_       , op_put, op2_put)
2550
H264_LOWPASS(avg_       , op_avg, op2_avg)
2551
H264_MC(put_, 2)
2552
H264_MC(put_, 4)
2553
H264_MC(put_, 8)
2554
H264_MC(put_, 16)
2555
H264_MC(avg_, 4)
2556
H264_MC(avg_, 8)
2557
H264_MC(avg_, 16)
2558

    
2559
#undef op_avg
2560
#undef op_put
2561
#undef op2_avg
2562
#undef op2_put
2563
#endif
2564

    
2565
/* The full-pel (0,0) qpel positions are plain block copies/averages, so map
 * them straight to the fixed-size pixel helpers. */
#define put_h264_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_h264_qpel8_mc00_c  ff_avg_pixels8x8_c
#define put_h264_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_h264_qpel16_mc00_c ff_avg_pixels16x16_c
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2571
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2572
    int i;
2573

    
2574
    for(i=0; i<h; i++){
2575
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2576
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2577
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2578
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2579
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2580
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2581
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2582
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2583
        dst+=dstStride;
2584
        src+=srcStride;
2585
    }
2586
}
2587

    
2588
/* Exported fixed-size wrappers: copy (put) or average (avg) an 8x8 or 16x16
 * block from src to dst; both pointers use the same stride. */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#if CONFIG_RV40_DECODER
/* RV40 (3,3) subpel positions are served by a plain 2x2 pixel average, so
 * map them to the *_pixels*_xy2 helpers.  NOTE(review): mapping taken
 * verbatim from the code; RV40-spec semantics not re-verified here. */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2617
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2618
    int i;
2619

    
2620
    for(i=0; i<w; i++){
2621
        const int src_1= src[ -srcStride];
2622
        const int src0 = src[0          ];
2623
        const int src1 = src[  srcStride];
2624
        const int src2 = src[2*srcStride];
2625
        const int src3 = src[3*srcStride];
2626
        const int src4 = src[4*srcStride];
2627
        const int src5 = src[5*srcStride];
2628
        const int src6 = src[6*srcStride];
2629
        const int src7 = src[7*srcStride];
2630
        const int src8 = src[8*srcStride];
2631
        const int src9 = src[9*srcStride];
2632
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2633
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2634
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2635
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2636
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2637
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2638
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2639
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2640
        src++;
2641
        dst++;
2642
    }
2643
}
2644

    
2645
/* WMV2 mspel motion-compensation positions, built from the h/v lowpass
 * filters above.  mcXY naming follows the qpel convention: X = horizontal,
 * Y = vertical subpel position.  put_pixels8_l2() averages two sources. */

/* (1,0): average of source and horizontal half-pel filter output. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

/* (2,0): pure horizontal half-pel filter. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

/* (3,0): average of the pixel to the right and the horizontal filter output. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

/* (0,2): pure vertical half-pel filter. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

/* (1,2): average of vertical filter output and the 2-D (h then v) output.
 * halfH holds 11 filtered rows (one above, two below) so the vertical pass
 * over halfH+8 has its context rows available. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* (3,2): as mc12 but the vertical-only source is shifted one pixel right. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* (2,2): horizontal filter, then vertical filter on the intermediate. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2690
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2691
    int x;
2692
    const int strength= ff_h263_loop_filter_strength[qscale];
2693

    
2694
    for(x=0; x<8; x++){
2695
        int d1, d2, ad1;
2696
        int p0= src[x-2*stride];
2697
        int p1= src[x-1*stride];
2698
        int p2= src[x+0*stride];
2699
        int p3= src[x+1*stride];
2700
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2701

    
2702
        if     (d<-2*strength) d1= 0;
2703
        else if(d<-  strength) d1=-2*strength - d;
2704
        else if(d<   strength) d1= d;
2705
        else if(d< 2*strength) d1= 2*strength - d;
2706
        else                   d1= 0;
2707

    
2708
        p1 += d1;
2709
        p2 -= d1;
2710
        if(p1&256) p1= ~(p1>>31);
2711
        if(p2&256) p2= ~(p2>>31);
2712

    
2713
        src[x-1*stride] = p1;
2714
        src[x+0*stride] = p2;
2715

    
2716
        ad1= FFABS(d1)>>1;
2717

    
2718
        d2= av_clip((p0-p3)/4, -ad1, ad1);
2719

    
2720
        src[x-2*stride] = p0 - d2;
2721
        src[x+  stride] = p3 + d2;
2722
    }
2723
    }
2724
}
2725

    
2726
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2727
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2728
    int y;
2729
    const int strength= ff_h263_loop_filter_strength[qscale];
2730

    
2731
    for(y=0; y<8; y++){
2732
        int d1, d2, ad1;
2733
        int p0= src[y*stride-2];
2734
        int p1= src[y*stride-1];
2735
        int p2= src[y*stride+0];
2736
        int p3= src[y*stride+1];
2737
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2738

    
2739
        if     (d<-2*strength) d1= 0;
2740
        else if(d<-  strength) d1=-2*strength - d;
2741
        else if(d<   strength) d1= d;
2742
        else if(d< 2*strength) d1= 2*strength - d;
2743
        else                   d1= 0;
2744

    
2745
        p1 += d1;
2746
        p2 -= d1;
2747
        if(p1&256) p1= ~(p1>>31);
2748
        if(p2&256) p2= ~(p2>>31);
2749

    
2750
        src[y*stride-1] = p1;
2751
        src[y*stride+0] = p2;
2752

    
2753
        ad1= FFABS(d1)>>1;
2754

    
2755
        d2= av_clip((p0-p3)/4, -ad1, ad1);
2756

    
2757
        src[y*stride-2] = p0 - d2;
2758
        src[y*stride+1] = p3 + d2;
2759
    }
2760
    }
2761
}
2762

    
2763
/**
 * H.261 in-loop filter: separable (1,2,1)/4 smoothing of an 8x8 block,
 * in place.  First a vertical (1,2,1) pass into temp[] (the top and bottom
 * rows are only scaled by 4, i.e. left unfiltered vertically), then a
 * horizontal (1,2,1) pass with combined rounding: interior samples are
 * normalized by >>4, the left/right edge columns by >>2.
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    /* vertical pass: temp holds 4x the (1,2,1)-filtered column values */
    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    /* horizontal pass with rounding; edge columns are not filtered
       horizontally, only renormalized */
    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
/**
 * Sum of absolute differences (SAD) over a 16-wide block of h rows.
 * The unused void* matches the me_cmp function-pointer signature.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* SAD of a 16-wide block against half-pel-interpolated references:
 *   _x2  - pix2 averaged with its right neighbour (avg2)
 *   _y2  - pix2 averaged with the row below (avg2)
 *   _xy2 - 2x2 average of pix2 (avg4)
 * avg2/avg4 are rounding-average helpers defined earlier in this file. */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * Sum of absolute differences (SAD) over an 8-wide block of h rows.
 * The unused void* matches the me_cmp function-pointer signature.
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* 8-wide SAD against half-pel references, mirroring the 16-wide variants:
 * _x2 horizontal avg2, _y2 vertical avg2, _xy2 2x2 avg4. */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2991
    MpegEncContext *c = v;
2992
    int score1=0;
2993
    int score2=0;
2994
    int x,y;
2995

    
2996
    for(y=0; y<h; y++){
2997
        for(x=0; x<16; x++){
2998
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
2999
        }
3000
        if(y+1<h){
3001
            for(x=0; x<15; x++){
3002
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
3003
                             - s1[x+1] + s1[x+1+stride])
3004
                        -FFABS(  s2[x  ] - s2[x  +stride]
3005
                             - s2[x+1] + s2[x+1+stride]);
3006
            }
3007
        }
3008
        s1+= stride;
3009
        s2+= stride;
3010
    }
3011

    
3012
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3013
    else  return score1 + FFABS(score2)*8;
3014
}
3015

    
3016
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3017
    MpegEncContext *c = v;
3018
    int score1=0;
3019
    int score2=0;
3020
    int x,y;
3021

    
3022
    for(y=0; y<h; y++){
3023
        for(x=0; x<8; x++){
3024
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3025
        }
3026
        if(y+1<h){
3027
            for(x=0; x<7; x++){
3028
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
3029
                             - s1[x+1] + s1[x+1+stride])
3030
                        -FFABS(  s2[x  ] - s2[x  +stride]
3031
                             - s2[x+1] + s2[x+1+stride]);
3032
            }
3033
        }
3034
        s1+= stride;
3035
        s2+= stride;
3036
    }
3037

    
3038
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3039
    else  return score1 + FFABS(score2)*8;
3040
}
3041

    
3042
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3043
    int i;
3044
    unsigned int sum=0;
3045

    
3046
    for(i=0; i<8*8; i++){
3047
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3048
        int w= weight[i];
3049
        b>>= RECON_SHIFT;
3050
        assert(-512<b && b<512);
3051

    
3052
        sum += (w*b)*(w*b)>>4;
3053
    }
3054
    return sum>>2;
3055
}
3056

    
3057
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3058
    int i;
3059

    
3060
    for(i=0; i<8*8; i++){
3061
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3062
    }
3063
}
3064

    
3065
/**
3066
 * permutes an 8x8 block.
3067
 * @param block the block which will be permuted according to the given permutation vector
3068
 * @param permutation the permutation vector
3069
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3070
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3071
 *                  (inverse) permutated to scantable order!
3072
 */
3073
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3074
{
3075
    int i;
3076
    DCTELEM temp[64];
3077

    
3078
    if(last<=0) return;
3079
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3080

    
3081
    for(i=0; i<=last; i++){
3082
        const int j= scantable[i];
3083
        temp[j]= block[j];
3084
        block[j]=0;
3085
    }
3086

    
3087
    for(i=0; i<=last; i++){
3088
        const int j= scantable[i];
3089
        const int perm_j= permutation[j];
3090
        block[perm_j]= temp[j];
3091
    }
3092
}
3093

    
3094
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    /* trivial comparator: every pair of blocks is reported as identical */
    return 0;
}
3097

    
3098
/**
 * Fill the 6 comparison-function slots in cmp[] with the metric family
 * selected by the low byte of type (one FF_CMP_* value).
 * Unknown selectors leave the slots zeroed and log an error.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_DWT
        /* wavelet metrics are only built when the DWT code is enabled */
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
3157

    
3158
static void clear_block_c(DCTELEM *block)
{
    /* zero a single 8x8 coefficient block */
    memset(block, 0, 64 * sizeof(DCTELEM));
}
3162

    
3163
/**
3164
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3165
 */
3166
static void clear_blocks_c(DCTELEM *blocks)
3167
{
3168
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
3169
}
3170

    
3171
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3172
    long i;
3173
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3174
        long a = *(long*)(src+i);
3175
        long b = *(long*)(dst+i);
3176
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3177
    }
3178
    for(; i<w; i++)
3179
        dst[i+0] += src[i+0];
3180
}
3181

    
3182
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3183
    long i;
3184
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3185
        long a = *(long*)(src1+i);
3186
        long b = *(long*)(src2+i);
3187
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3188
    }
3189
    for(; i<w; i++)
3190
        dst[i] = src1[i]+src2[i];
3191
}
3192

    
3193
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3194
    long i;
3195
#if !HAVE_FAST_UNALIGNED
3196
    if((long)src2 & (sizeof(long)-1)){
3197
        for(i=0; i+7<w; i+=8){
3198
            dst[i+0] = src1[i+0]-src2[i+0];
3199
            dst[i+1] = src1[i+1]-src2[i+1];
3200
            dst[i+2] = src1[i+2]-src2[i+2];
3201
            dst[i+3] = src1[i+3]-src2[i+3];
3202
            dst[i+4] = src1[i+4]-src2[i+4];
3203
            dst[i+5] = src1[i+5]-src2[i+5];
3204
            dst[i+6] = src1[i+6]-src2[i+6];
3205
            dst[i+7] = src1[i+7]-src2[i+7];
3206
        }
3207
    }else
3208
#endif
3209
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3210
        long a = *(long*)(src1+i);
3211
        long b = *(long*)(src2+i);
3212
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3213
    }
3214
    for(; i<w; i++)
3215
        dst[i+0] = src1[i+0]-src2[i+0];
3216
}
3217

    
3218
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t pred_l  = *left;       /* reconstructed left neighbour */
    uint8_t pred_tl = *left_top;   /* top-left neighbour from src1 */
    int i;

    /* HuffYUV median prediction: predictor is the median of left, top and
     * (left + top - topleft); add the residual to reconstruct each pixel */
    for (i = 0; i < w; i++) {
        pred_l  = mid_pred(pred_l, src1[i], (pred_l + src1[i] - pred_tl) & 0xFF) + diff[i];
        pred_tl = src1[i];
        dst[i]  = pred_l;
    }

    /* hand the running state back to the caller for the next row segment */
    *left     = pred_l;
    *left_top = pred_tl;
}
3234

    
3235
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t pred_l  = *left;       /* left neighbour (from src2) */
    uint8_t pred_tl = *left_top;   /* top-left neighbour (from src1) */
    int i;

    /* encoder-side counterpart of add_hfyu_median_prediction_c:
     * emit the residual of src2 against the median predictor */
    for (i = 0; i < w; i++) {
        const int pred = mid_pred(pred_l, src1[i], (pred_l + src1[i] - pred_tl) & 0xFF);
        pred_tl = src1[i];
        pred_l  = src2[i];
        dst[i]  = pred_l - pred;
    }

    /* hand the running state back to the caller for the next row segment */
    *left     = pred_l;
    *left_top = pred_tl;
}
3252

    
3253
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    /* Left prediction: dst is the running prefix sum of src seeded with acc.
     * Each store truncates to 8 bits; the accumulator itself is returned
     * un-masked so the caller can carry it into the next segment. */
    for (i = 0; i < w; i++) {
        acc   += src[i];
        dst[i] = acc;
    }

    return acc;
}
3271

    
3272
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int r = *red, g = *green, b = *blue, a = *alpha;
    int i;

    /* per-channel running sums over packed 32-bit BGRA pixels;
     * the B/G/R/A byte offsets depend on endianness (see above) */
    for (i = 0; i < w; i++) {
        const uint8_t *sp = src + 4 * i;
        uint8_t       *dp = dst + 4 * i;

        b += sp[B];
        g += sp[G];
        r += sp[R];
        a += sp[A];

        dp[B] = b;
        dp[G] = g;
        dp[R] = r;
        dp[A] = a;
    }

    /* hand the running channel sums back to the caller */
    *red   = r;
    *green = g;
    *blue  = b;
    *alpha = a;
}
#undef B
#undef G
#undef R
#undef A
3312

    
3313
/* one butterfly stage writing sum/difference to two fresh outputs */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* in-place butterfly: x <- x+y, y <- x-y (using the old values of both) */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* final butterfly folded directly into the absolute sum: |x+y| + |x-y| */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3327

    
3328
/* SATD: apply an 8x8 Hadamard transform to the difference between src and
 * dst and return the sum of the absolute transformed coefficients. */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: three butterfly stages per row, operating on the
     * pixel differences */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass: two butterfly stages per column, the last stage is
     * folded into the absolute sum via BUTTERFLYA */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
/* disabled debug code tracking the maximum observed score */
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
3379

    
3380
/* Intra SATD: 8x8 Hadamard transform of the source block itself (no
 * reference), summing absolute coefficients and subtracting the DC term. */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: three butterfly stages per row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass: last stage folded into the absolute sum */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    /* remove the DC contribution so a flat block scores 0 */
    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3427

    
3428
/* Transform-domain SAD: forward-DCT the 8x8 pixel difference and return
 * the sum of the absolute values of all coefficients. */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}
3438

    
3439
#if CONFIG_GPL
3440
#define DCT8_1D {\
3441
    const int s07 = SRC(0) + SRC(7);\
3442
    const int s16 = SRC(1) + SRC(6);\
3443
    const int s25 = SRC(2) + SRC(5);\
3444
    const int s34 = SRC(3) + SRC(4);\
3445
    const int a0 = s07 + s34;\
3446
    const int a1 = s16 + s25;\
3447
    const int a2 = s07 - s34;\
3448
    const int a3 = s16 - s25;\
3449
    const int d07 = SRC(0) - SRC(7);\
3450
    const int d16 = SRC(1) - SRC(6);\
3451
    const int d25 = SRC(2) - SRC(5);\
3452
    const int d34 = SRC(3) - SRC(4);\
3453
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
3454
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
3455
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
3456
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
3457
    DST(0,  a0 + a1     ) ;\
3458
    DST(1,  a4 + (a7>>2)) ;\
3459
    DST(2,  a2 + (a3>>1)) ;\
3460
    DST(3,  a5 + (a6>>2)) ;\
3461
    DST(4,  a0 - a1     ) ;\
3462
    DST(5,  a6 - (a5>>2)) ;\
3463
    DST(6, (a2>>1) - a3 ) ;\
3464
    DST(7, (a4>>2) - a7 ) ;\
3465
}
3466

    
3467
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3468
    MpegEncContext * const s= (MpegEncContext *)c;
3469
    DCTELEM dct[8][8];
3470
    int i;
3471
    int sum=0;
3472

    
3473
    s->dsp.diff_pixels(dct[0], src1, src2, stride);
3474

    
3475
#define SRC(x) dct[i][x]
3476
#define DST(x,v) dct[i][x]= v
3477
    for( i = 0; i < 8; i++ )
3478
        DCT8_1D
3479
#undef SRC
3480
#undef DST
3481

    
3482
#define SRC(x) dct[x][i]
3483
#define DST(x,v) sum += FFABS(v)
3484
    for( i = 0; i < 8; i++ )
3485
        DCT8_1D
3486
#undef SRC
3487
#undef DST
3488
    return sum;
3489
}
3490
#endif
3491

    
3492
/* Maximum absolute DCT coefficient of the 8x8 pixel difference block. */
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}
3507

    
3508
/* Squared error introduced by quantization at the current qscale:
 * the difference block is run through quantize -> dequantize -> IDCT and
 * compared against the saved original difference.
 * NOTE(review): fast_dct_quantize presumably performs the forward DCT as
 * part of quantization, so bak (spatial) and the reconstructed temp are in
 * the same domain — confirm against the MpegEncContext implementation. */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
    DCTELEM * const bak = temp+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    /* keep an untouched copy to compare against after the round trip */
    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
3530

    
3531
/* Rate-distortion cost of coding one 8x8 block: quantize the difference,
 * count the bits its run/level pairs would take with the current VLC
 * tables, reconstruct, measure the SSE distortion against the original,
 * and combine the two with a qscale-dependent lambda. */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* work on local aligned copies; lsrc2 is later overwritten by the
     * reconstruction */
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the VLC length tables matching the macroblock type; intra also
     * pays for the DC coefficient up front */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* walk the scan order accumulating (run, level) code lengths;
         * level is biased by 64 so the table index is non-negative */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        /* the final coefficient uses the dedicated "last" table */
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* dequantize and reconstruct to measure the actual distortion */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    /* distortion + lambda*rate; 109/128 scales qscale^2 into lambda units */
    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
3606

    
3607
/* Rate-only metric: number of bits needed to code the quantized 8x8
 * difference block with the current VLC length tables (same counting logic
 * as rd8x8_c, without reconstruction or distortion). */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the VLC length tables matching the macroblock type; intra also
     * pays for the DC coefficient up front */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* accumulate (run, level) code lengths along the scan order;
         * level is biased by 64 so the table index is non-negative */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        /* the final coefficient uses the dedicated "last" table */
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
3665

    
3666
/* Intra vertical-activity metric: sum of absolute differences between
 * vertically adjacent pixels within a single block (no reference).
 * Instantiated below for 8- and 16-pixel-wide blocks. */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
3683

    
3684
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int sum = 0;
    int x, y;

    /* sum of absolute differences of the *vertical gradients* of the two
     * blocks (how differently they change from row to row) */
    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            sum += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
        s1 += stride;
        s2 += stride;
    }

    return sum;
}
3698

    
3699
#define SQ(a) ((a)*(a))

/* Intra vertical-activity metric (squared flavour): sum of squared
 * differences between vertically adjacent pixels within one block.
 * Instantiated below for 8- and 16-pixel-wide blocks. */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score = 0;                                                      \
    int col, row;                                                       \
                                                                        \
    for (row = 1; row < h; row++) {                                     \
        for (col = 0; col < size; col += 4) {                           \
            score += SQ(s[col    ] - s[col     + stride])               \
                   + SQ(s[col + 1] - s[col + 1 + stride])               \
                   + SQ(s[col + 2] - s[col + 2 + stride])               \
                   + SQ(s[col + 3] - s[col + 3 + stride]);              \
        }                                                               \
        s += stride;                                                    \
    }                                                                   \
                                                                        \
    return score;                                                       \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
3717

    
3718
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int sum = 0;
    int x, y;

    /* sum of squared differences of the *vertical gradients* of the two
     * blocks (how differently they change from row to row) */
    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++) {
            const int d = s1[x] - s2[x] - s1[x + stride] + s2[x + stride];
            sum += d * d;
        }
        s1 += stride;
        s2 += stride;
    }

    return sum;
}
3732

    
3733
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int sum = 0;
    int i;

    /* sum of squared differences between an int8 and an int16 vector */
    for (i = 0; i < size; i++) {
        const int d = pix1[i] - pix2[i];
        sum += d * d;
    }
    return sum;
}
3741

    
3742
/* Build the 16x16 comparison functions from the 8x8 kernels above.
 * NOTE(review): WRAPPER8_16_SQ is defined elsewhere (not visible here);
 * presumably it applies the 8x8 kernel to the four quadrants and sums the
 * results — confirm in the header that defines it. */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
3752

    
3753
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int n;

    /* element-wise product: dst[n] = src0[n] * src1[n] */
    for (n = 0; n < len; n++)
        dst[n] = src0[n] * src1[n];
}
3758

    
3759
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int n;

    /* multiply src0 by src1 traversed back-to-front:
     * dst[n] = src0[n] * src1[len-1-n] */
    for (n = 0; n < len; n++)
        dst[n] = src0[n] * src1[len - 1 - n];
}
3765

    
3766
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int n;

    /* fused multiply-add over vectors: dst[n] = src0[n]*src1[n] + src2[n] */
    for (n = 0; n < len; n++)
        dst[n] = src0[n] * src1[n] + src2[n];
}
3771

    
3772
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int i, j;

    /* Overlap-add windowing (e.g. MDCT overlap): walk mirrored index pairs
     * inward from both ends of the 2*len output.  The exact order of the
     * float operations is preserved bit-for-bit. */
    dst  += len;
    win  += len;
    src0 += len;
    for (i = -len, j = len - 1; i < 0; i++, j--) {
        const float s0 = src0[i];
        const float s1 = src1[j];
        const float wi = win[i];
        const float wj = win[j];
        dst[i] = s0 * wj - s1 * wi + add_bias;
        dst[j] = s0 * wi + s1 * wj + add_bias;
    }
}
3786

    
3787
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int n;

    /* scale a vector by a constant: dst[n] = src[n] * mul */
    for (n = 0; n < len; n++)
        dst[n] = src[n] * mul;
}
3794

    
3795
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int n;

    /* multiply src by a sequence of 2-element sub-vectors (one per pair of
     * outputs) and a global scalar; sv advances every 2 samples */
    for (n = 0; n < len; n += 2, sv++) {
        const float *v = sv[0];
        dst[n    ] = src[n    ] * v[0] * mul;
        dst[n + 1] = src[n + 1] * v[1] * mul;
    }
}
3804

    
3805
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int n;

    /* multiply src by a sequence of 4-element sub-vectors (one per group of
     * four outputs) and a global scalar; sv advances every 4 samples */
    for (n = 0; n < len; n += 4, sv++) {
        const float *v = sv[0];
        dst[n    ] = src[n    ] * v[0] * mul;
        dst[n + 1] = src[n + 1] * v[1] * mul;
        dst[n + 2] = src[n + 2] * v[2] * mul;
        dst[n + 3] = src[n + 3] * v[3] * mul;
    }
}
3816

    
3817
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int n;

    /* expand a sequence of 2-element sub-vectors into dst, scaled by mul */
    for (n = 0; n < len; n += 2, sv++) {
        const float *v = sv[0];
        dst[n    ] = v[0] * mul;
        dst[n + 1] = v[1] * mul;
    }
}
3826

    
3827
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int n;

    /* expand a sequence of 4-element sub-vectors into dst, scaled by mul */
    for (n = 0; n < len; n += 4, sv++) {
        const float *v = sv[0];
        dst[n    ] = v[0] * mul;
        dst[n + 1] = v[1] * mul;
        dst[n + 2] = v[2] * mul;
        dst[n + 3] = v[3] * mul;
    }
}
3838

    
3839
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int n;

    /* in-place butterfly: v1 <- v1+v2, v2 <- old v1 - old v2 */
    for (n = 0; n < len; n++) {
        const float diff = v1[n] - v2[n];
        v1[n] = v1[n] + v2[n];
        v2[n] = diff;
    }
}
3849

    
3850
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int n;

    /* plain dot product of two float vectors */
    for (n = 0; n < len; n++)
        acc += v1[n] * v2[n];

    return acc;
}
3860

    
3861
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int n;

    /* convert int samples to float while applying a constant gain */
    for (n = 0; n < len; n++)
        dst[n] = src[n] * mul;
}
3866

    
3867
/* Clip one float handled as its raw 32-bit pattern.  `mini` is the bit
 * pattern of the (negative) lower bound, `maxi` of the (positive) upper
 * bound, `maxisign` is maxi with the sign bit flipped.
 * NOTE(review): this relies on min < 0 < max — guaranteed by the caller
 * vector_clipf_c, which only takes this path for mixed-sign ranges.
 * Fix: the shift is done as 1U<<31; left-shifting a *signed* 1 into the
 * sign bit (the previous 1<<31) is undefined behavior in C (CERT INT34-C). */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{

    if(a > mini) return mini;                        /* below the negative bound */
    else if((a^(1U<<31)) > maxisign) return maxi;    /* above the positive bound */
    else return a;
}
3875

    
3876
/* Clip a float vector to [*min, *max] where min < 0 < max, by comparing
 * IEEE-754 bit patterns as unsigned integers (see clipf_c_one).
 * len is assumed to be a multiple of 8.
 * NOTE(review): the float<->uint32 pointer casts are type punning that
 * formally violates strict aliasing; kept as-is to match the rest of the
 * file's conventions.
 * Fix: maxi ^ (1U<<31) — left-shifting a *signed* 1 into the sign bit
 * (the previous 1<<31) is undefined behavior in C (CERT INT34-C). */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
3894
/**
 * Clip each of len floats in src to [min, max], writing to dst.
 *
 * len must be a multiple of 8: both paths process 8 elements per
 * iteration with no remainder handling.  When min < 0 < max the
 * bit-pattern fast path is taken; otherwise av_clipf() is applied
 * elementwise.
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;
    if(min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for(i=0; i < len; i+=8) {
            dst[i    ] = av_clipf(src[i    ], min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
        }
    }
}
3911

    
3912
/* Round one float to the nearest integer (lrintf) and saturate it to
 * the int16 range. */
static av_always_inline int float_to_int16_one(const float *src){
    return av_clip_int16(lrintf(*src));
}
3915

    
3916
/**
 * Convert len float samples to int16 with round-to-nearest and
 * saturation.
 *
 * Fix: the loop counter was a plain int while len is long; on LP64
 * targets a len above INT_MAX would overflow the counter (undefined
 * behavior).  Use a long counter to match len.
 */
static void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    long i;
    for(i=0; i<len; i++)
        dst[i] = float_to_int16_one(src+i);
}
3921

    
3922
/**
 * Convert planar float audio (channels planes of len samples each in
 * src[]) to interleaved int16 in dst, with round-to-nearest and
 * saturation.  The stereo case is special-cased.
 *
 * Fix: the loop counters i and j were int while len is long; on LP64
 * targets indices up to len*channels could overflow an int (undefined
 * behavior).  Use long counters for sample indices.
 */
static void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    long i, j;
    int c;
    if(channels==2){
        for(i=0; i<len; i++){
            dst[2*i]   = float_to_int16_one(src[0]+i);
            dst[2*i+1] = float_to_int16_one(src[1]+i);
        }
    }else{
        for(c=0; c<channels; c++)
            for(i=0, j=c; i<len; i++, j+=channels)
                dst[j] = float_to_int16_one(src[c]+i);
    }
}
3935

    
3936
/**
 * Dot product of two int16 vectors of length order, with each
 * elementwise product shifted right by shift before accumulation.
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int acc = 0;
    int k;

    for (k = 0; k < order; k++)
        acc += (v1[k] * v2[k]) >> shift;

    return acc;
}
3945

    
3946
/**
 * Compute the dot product of v1 and v2 while simultaneously updating
 * v1 in place: v1[k] += mul * v3[k].  The dot product uses the value
 * of v1[k] before it is updated.
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;
    int k;

    for (k = 0; k < order; k++) {
        acc   += v1[k] * v2[k];
        v1[k] += mul * v3[k];
    }

    return acc;
}
3955

    
3956
#define W0 2048
3957
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
3958
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
3959
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
3960
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
3961
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
3962
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
3963
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
3964

    
3965
/**
 * One 8-point row pass of the WMV2 inverse DCT, in place on b[0..7].
 * Uses the fixed-point cosine factors W0..W7 defined above; results
 * are rounded and scaled down by 2^8.
 */
static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1*/
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /*step 2*/
    /* 181/256 approximates 1/sqrt(2) for the odd-part rotation */
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3: butterflies with rounding, scale down by 2^8 */
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}
3991
/**
 * One 8-point column pass of the WMV2 inverse DCT, in place on
 * b[0], b[8], ..., b[56].  Works at extended precision (inputs are
 * pre-scaled >>3 with rounding) and applies the final >>14 scaling.
 */
static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1, with extended precision*/
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /*step 2*/
    /* 181/256 approximates 1/sqrt(2) for the odd-part rotation */
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3: butterflies with rounding, final scale down by 2^14 */
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;

    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
}
4018
/**
 * In-place 2-D WMV2 inverse DCT on an 8x8 coefficient block:
 * 8 row passes followed by 8 column passes.
 */
void ff_wmv2_idct_c(short * block){
    int row, col;

    for (row = 0; row < 8; row++)
        wmv2_idct_row(block + 8 * row);

    for (col = 0; col < 8; col++)
        wmv2_idct_col(block + col);
}
4028
/* XXX: those functions should be suppressed ASAP when all IDCTs are
4029
 converted */
4030
/* Run the WMV2 IDCT on block, then store the clamped result at dest. */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* Run the WMV2 IDCT on block, then add the clamped result to dest. */
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
4040
/* Run the jpeg-reference IDCT on block, then store the clamped result
 * at dest. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* Run the jpeg-reference IDCT on block, then add the clamped result
 * to dest. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
4050

    
4051
/* 4x4 (lowres 1) variant: jpeg-reference IDCT + clamped store. */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/* 4x4 (lowres 1) variant: jpeg-reference IDCT + clamped add. */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
4061

    
4062
/* 2x2 (lowres 2) variant: jpeg-reference IDCT + clamped store. */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/* 2x2 (lowres 2) variant: jpeg-reference IDCT + clamped add. */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
4072

    
4073
/* 1x1 (lowres 3) variant: only the DC coefficient remains, so the IDCT
 * reduces to a rounded >>3 of block[0], clipped via ff_cropTbl, and a
 * store of that single pixel; line_size is unused. */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
/* 1x1 (lowres 3) variant: add the rounded DC value to the single
 * destination pixel, clipped via ff_cropTbl; line_size is unused. */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
4085

    
4086
/* Intentional no-op with a pixel-function-compatible signature; all
 * arguments are ignored (av_unused). */
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4087

    
4088
/* init static data */
4089
av_cold void dsputil_static_init(void)
4090
{
4091
    int i;
4092

    
4093
    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4094
    for(i=0;i<MAX_NEG_CROP;i++) {
4095
        ff_cropTbl[i] = 0;
4096
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4097
    }
4098

    
4099
    for(i=0;i<512;i++) {
4100
        ff_squareTbl[i] = (i - 256) * (i - 256);
4101
    }
4102

    
4103
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4104
}
4105

    
4106
/**
 * Verify that the compiler honors 16-byte stack alignment requests.
 * On misalignment, logs a one-time error (only on MMX/AltiVec builds)
 * and returns -1; returns 0 when alignment is correct.
 */
int ff_check_alignment(void){
    static int did_fail=0;  /* ensures the error is logged only once */
    DECLARE_ALIGNED(16, int, aligned);

    if((intptr_t)&aligned & 15){
        if(!did_fail){
#if HAVE_MMX || HAVE_ALTIVEC
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
                "Do not report crashes to FFmpeg developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}
4125

    
4126
av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4127
{
4128
    int i;
4129

    
4130
    ff_check_alignment();
4131

    
4132
#if CONFIG_ENCODERS
4133
    if(avctx->dct_algo==FF_DCT_FASTINT) {
4134
        c->fdct = fdct_ifast;
4135
        c->fdct248 = fdct_ifast248;
4136
    }
4137
    else if(avctx->dct_algo==FF_DCT_FAAN) {
4138
        c->fdct = ff_faandct;
4139
        c->fdct248 = ff_faandct248;
4140
    }
4141
    else {
4142
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4143
        c->fdct248 = ff_fdct248_islow;
4144
    }
4145
#endif //CONFIG_ENCODERS
4146

    
4147
    if(avctx->lowres==1){
4148
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4149
            c->idct_put= ff_jref_idct4_put;
4150
            c->idct_add= ff_jref_idct4_add;
4151
        }else{
4152
            c->idct_put= ff_h264_lowres_idct_put_c;
4153
            c->idct_add= ff_h264_lowres_idct_add_c;
4154
        }
4155
        c->idct    = j_rev_dct4;
4156
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4157
    }else if(avctx->lowres==2){
4158
        c->idct_put= ff_jref_idct2_put;
4159
        c->idct_add= ff_jref_idct2_add;
4160
        c->idct    = j_rev_dct2;
4161
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4162
    }else if(avctx->lowres==3){
4163
        c->idct_put= ff_jref_idct1_put;
4164
        c->idct_add= ff_jref_idct1_add;
4165
        c->idct    = j_rev_dct1;
4166
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4167
    }else{
4168
        if(avctx->idct_algo==FF_IDCT_INT){
4169
            c->idct_put= ff_jref_idct_put;
4170
            c->idct_add= ff_jref_idct_add;
4171
            c->idct    = j_rev_dct;
4172
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4173
        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4174
                avctx->idct_algo==FF_IDCT_VP3){
4175
            c->idct_put= ff_vp3_idct_put_c;
4176
            c->idct_add= ff_vp3_idct_add_c;
4177
            c->idct    = ff_vp3_idct_c;
4178
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4179
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
4180
            c->idct_put= ff_wmv2_idct_put_c;
4181
            c->idct_add= ff_wmv2_idct_add_c;
4182
            c->idct    = ff_wmv2_idct_c;
4183
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4184
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
4185
            c->idct_put= ff_faanidct_put;
4186
            c->idct_add= ff_faanidct_add;
4187
            c->idct    = ff_faanidct;
4188
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4189
        }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4190
            c->idct_put= ff_ea_idct_put_c;
4191
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4192
        }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
4193
            c->idct     = ff_bink_idct_c;
4194
            c->idct_add = ff_bink_idct_add_c;
4195
            c->idct_put = ff_bink_idct_put_c;
4196
            c->idct_permutation_type = FF_NO_IDCT_PERM;
4197
        }else{ //accurate/default
4198
            c->idct_put= ff_simple_idct_put;
4199
            c->idct_add= ff_simple_idct_add;
4200
            c->idct    = ff_simple_idct;
4201
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4202
        }
4203
    }
4204

    
4205
    c->get_pixels = get_pixels_c;
4206
    c->diff_pixels = diff_pixels_c;
4207
    c->put_pixels_clamped = put_pixels_clamped_c;
4208
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4209
    c->put_pixels_nonclamped = put_pixels_nonclamped_c;
4210
    c->add_pixels_clamped = add_pixels_clamped_c;
4211
    c->add_pixels8 = add_pixels8_c;
4212
    c->add_pixels4 = add_pixels4_c;
4213
    c->sum_abs_dctelem = sum_abs_dctelem_c;
4214
    c->gmc1 = gmc1_c;
4215
    c->gmc = ff_gmc_c;
4216
    c->clear_block = clear_block_c;
4217
    c->clear_blocks = clear_blocks_c;
4218
    c->pix_sum = pix_sum_c;
4219
    c->pix_norm1 = pix_norm1_c;
4220

    
4221
    c->fill_block_tab[0] = fill_block16_c;
4222
    c->fill_block_tab[1] = fill_block8_c;
4223
    c->scale_block = scale_block_c;
4224

    
4225
    /* TODO [0] 16  [1] 8 */
4226
    c->pix_abs[0][0] = pix_abs16_c;
4227
    c->pix_abs[0][1] = pix_abs16_x2_c;
4228
    c->pix_abs[0][2] = pix_abs16_y2_c;
4229
    c->pix_abs[0][3] = pix_abs16_xy2_c;
4230
    c->pix_abs[1][0] = pix_abs8_c;
4231
    c->pix_abs[1][1] = pix_abs8_x2_c;
4232
    c->pix_abs[1][2] = pix_abs8_y2_c;
4233
    c->pix_abs[1][3] = pix_abs8_xy2_c;
4234

    
4235
#define dspfunc(PFX, IDX, NUM) \
4236
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
4237
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
4238
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
4239
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4240

    
4241
    dspfunc(put, 0, 16);
4242
    dspfunc(put_no_rnd, 0, 16);
4243
    dspfunc(put, 1, 8);
4244
    dspfunc(put_no_rnd, 1, 8);
4245
    dspfunc(put, 2, 4);
4246
    dspfunc(put, 3, 2);
4247

    
4248
    dspfunc(avg, 0, 16);
4249
    dspfunc(avg_no_rnd, 0, 16);
4250
    dspfunc(avg, 1, 8);
4251
    dspfunc(avg_no_rnd, 1, 8);
4252
    dspfunc(avg, 2, 4);
4253
    dspfunc(avg, 3, 2);
4254
#undef dspfunc
4255

    
4256
    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4257
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4258

    
4259
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4260
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4261
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4262
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4263
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4264
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4265
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4266
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4267
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4268

    
4269
    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4270
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4271
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4272
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4273
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4274
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4275
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4276
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4277
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4278

    
4279
#define dspfunc(PFX, IDX, NUM) \
4280
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4281
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4282
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4283
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM