ffmpeg / libavcodec / dsputil.c @ 3fc548df
/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * DSP utils
 */

#include "avcodec.h"
#include "dsputil.h"
#include "simple_idct.h"
#include "faandct.h"
#include "faanidct.h"
#include "mathops.h"
#include "mpegvideo.h"
#include "config.h"
#include "lpc.h"
#include "ac3dec.h"
#include "vorbis.h"
#include "png.h"
#include "vp8dsp.h"

uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

// 0x7f replicated into every byte of a native-width word:
// 0x7f7f7f7f on 32-bit longs, 0x7f7f7f7f7f7f7f7f on 64-bit longs
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
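
/* Derivation (illustrative note): ~0UL is the all-ones word, and all-ones
 * divided by 255 is the byte-replication constant 0x0101...01; with 32-bit
 * longs, 0xFFFFFFFFUL/255 == 0x01010101UL, so
 *     0x01010101UL * 0x7f == 0x7f7f7f7fUL   (pb_7f)
 *     0x01010101UL * 0x80 == 0x80808080UL   (pb_80)
 * The same expressions widen automatically on 64-bit targets, which is what
 * lets the SIMD-within-a-register (SWAR) code below work at native width. */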

const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
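
/* A scan table maps position-in-scan-order to raster position within the
 * 8x8 block. A minimal de-zigzag sketch (illustrative only; "level" and
 * "block" are hypothetical locals, not names used in this file):
 *
 *     for (i = 0; i < 64; i++)
 *         block[ff_zigzag_direct[i]] = level[i];
 */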

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* not permuted inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];

const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};

void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
#if ARCH_PPC
        st->inverse[j] = i;
#endif
    }

    end=-1;
    for(i=0; i<64; i++){
        int j;
        j = st->permutated[i];
        if(j>end) end=j;
        st->raster_end[i]= end;
    }
}
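
/* Typical use (a sketch modeled on mpegvideo.c; exact call sites vary):
 * the IDCT's input permutation is folded into the scan order once at init
 * time, so decoders can store coefficients directly in IDCT order:
 *
 *     ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable,
 *                       ff_zigzag_direct);
 */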

static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
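
/* pix_norm1_c returns the sum of squares of a 16x16 block. The enabled
 * branch loads a whole machine word per iteration and looks each byte up
 * in the square table; since addition is commutative the byte order does
 * not matter, but the wide loads do assume pix may be accessed through
 * uint32_t/uint64_t pointers on the target platform. */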

static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= av_bswap32(src[i+0]);
        dst[i+1]= av_bswap32(src[i+1]);
        dst[i+2]= av_bswap32(src[i+2]);
        dst[i+3]= av_bswap32(src[i+3]);
        dst[i+4]= av_bswap32(src[i+4]);
        dst[i+5]= av_bswap32(src[i+5]);
        dst[i+6]= av_bswap32(src[i+6]);
        dst[i+7]= av_bswap32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= av_bswap32(src[i+0]);
    }
}

static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
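
/* The sse*_c functions index ff_squareTbl through sq = ff_squareTbl + 256:
 * pixel differences lie in [-255, 255], so sq[d] reads ff_squareTbl[256+d].
 * A sketch of the table's contents (the actual initialization lives in the
 * init code of the full file, not in this excerpt):
 *
 *     for (i = 0; i < 512; i++)
 *         ff_squareTbl[i] = (i - 256) * (i - 256);
 */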

/* Draw the edges of width 'w' of an image of size width x height. */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}

/**
 * Copy a rectangular area of samples to a temporary buffer and replicate the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
       //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

       //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
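
/* Usage sketch (illustrative; buffer names follow MpegEncContext, and the
 * condition is pseudocode): when a motion vector reaches outside the
 * reference frame, callers first build a padded copy of the block and run
 * the normal MC code on that temporary buffer:
 *
 *     if (block_overlaps_frame_edge) {
 *         ff_emulated_edge_mc(s->edge_emu_buffer, src, s->linesize,
 *                             17, 17, src_x, src_y,
 *                             s->h_edge_pos, s->v_edge_pos);
 *         src = s->edge_emu_buffer;
 *     }
 */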

static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}

static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}

static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}

static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                    int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = block[0];
        pixels[1] = block[1];
        pixels[2] = block[2];
        pixels[3] = block[3];
        pixels[4] = block[4];
        pixels[5] = block[5];
        pixels[6] = block[6];
        pixels[7] = block[7];

        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}

static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}

static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}

static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}

static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
{
    int i, j;
    uint16_t *dst1 = (uint16_t *) dst;
    uint16_t *dst2 = (uint16_t *)(dst + linesize);

    for (j = 0; j < 8; j++) {
        for (i = 0; i < 8; i++) {
            dst1[i] = dst2[i] = src[i] * 0x0101;
        }
        src  += 8;
        dst1 += linesize;
        dst2 += linesize;
    }
}
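
/* scale_block_c upscales an 8x8 block to 16x16 by pixel replication:
 * multiplying a byte by 0x0101 duplicates it into both halves of a
 * uint16_t, and writing each row through both dst1 and dst2 duplicates it
 * vertically. Since dst1/dst2 are uint16_t pointers, "+= linesize"
 * advances by two picture lines, stepping over the pair just written. */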

#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 32 bit variant (the #if 0 block above is the 64 bit one)

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
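
/* Background on the byte-parallel averaging used above (the identities are
 * standard SWAR; this note is explanatory, not from the original comments):
 * using a + b == (a ^ b) + 2*(a & b), per byte
 *     rounding:     (a+b+1)>>1 == (a|b) - (((a^b) & 0xFEFEFEFE) >> 1)
 *     no rounding:  (a+b)>>1   == (a&b) + (((a^b) & 0xFEFEFEFE) >> 1)
 * The 0xFE mask clears each byte's low bit before the shift so nothing
 * leaks into the neighbouring byte; this is what rnd_avg32()/no_rnd_avg32()
 * compute, averaging four pixels per 32-bit operation. The xy2 variants
 * similarly split every byte into its low two bits (0x03 mask) and high six
 * bits (0xFC mask, pre-shifted) so four source pixels can be accumulated
 * per byte lane without overflow before the final >>2. */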

static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
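
/* gmc1_c is bilinear interpolation at a fixed 1/16-pel phase (x16, y16):
 * the weights satisfy A+B+C+D == 16*16 == 256, so ">> 8" renormalizes the
 * weighted sum. For example x16=8, y16=0 gives A=B=128 and C=D=0, i.e. a
 * plain horizontal half-pel average. */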

void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}

static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
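
/* The tpel ("third-pel", used by SVQ3) filters approximate division by 3
 * and by 12 with fixed-point multiplies: 683 ~= 2^11/3 and 2731 ~= 2^15/12,
 * so for example
 *     (683*(2*a + b + 1)) >> 11                ~= (2*a + b + 1)/3
 *     (2731*(4*a + 3*b + 3*c + 2*d + 6)) >> 15 ~= (4*a + 3*b + 3*c + 2*d + 6)/12
 * which interpolates at 1/3-pixel offsets without any division. */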

static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1484

    
1485
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put

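/* The H.264 chroma MC above is a bilinear interpolator: for 1/8-pel fractional
 * offsets (x,y), the weights A=(8-x)*(8-y), B=x*(8-y), C=(8-x)*y and D=x*y
 * always sum to 64, and op_put/op_avg divide by 64 with round-to-nearest
 * ("(b + 32) >> 6").  For example x=2, y=3 gives A=30, B=10, C=18, D=6, and
 * 30+10+18+6 == 64.  When D==0 the offset is purely horizontal or purely
 * vertical, so the 2D blend collapses to a 1D blend with weight E=B+C along
 * the corresponding axis (step = stride or 1). */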
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}

static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
        dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
        dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
        dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
        dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
        dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
        dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
        dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
        dst+= stride;
        src+= stride;
    }
}

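/* The VC-1 variants above use the same bilinear weights as the H.264 chroma
 * MC but bias the sum with 32-4 = 28 instead of 32 before the >> 6; as the
 * function names indicate, this is the "no rounding" mode, which rounds
 * slightly lower than the round-to-nearest put/avg versions. */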
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}

#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd

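/* QPEL_MC above implements the MPEG-4 quarter-sample half-pel filter: an
 * 8-tap symmetric lowpass with coefficients (-1, 3, -6, 20, 20, -6, 3, -1),
 * normalized by the >> 5 in the OP macros (the taps sum to 32).  Samples
 * that would fall outside the block are mirrored at the edges, which is why
 * the first and last output positions reuse nearby indices instead of
 * reading src[-1] or src[9]/src[17].  op_put rounds to nearest
 * ((b + 16) >> 5) while the _no_rnd_ variants round down ((b + 15) >> 5). */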
#if 1
#define H264_LOWPASS(OPNAME, OP, OP2) \
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

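/* H264_LOWPASS above is the H.264 6-tap half-sample filter with coefficients
 * (1, -5, 20, 20, -5, 1); a single pass is normalized by the (b + 16) >> 5 in
 * OP.  The _hv_ variants first filter horizontally into the 16-bit tmp plane
 * without clipping, then vertically with OP2, whose (b + 512) >> 10 undoes
 * the scaling of both passes at once (32 * 32 = 1024). */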
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

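/* H264_MC above expands to one function per quarter-sample position: the
 * mcXY suffix encodes a fractional offset of X/4 horizontally and Y/4
 * vertically (mc00 is the full-sample copy).  Half-sample planes come from
 * the lowpass filters, and quarter-sample positions are formed by averaging
 * two neighbouring full/half-sample planes with pixels ## SIZE ## _l2. */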
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif

static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<h; i++){
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }
}

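/* wmv2_mspel8_h_lowpass (and the matching _v_ version further below) is the
 * WMV2 half-sample filter: 4 taps (-1, 9, 9, -1), which sum to 16, normalized
 * with round-to-nearest by the (... + 8) >> 4 and clipped through cm. */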
#if CONFIG_CAVS_DECODER
/* AVS specific */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */

#if CONFIG_VC1_DECODER
/* VC-1 specific */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    avg_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER */

#if CONFIG_RV40_DECODER
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */

static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<w; i++){
        const int src_1= src[ -srcStride];
        const int src0 = src[0          ];
        const int src1 = src[  srcStride];
        const int src2 = src[2*srcStride];
        const int src3 = src[3*srcStride];
        const int src4 = src[4*srcStride];
        const int src5 = src[5*srcStride];
        const int src6 = src[6*srcStride];
        const int src7 = src[7*srcStride];
        const int src8 = src[8*srcStride];
        const int src9 = src[9*srcStride];
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
        src++;
        dst++;
    }
}

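/* WMV2 motion compensation: in put_mspel8_mcXY the digits X and Y give the
 * horizontal and vertical sub-pel position in quarter-pel units (0 = full
 * pel, 2 = half pel). Quarter-pel positions are approximated by averaging
 * the full-pel source with the half-pel lowpass result via put_pixels8_l2. */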
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}

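/* H.263 loop (deblocking) filter. The _v variant below filters across a
 * horizontal block boundary (8 columns), the _h variant across a vertical
 * one (8 rows). For each line, d measures the discontinuity at the edge and
 * d1 is a ramp function of d that peaks at |d| == strength and vanishes for
 * |d| >= 2*strength, so strong real edges are left untouched.
 * "if(p1&256) p1= ~(p1>>31);" is a compact clip of an over- or underflowed
 * value to 255 or 0 respectively. */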
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
    }
}

static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
    }
}

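/* H.261 loop filter: a separable (1,2,1)/4 smoothing filter applied
 * vertically and then horizontally over the 8x8 block; edge pixels are not
 * filtered in the direction that would read outside the block, only rescaled
 * so the final rounding shift matches. */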
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}

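/* Sum-of-absolute-differences (SAD) metrics used by motion estimation.
 * pix_abs16/pix_abs8 compare a block against the reference; the _x2/_y2/_xy2
 * variants compare against the half-pel interpolated reference (horizontal,
 * vertical and diagonal averaging respectively). The unused first argument
 * keeps the me_cmp_func signature. */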
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

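/* NSSE (noise preserving SSE) comparison: plain SSE plus a penalty on how
 * much the local high-frequency (2x2 gradient) structure of the two blocks
 * differs, weighted by avctx->nsse_weight (8 when no context is supplied),
 * so matches that keep the amount of detail score better than blurred ones. */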
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

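/* Helpers for the encoder's quantizer noise shaping: try_8x8basis returns
 * the weighted squared error that would result from adding 'scale' times a
 * DCT basis function to the residual; add_8x8basis actually applies it. */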
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}

static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}

/**
 * Permute an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
 *                  (inverse) permuted to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];

    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}

static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}

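/* Fill the 6-entry comparison-function table 'cmp' according to the
 * FF_CMP_* id in the low byte of 'type'; entry [0] is the 16x16 comparator,
 * entry [1] the 8x8 one. A typical encoder-side use (a sketch, not code
 * from this file) would be:
 *     ff_set_cmp(&s->dsp, s->dsp.me_cmp,     s->avctx->me_cmp);
 *     ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
 */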
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_DWT
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}

static void clear_block_c(DCTELEM *block)
{
    memset(block, 0, sizeof(DCTELEM)*64);
}

/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}

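/* SWAR (SIMD within a register) byte-wise arithmetic: bytes are processed
 * one machine word at a time. The low 7 bits of each byte are added (or
 * subtracted) directly; the top bit is recomputed with XOR so that carries
 * can never propagate into the neighbouring byte lane. */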
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}

static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
#if !HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

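/* HuffYUV median prediction: the predictor is the median of the left
 * neighbour, the top neighbour and the gradient left+top-topleft (the same
 * MED predictor as in JPEG-LS). add_* reconstructs pixels from a difference
 * stream, sub_* produces one; *left and *left_top carry the state across
 * calls so a row can be processed in slices. */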
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
        lt= src1[i];
        dst[i]= l;
    }

    *left= l;
    *left_top= lt;
}

static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}

static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    for(i=0; i<w-1; i++){
        acc+= src[i];
        dst[i]= acc;
        i++;
        acc+= src[i];
        dst[i]= acc;
    }

    for(; i<w; i++){
        acc+= src[i];
        dst[i]= acc;
    }

    return acc;
}

#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int i;
    int r,g,b,a;
    r= *red;
    g= *green;
    b= *blue;
    a= *alpha;

    for(i=0; i<w; i++){
        b+= src[4*i+B];
        g+= src[4*i+G];
        r+= src[4*i+R];
        a+= src[4*i+A];

        dst[4*i+B]= b;
        dst[4*i+G]= g;
        dst[4*i+R]= r;
        dst[4*i+A]= a;
    }

    *red= r;
    *green= g;
    *blue= b;
    *alpha= a;
}
#undef B
#undef G
#undef R
#undef A

#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))

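/* SATD: an 8x8 Hadamard transform of the difference (or source) block built
 * from three butterfly stages per direction, followed by a sum of absolute
 * transform coefficients; BUTTERFLYA fuses the last stage with the abs-sum.
 * The intra variant subtracts the DC term so a constant offset costs nothing. */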
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}

static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}

static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}

#if CONFIG_GPL
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif

static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}

static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
    DCTELEM * const bak = temp+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}

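/* Rate-distortion comparison: quantize the DCT of the difference block,
 * count the bits its coefficients would cost with the current VLC tables,
 * then dequantize, inverse transform and measure the actual SSE distortion.
 * Returns distortion + lambda*bits with lambda = qscale^2 * 109/128. */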
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}

static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}

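/* Vertical-gradient metrics: vsad/vsse sum the absolute (resp. squared)
 * differences between vertically adjacent pixels; the inter versions do the
 * same on the difference of the two blocks. They estimate the vertical
 * high-frequency energy a block would cost, e.g. for interlaced DCT
 * decisions. */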
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)

static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}

#define SQ(a) ((a)*(a))
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                               \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)

static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}

static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int score=0;
    int i;
    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
    return score;
}

WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)

static void vector_fmul_c(float *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] *= src[i];
}

static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}

static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i] + src2[i];
}

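/* Windowed overlap-add as used by MDCT based audio codecs: combines one
 * half-frame (src0, typically the saved half of the previous frame) with the
 * time-reversed other half (src1) under the symmetric window win. add_bias
 * is a DC offset that lets decoders keeping their samples biased (see the
 * float_to_int16 conversion further below) fold the bias into this pass. */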
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int i,j;
    dst += len;
    win += len;
    src0+= len;
    for(i=-len, j=len-1; i<0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0*wj - s1*wi + add_bias;
        dst[j] = s0*wi + s1*wj + add_bias;
    }
}

static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}

static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
    }
}

static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
        dst[i+2] = src[i+2] * sv[0][2] * mul;
        dst[i+3] = src[i+3] * sv[0][3] * mul;
    }
}

static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
    }
}

static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
        dst[i+2] = sv[0][2] * mul;
        dst[i+3] = sv[0][3] * mul;
    }
}

static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int i;
    for (i = 0; i < len; i++) {
        float t = v1[i] - v2[i];
        v1[i] += v2[i];
        v2[i] = t;
    }
}

static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float p = 0.0;
    int i;

    for (i = 0; i < len; i++)
        p += v1[i] * v2[i];

    return p;
}

static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src[i] * mul;
}

static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{

    if(a > mini) return mini;
    else if((a^(1<<31)) > maxisign) return maxi;
    else return a;
}

static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;
    if(min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for(i=0; i < len; i+=8) {
            dst[i    ] = av_clipf(src[i    ], min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
        }
    }
}

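/* Float to int16 conversion by bit twiddling instead of a float compare:
 * the caller is expected to keep its samples biased (cf. the add_bias
 * parameter above) so that the IEEE-754 bit pattern of an in-range sample,
 * read as an integer, is linear in the sample value; a non-zero value in
 * bits 16..19 flags an out-of-range sample, and the sign of
 * (0x43c0ffff - tmp) then selects the clamp direction. */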
static av_always_inline int float_to_int16_one(const float *src){
    int_fast32_t tmp = *(const int32_t*)src;
    if(tmp & 0xf0000){
        tmp = (0x43c0ffff - tmp)>>31;
        // is this faster on some gcc/cpu combinations?
//      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//      else                 tmp = 0;
    }
    return tmp - 0x8000;
}

void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = float_to_int16_one(src+i);
}

void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    int i,j,c;
    if(channels==2){
        for(i=0; i<len; i++){
            dst[2*i]   = float_to_int16_one(src[0]+i);
            dst[2*i+1] = float_to_int16_one(src[1]+i);
        }
    }else{
        for(c=0; c<channels; c++)
            for(i=0, j=c; i<len; i++, j+=channels)
                dst[j] = float_to_int16_one(src[c]+i);
    }
}

static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int res = 0;

    while (order--)
        res += (*v1++ * *v2++) >> shift;

    return res;
}

static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int res = 0;
    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return res;
}

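/* WMV2 IDCT: a standard row/column decomposition of the 8-point IDCT using
 * the scaled integer cosine constants below. Rows round with 8 bits of
 * headroom; columns pre-shift by 3 bits for extra precision and round with
 * 14 bits at the end. */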
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1*/
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}
static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1, with extended precision*/
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;

    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
}
void ff_wmv2_idct_c(short * block){
    int i;

    for(i=0;i<64;i+=8){
        wmv2_idct_row(block+i);
    }
    for(i=0;i<8;i++){
        wmv2_idct_col(block+i);
    }
}
/* XXX: these functions should be removed as soon as all IDCTs are
 converted */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}

static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}

static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}

static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}

static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }

/* init static data */
av_cold void dsputil_static_init(void)
{
    int i;

    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        ff_cropTbl[i] = 0;
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
    }

    for(i=0;i<512;i++) {
        ff_squareTbl[i] = (i - 256) * (i - 256);
    }

    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
}

int ff_check_alignment(void){
    static int did_fail=0;
    DECLARE_ALIGNED(16, int, aligned);

    if((intptr_t)&aligned & 15){
        if(!did_fail){
#if HAVE_MMX || HAVE_ALTIVEC
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
                "Do not report crashes to FFmpeg developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}

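/* Fill the DSPContext with the C reference implementations, selected from
 * the codec options (dct_algo, idct_algo, lowres). Architecture-specific
 * init code run afterwards may override individual pointers with optimized
 * versions. */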
av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4144
{
4145
    int i;
4146

    
4147
    ff_check_alignment();
4148

    
4149
#if CONFIG_ENCODERS
4150
    if(avctx->dct_algo==FF_DCT_FASTINT) {
4151
        c->fdct = fdct_ifast;
4152
        c->fdct248 = fdct_ifast248;
4153
    }
4154
    else if(avctx->dct_algo==FF_DCT_FAAN) {
4155
        c->fdct = ff_faandct;
4156
        c->fdct248 = ff_faandct248;
4157
    }
4158
    else {
4159
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4160
        c->fdct248 = ff_fdct248_islow;
4161
    }
4162
#endif //CONFIG_ENCODERS
    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            c->idct_put= ff_h264_lowres_idct_put_c;
            c->idct_add= ff_h264_lowres_idct_add_c;
        }
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
                avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct    = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct    = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
            c->idct_put= ff_faanidct_put;
            c->idct_add= ff_faanidct_add;
            c->idct    = ff_faanidct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
            c->idct_put= ff_ea_idct_put_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
            c->idct     = ff_bink_idct_c;
            c->idct_add = ff_bink_idct_add_c;
            c->idct_put = ff_bink_idct_put_c;
            c->idct_permutation_type = FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= ff_simple_idct_put;
            c->idct_add= ff_simple_idct_add;
            c->idct    = ff_simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }
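
    /* generic C implementations of the pixel operations; arch-specific
       init code may replace individual entries with optimized versions */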
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->put_pixels_nonclamped = put_pixels_nonclamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->add_pixels8 = add_pixels8_c;
    c->add_pixels4 = add_pixels4_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->gmc1 = gmc1_c;
    c->gmc = ff_gmc_c;
    c->clear_block = clear_block_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;
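
    /* block fill/scale helpers: fill_block_tab[0] handles 16-pixel-wide
       blocks, fill_block_tab[1] 8-pixel-wide ones */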
    c->fill_block_tab[0] = fill_block16_c;
    c->fill_block_tab[1] = fill_block8_c;
    c->scale_block = scale_block_c;

    /* SAD functions: pix_abs[0][*] operate on 16-pixel-wide blocks,
       pix_abs[1][*] on 8-pixel-wide ones; columns 1-3 are the x, y and
       xy half-pel interpolated variants */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;
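
    /* dspfunc() wires the four half-pel variants (full, x2, y2, xy2) of a
       put/avg routine into one row of the corresponding _pixels_tab;
       IDX selects the block width: 0=16, 1=8, 2=4, 3=2 pixels */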
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
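
    /* note: the no-rounding variants are only registered for the
       16- and 8-pixel widths */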
    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc
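
    /* two-source "l2" put: writes the no-rounding average of two input
       blocks (16- and 8-pixel-wide variants) */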
    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
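
    /* thirdpel motion compensation (used by SVQ3): the table index is
       dy*4 + dx with dx,dy in thirds of a pixel (0..2), so entries
       3 and 7 stay unused */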
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;