/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file libavcodec/dsputil.c
 * DSP utils
 */

#include "avcodec.h"
#include "dsputil.h"
#include "simple_idct.h"
#include "faandct.h"
#include "faanidct.h"
#include "mathops.h"
#include "mpegvideo.h"
#include "config.h"
#include "lpc.h"
#include "ac3dec.h"
#include "vorbis.h"
#include "png.h"

uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };
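
/* Note: the dsputil init code fills ff_squareTbl so that (ff_squareTbl + 256)[i] == i*i
 * for i in [-256, 255]; the +256 bias lets the sse*_c functions below index it
 * directly with the signed difference of two pixels. */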

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f, depending on the CPU's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
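
/* For example, with a 32-bit unsigned long, ~0UL/255 == 0x01010101, so
 * pb_7f == 0x7f7f7f7f and pb_80 == 0x80808080; with a 64-bit unsigned long
 * they widen to 0x7f7f7f7f7f7f7f7f and 0x8080808080808080. */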

const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* not permuted inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];

const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256;
 * for a>16909558 it is an overestimate by less than 1 part in 1<<24 */
const uint32_t ff_inverse[257]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
  16777216
};
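
/* Worked example of the reciprocal trick: for b=3, ff_inverse[3]==1431655766,
 * so e.g. a=1000 gives (uint32_t)((1000ULL*1431655766)>>32) == 333 == 1000/3. */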

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};

void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
#if ARCH_PPC
        st->inverse[j] = i;
#endif
    }

    end=-1;
    for(i=0; i<64; i++){
        int j;
        j = st->permutated[i];
        if(j>end) end=j;
        st->raster_end[i]= end;
    }
}
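
/* After initialization, st->permutated[] holds the scan reordered for the active
 * IDCT's input layout, and st->raster_end[i] is the largest permuted index among
 * the first i+1 scan positions, i.e. an upper bound on where the last nonzero
 * coefficient can sit in raster order. */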

static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}

static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}
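
/* The replicated border written above allows motion vectors to point slightly
 * outside the picture without per-pixel clipping in the MC inner loops
 * (unrestricted motion vectors). */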

/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
       //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

       //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
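
/* Usage sketch (hypothetical values): for a motion-compensated read of a 17x17
 * region whose top-left corner (src_x, src_y) = (-3, -2) sticks out of a 64x48
 * reference plane, a caller would do
 *     ff_emulated_edge_mc(edge_buf, ref + src_y*linesize + src_x, linesize,
 *                         17, 17, src_x, src_y, 64, 48);
 * and then run the interpolation on edge_buf instead of the reference plane. */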

static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}


static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}

static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}

static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                    int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = block[0];
        pixels[1] = block[1];
        pixels[2] = block[2];
        pixels[3] = block[3];
        pixels[4] = block[4];
        pixels[5] = block[5];
        pixels[6] = block[6];
        pixels[7] = block[7];

        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}

static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}

static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}

static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}

static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
{
    int i, j;
    uint16_t *dst1 = (uint16_t *) dst;
    uint16_t *dst2 = (uint16_t *)(dst + linesize);

    for (j = 0; j < 8; j++) {
        for (i = 0; i < 8; i++) {
            dst1[i] = dst2[i] = src[i] * 0x0101;
        }
        src  += 8;
        dst1 += linesize;
        dst2 += linesize;
    }
}
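
/* scale_block_c doubles a block 2x in each direction: multiplying a byte by
 * 0x0101 replicates it into both halves of a uint16_t, so each store covers two
 * horizontal neighbors, while dst1/dst2 write the two duplicated rows (they
 * advance by linesize uint16_t elements, i.e. 2*linesize bytes). */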

#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // the disabled branch above is the 64 bit variant; the 32 bit code follows

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
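
/* The two instantiations above expand to put_pixels2_c/put_pixels4_c/
 * put_pixels8_c/put_pixels16_c etc. (plain stores) and their avg_* counterparts,
 * which merge with the destination through the rounding average rnd_avg32();
 * e.g. put_pixels8_c(dst, src, stride, 8) copies an 8x8 block. */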

#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
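
/* gmc1_c is bilinear interpolation at a fixed 1/16-pel offset (x16, y16): the
 * weights A, B, C and D always sum to 16*16 == 256, so the >>8 renormalizes the
 * weighted sum after the caller-chosen rounder is added. */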

void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}

static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
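
/* The tpel (third-pel, SVQ3) filters replace a division by their weight sum with
 * a multiply and shift: 683 == ((1<<11) + 1)/3, so (683*v)>>11 == v/3 for the
 * value range occurring here, and 2731 ~= (1<<15)/12 plays the same role for the
 * two-dimensional cases below, whose weights sum to 12. */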
1326

    
1327
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1328
    int i,j;
1329
    for (i=0; i < height; i++) {
1330
      for (j=0; j < width; j++) {
1331
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
1332
      }
1333
      src += stride;
1334
      dst += stride;
1335
    }
1336
}
1337

    
1338
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1339
    int i,j;
1340
    for (i=0; i < height; i++) {
1341
      for (j=0; j < width; j++) {
1342
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
1343
      }
1344
      src += stride;
1345
      dst += stride;
1346
    }
1347
}
1348

    
1349
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1350
    int i,j;
1351
    for (i=0; i < height; i++) {
1352
      for (j=0; j < width; j++) {
1353
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
1354
      }
1355
      src += stride;
1356
      dst += stride;
1357
    }
1358
}
1359

    
1360
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1361
    int i,j;
1362
    for (i=0; i < height; i++) {
1363
      for (j=0; j < width; j++) {
1364
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1365
      }
1366
      src += stride;
1367
      dst += stride;
1368
    }
1369
}
1370

    
1371
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
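
/* NOTE: the tpel (third-pel) routines above divide by the sum of their
 * bilinear weights (3 or 12) via fixed-point reciprocals: 683 == (2^11+1)/3,
 * so (683*x)>>11 ~= x/3, and 2731 == (2^15+4)/12, so (2731*x)>>15 ~= x/12;
 * the "+1"/"+6" terms round to nearest. The avg_ variants then average the
 * interpolated value with dst, rounding up on ties. */
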
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif

#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

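/* The bilinear weights above always satisfy A+B+C+D == 64 (they are
 * (8-x)(8-y), x(8-y), (8-x)y and xy with 0 <= x,y < 8), so the OP macros
 * below normalize with a round-to-nearest ">>6" after adding 32. */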
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put

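/* The VC-1 "no rounding" chroma variants below use the same 1/64 bilinear
 * weights but add 32-4 == 28 instead of 32 before the shift, biasing the
 * result downward relative to round-to-nearest. */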
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}

static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
        dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
        dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
        dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
        dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
        dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
        dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
        dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
        dst+= stride;
        src+= stride;
    }
}

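/* The MPEG-4 quarter-pel lowpass filters generated below use the 8-tap
 * kernel (-1, 3, -6, 20, 20, -6, 3, -1), mirrored at block edges; the taps
 * sum to 32, hence the ">>5" with a +16 (rounding) or +15 (no-rounding)
 * bias in the op_* macros that instantiate QPEL_MC. */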
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}

#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd

#if 1
#define H264_LOWPASS(OPNAME, OP, OP2) \
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

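/* H264_MC expands to the 16 quarter-pel positions for one block size:
 * _mcXY_c interpolates at x/4 pel horizontally and y/4 pel vertically,
 * e.g. mc20 is the horizontal half-pel and mc22 the center position. */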
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

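/* The H.264 6-tap kernel (1, -5, 20, 20, -5, 1) sums to 32: one-dimensional
 * passes normalize with ">>5" (+16 for rounding), while the 2D hv pass keeps
 * unclipped 16-bit intermediates and normalizes by 32*32 == 1024 via the
 * op2_* macros (">>10", +512). */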
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif

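/* H.264 weighted prediction: op_scale1 applies a single weight plus a
 * pre-rounded offset to each sample and shifts down by log2_denom;
 * op_scale2 blends two sources with separate weights. Both clip the
 * result to 8 bits. */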
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT

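/* WMV2 half-pel interpolation: a 4-tap (-1, 9, 9, -1)/16 lowpass filter,
 * applied horizontally here and vertically in wmv2_mspel8_v_lowpass(). */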
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<h; i++){
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }
}

#if CONFIG_CAVS_DECODER
/* AVS specific */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */

#if CONFIG_VC1_DECODER
/* VC-1 specific */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    avg_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER */

/* H264 specific */
void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);

#if CONFIG_RV40_DECODER
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */

static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<w; i++){
        const int src_1= src[ -srcStride];
        const int src0 = src[0          ];
        const int src1 = src[  srcStride];
        const int src2 = src[2*srcStride];
        const int src3 = src[3*srcStride];
        const int src4 = src[4*srcStride];
        const int src5 = src[5*srcStride];
        const int src6 = src[6*srcStride];
        const int src7 = src[7*srcStride];
        const int src8 = src[8*srcStride];
        const int src9 = src[9*srcStride];
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
        src++;
        dst++;
    }
}

static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}

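/* H.263 deblocking: the correction d1 ramps linearly up to +/-strength and
 * back down to zero for large differences, so genuine edges pass through
 * unmodified while blocking artifacts across the 8x8 boundary are
 * smoothed; d2 applies a smaller, clipped correction to the outer pair. */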
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
    }
}

static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
    }
}

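/* H.261 in-loop filter: a separable [1 2 1]/4 smoothing of the 8x8 block,
 * with the border rows and columns passed through unfiltered. */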
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}

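/* H.264 in-loop deblocking (normal mode): tc0[] carries the per-segment
 * clipping thresholds derived from the boundary strength; a negative tc0
 * value skips that 4-pixel segment. The xstride/ystride pair selects
 * between the vertical and horizontal edge wrappers below. */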
static av_always_inline av_flatten void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                if( FFABS( p2 - p0 ) < beta ) {
                    if(tc0[i])
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    if(tc0[i])
                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}

static av_always_inline av_flatten void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 16; d++ ) {
        const int p2 = pix[-3*xstride];
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];

        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];
        const int q2 = pix[ 2*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
                if( FFABS( p2 - p0 ) < beta)
                {
                    const int p3 = pix[-4*xstride];
                    /* p0', p1', p2' */
                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
                } else {
                    /* p0' */
                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                }
                if( FFABS( q2 - q0 ) < beta)
                {
                    const int q3 = pix[3*xstride];
                    /* q0', q1', q2' */
                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
                } else {
                    /* q0' */
                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
                }
            }else{
                /* p0', q0' */
                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            }
        }
        pix += ystride;
    }
}
static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
}
static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
}

static av_always_inline av_flatten void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}

static av_always_inline av_flatten void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}

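/* Sum-of-absolute-differences (SAD) between two blocks; the _x2, _y2 and
 * _xy2 variants compare against the half-pel interpolated reference
 * (horizontal, vertical and diagonal averaging via avg2/avg4). */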
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

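/* Noise-preserving sum of squared differences: score1 is the plain SSE,
 * score2 compares the local gradients of the two blocks, so differences
 * in texture/noise amount are penalised with weight avctx->nsse_weight. */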
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

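/* Helpers for the encoder's quantization noise shaping search:
 * try_8x8basis() estimates the weighted squared error of adding a scaled
 * basis function to the residual, add_8x8basis() actually applies it. */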
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}

static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}

/**
 * Permute an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
 *                  (inverse) permuted to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];

    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}

static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}

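/* Fill cmp[0..5] with the comparison functions matching the requested
 * FF_CMP_* type, one slot per block-size/plane combination. */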
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_DWT
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}

static void clear_block_c(DCTELEM *block)
{
    memset(block, 0, sizeof(DCTELEM)*64);
}

/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}

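/* Byte-wise add/subtract over a whole machine word at once: the low 7 bits
 * of every byte are processed with the carry masked off, then the top bit
 * of each byte is fixed up with XOR, so carries never propagate between
 * bytes packed in the same long. */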
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}

static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
#if !HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

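/* HuffYUV median prediction: each sample is predicted as the median of
 * left, top and (left + top - top-left), with the running left/left_top
 * state passed in and out through pointers. */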
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
        lt= src1[i];
        dst[i]= l;
    }

    *left= l;
    *left_top= lt;
}

static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}

static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    for(i=0; i<w-1; i++){
        acc+= src[i];
        dst[i]= acc;
        i++;
        acc+= src[i];
        dst[i]= acc;
    }

    for(; i<w; i++){
        acc+= src[i];
        dst[i]= acc;
    }

    return acc;
}

#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int i;
    int r,g,b,a;
    r= *red;
    g= *green;
    b= *blue;
    a= *alpha;

    for(i=0; i<w; i++){
        b+= src[4*i+B];
        g+= src[4*i+G];
        r+= src[4*i+R];
        a+= src[4*i+A];

        dst[4*i+B]= b;
        dst[4*i+G]= g;
        dst[4*i+R]= r;
        dst[4*i+A]= a;
    }

    *red= r;
    *green= g;
    *blue= b;
    *alpha= a;
}
#undef B
#undef G
#undef R
#undef A

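/* Butterfly primitives for the 8x8 Hadamard transform used by the SATD
 * comparison functions below. */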
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))

static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}

static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}

static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}

#if CONFIG_GPL
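/* One 1-D pass of the integer 8x8 transform from H.264's high profile,
 * used to build the dct264_sad metric below. */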
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif

static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}

static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
    DCTELEM * const bak = temp+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}

static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}

static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}

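/* Vertical-gradient metrics: the vsad and vsse variants sum absolute
 * (respectively squared) differences between vertically adjacent lines,
 * either within one block (intra) or of the residual between two blocks. */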
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)

static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}

#define SQ(a) ((a)*(a))
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)

static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}

static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int score=0;
    int i;
    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
    return score;
}

WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)

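/* Float vector primitives, used mainly by the audio codecs; the
 * corresponding DSPContext function pointers may be overridden by SIMD
 * versions at init time. */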
static void vector_fmul_c(float *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] *= src[i];
}

static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}

static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i] + src2[i];
}

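/* Windowed overlap-add as used after an inverse MDCT: the buffer is walked
 * from both ends at once, with the window applied forwards to one half and
 * backwards to the other. */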
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
4066
    int i,j;
4067
    dst += len;
4068
    win += len;
4069
    src0+= len;
4070
    for(i=-len, j=len-1; i<0; i++, j--) {
4071
        float s0 = src0[i];
4072
        float s1 = src1[j];
4073
        float wi = win[i];
4074
        float wj = win[j];
4075
        dst[i] = s0*wj - s1*wi + add_bias;
4076
        dst[j] = s0*wi + s1*wj + add_bias;
4077
    }
4078
}
4079

    
4080
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
4081
                                 int len)
4082
{
4083
    int i;
4084
    for (i = 0; i < len; i++)
4085
        dst[i] = src[i] * mul;
4086
}
4087

    
4088
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
4089
                                      const float **sv, float mul, int len)
4090
{
4091
    int i;
4092
    for (i = 0; i < len; i += 2, sv++) {
4093
        dst[i  ] = src[i  ] * sv[0][0] * mul;
4094
        dst[i+1] = src[i+1] * sv[0][1] * mul;
4095
    }
4096
}
4097

    
4098
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
4099
                                      const float **sv, float mul, int len)
4100
{
4101
    int i;
4102
    for (i = 0; i < len; i += 4, sv++) {
4103
        dst[i  ] = src[i  ] * sv[0][0] * mul;
4104
        dst[i+1] = src[i+1] * sv[0][1] * mul;
4105
        dst[i+2] = src[i+2] * sv[0][2] * mul;
4106
        dst[i+3] = src[i+3] * sv[0][3] * mul;
4107
    }
4108
}
4109

    
4110
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
4111
                               int len)
4112
{
4113
    int i;
4114
    for (i = 0; i < len; i += 2, sv++) {
4115
        dst[i  ] = sv[0][0] * mul;
4116
        dst[i+1] = sv[0][1] * mul;
4117
    }
4118
}
4119

    
4120
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
        dst[i+2] = sv[0][2] * mul;
        dst[i+3] = sv[0][3] * mul;
    }
}

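/**
 * In-place butterfly: (v1[i], v2[i]) becomes (v1[i]+v2[i], v1[i]-v2[i]).
 */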
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int i;
    for (i = 0; i < len; i++) {
        float t = v1[i] - v2[i];
        v1[i] += v2[i];
        v2[i] = t;
    }
}

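/**
 * Return the scalar (dot) product of two float vectors.
 */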
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float p = 0.0;
    int i;

    for (i = 0; i < len; i++)
        p += v1[i] * v2[i];

    return p;
}

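/**
 * Convert an int32 vector to float, scaling each element:
 * dst[i] = src[i] * mul.
 */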
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src[i] * mul;
}

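/**
 * Clip a single float via its IEEE-754 bit pattern; valid only when
 * min < 0 < max. As unsigned integers, negative floats sort above all
 * positive ones, so a > mini detects values below min, and comparing
 * with the sign bit flipped detects values above max.
 */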
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    if(a > mini) return mini;
    else if((a^(1<<31)) > maxisign) return maxi;
    else return a;
}

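/**
 * Clip a float vector to [*min, *max] using clipf_c_one; only correct
 * when *min < 0 < *max. len must be a multiple of 8.
 */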
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}

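/**
 * Clip a float vector to [min, max]; when the bounds straddle zero the
 * bit-pattern variant above is used. len must be a multiple of 8.
 */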
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;
    if(min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for(i=0; i < len; i+=8) {
            dst[i    ] = av_clipf(src[i    ], min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
        }
    }
}

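/**
 * Convert one float to int16 by reinterpreting its IEEE-754 bit pattern.
 * The input is assumed to be pre-biased (historically by adding 385.0)
 * so that the 16-bit sample ends up in the low mantissa bits; bits 16-19
 * flag out-of-range values, which are clipped to -32768 / 32767.
 */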
static av_always_inline int float_to_int16_one(const float *src){
    int_fast32_t tmp = *(const int32_t*)src;
    if(tmp & 0xf0000){
        tmp = (0x43c0ffff - tmp)>>31;
        // is this faster on some gcc/cpu combinations?
//      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//      else                 tmp = 0;
    }
    return tmp - 0x8000;
}

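/**
 * Convert a contiguous vector of pre-biased floats to int16
 * (see float_to_int16_one).
 */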
void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = float_to_int16_one(src+i);
}

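/**
 * Convert planar float channels to interleaved int16. Stereo is
 * special-cased; other channel counts fall back to a strided loop.
 */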
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    int i,j,c;
    if(channels==2){
        for(i=0; i<len; i++){
            dst[2*i]   = float_to_int16_one(src[0]+i);
            dst[2*i+1] = float_to_int16_one(src[1]+i);
        }
    }else{
        for(c=0; c<channels; c++)
            for(i=0, j=c; i<len; i++, j+=channels)
                dst[j] = float_to_int16_one(src[c]+i);
    }
}

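/**
 * Scalar product of two int16 vectors, with each product shifted right by
 * 'shift' before being accumulated.
 */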
static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int res = 0;

    while (order--)
        res += (*v1++ * *v2++) >> shift;

    return res;