Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ b6204677

History | View | Annotate | Download (140 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * This library is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2 of the License, or (at your option) any later version.
10
 *
11
 * This library is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with this library; if not, write to the Free Software
18
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19
 *
20
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
21
 */
22

    
23
/**
24
 * @file dsputil.c
25
 * DSP utils
26
 */
27

    
28
#include "avcodec.h"
29
#include "dsputil.h"
30
#include "mpegvideo.h"
31
#include "simple_idct.h"
32
#include "faandct.h"
33

    
34
/* snow.c */
/* External prototype for the snow wavelet transform; used by the
   w_c() wavelet comparison functions below. */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
36

    
37
/* Clipping LUT: indexed through (cropTbl + MAX_NEG_CROP) so that indices in
   [-MAX_NEG_CROP, 255 + MAX_NEG_CROP] are valid.  Zero here; presumably
   filled with clamped values by the dsputil init code — confirm at init. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Square LUT: indexed through (squareTbl + 256) with differences in
   [-255, 255].  Zero here; presumably filled with i*i at init — confirm. */
uint32_t squareTbl[512] = {0, };
39

    
40
/* Standard zigzag scan order: entry n is the row-major coefficient
   index read at scan position n of an 8x8 block. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
50

    
51
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
63

    
64
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* Zero here; presumably filled from ff_zigzag_direct at init — confirm. */
uint16_t __align8 inv_zigzag_direct16[64] = {0, };
66

    
67
/* Alternate (horizontally biased) scan order for an 8x8 block. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
77

    
78
/* Alternate (vertically biased) scan order for an 8x8 block. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
88

    
89
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* Reciprocal table: inverse[b] is ceil(2^32/b), letting a division by a
   small constant be replaced by a multiply and a 32-bit right shift. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
124

    
125
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
136

    
137
/**
 * Sum of all 256 pixel values of a 16x16 block.
 * @param pix       top-left pixel of the block
 * @param line_size stride in bytes between rows
 * @return the sum of the pixels
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
158

    
159
/**
 * Sum of squares of all pixels of a 16x16 block, via the squareTbl LUT.
 * @param pix       top-left pixel of the block
 * @param line_size stride in bytes between rows
 * @return sum over the block of pix[i]*pix[i]
 */
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = squareTbl + 256;  /* centered so negative indices are legal */

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            /* straightforward byte-at-a-time reference version */
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            /* 64-bit host: read 8 pixels per load.
               NOTE(review): this cast assumes pix is suitably aligned and
               relies on type punning (strict-aliasing/alignment concerns on
               some targets) — the byte order of the extraction does not
               matter since only the per-byte squares are summed. */
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            /* 32-bit host: two 4-pixel loads per iteration */
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;  /* advance to the next row of the block */
    }
    return s;
}
206

    
207
/**
 * Byte-swap a buffer of w 32-bit words from src into dst.
 * The main loop is unrolled by 8; the tail loop handles the remainder.
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int n = 0;

    while (n + 8 <= w) {
        dst[n    ] = bswap_32(src[n    ]);
        dst[n + 1] = bswap_32(src[n + 1]);
        dst[n + 2] = bswap_32(src[n + 2]);
        dst[n + 3] = bswap_32(src[n + 3]);
        dst[n + 4] = bswap_32(src[n + 4]);
        dst[n + 5] = bswap_32(src[n + 5]);
        dst[n + 6] = bswap_32(src[n + 6]);
        dst[n + 7] = bswap_32(src[n + 7]);
        n += 8;
    }
    while (n < w) {
        dst[n] = bswap_32(src[n]);
        n++;
    }
}
224

    
225
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
226
{
227
    int s, i;
228
    uint32_t *sq = squareTbl + 256;
229

    
230
    s = 0;
231
    for (i = 0; i < h; i++) {
232
        s += sq[pix1[0] - pix2[0]];
233
        s += sq[pix1[1] - pix2[1]];
234
        s += sq[pix1[2] - pix2[2]];
235
        s += sq[pix1[3] - pix2[3]];
236
        pix1 += line_size;
237
        pix2 += line_size;
238
    }
239
    return s;
240
}
241

    
242
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
243
{
244
    int s, i;
245
    uint32_t *sq = squareTbl + 256;
246

    
247
    s = 0;
248
    for (i = 0; i < h; i++) {
249
        s += sq[pix1[0] - pix2[0]];
250
        s += sq[pix1[1] - pix2[1]];
251
        s += sq[pix1[2] - pix2[2]];
252
        s += sq[pix1[3] - pix2[3]];
253
        s += sq[pix1[4] - pix2[4]];
254
        s += sq[pix1[5] - pix2[5]];
255
        s += sq[pix1[6] - pix2[6]];
256
        s += sq[pix1[7] - pix2[7]];
257
        pix1 += line_size;
258
        pix2 += line_size;
259
    }
260
    return s;
261
}
262

    
263
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
264
{
265
    int s, i;
266
    uint32_t *sq = squareTbl + 256;
267

    
268
    s = 0;
269
    for (i = 0; i < h; i++) {
270
        s += sq[pix1[ 0] - pix2[ 0]];
271
        s += sq[pix1[ 1] - pix2[ 1]];
272
        s += sq[pix1[ 2] - pix2[ 2]];
273
        s += sq[pix1[ 3] - pix2[ 3]];
274
        s += sq[pix1[ 4] - pix2[ 4]];
275
        s += sq[pix1[ 5] - pix2[ 5]];
276
        s += sq[pix1[ 6] - pix2[ 6]];
277
        s += sq[pix1[ 7] - pix2[ 7]];
278
        s += sq[pix1[ 8] - pix2[ 8]];
279
        s += sq[pix1[ 9] - pix2[ 9]];
280
        s += sq[pix1[10] - pix2[10]];
281
        s += sq[pix1[11] - pix2[11]];
282
        s += sq[pix1[12] - pix2[12]];
283
        s += sq[pix1[13] - pix2[13]];
284
        s += sq[pix1[14] - pix2[14]];
285
        s += sq[pix1[15] - pix2[15]];
286

    
287
        pix1 += line_size;
288
        pix2 += line_size;
289
    }
290
    return s;
291
}
292

    
293

    
294
/**
 * Wavelet-domain comparison metric: forward-transforms the difference of two
 * blocks with the snow spatial DWT and returns the sum of absolute
 * coefficients, scaled down by 4.
 *
 * @param v         unused context pointer (common cmp signature)
 * @param pix1,pix2 blocks to compare
 * @param line_size stride of both blocks
 * @param w         block width, 8 or 16 (selects 3 or 4 decomposition levels)
 * @param h         block height
 * @param type      wavelet type passed through to ff_spatial_dwt
 *                  (presumably 1 = 5/3, 0 = 9/7 — see snow.c)
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
#ifdef CONFIG_SNOW_ENCODER //idwt is in snow.c
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[16*16];

    /* store the scaled (<<4) difference into a 16-wide temporary buffer */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[16*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[16*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[16*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[16*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 16, type, dec_count);

    /* sum of absolute transform coefficients */
    s=0;
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            s+= ABS(tmp[16*i+j+0]);
            s+= ABS(tmp[16*i+j+1]);
            s+= ABS(tmp[16*i+j+2]);
            s+= ABS(tmp[16*i+j+3]);
        }
    }
    assert(s>=0);

    return s>>2;
#else
    /* Bug fix: the original fell off the end of this non-void function when
       CONFIG_SNOW_ENCODER was not defined — undefined behavior if the result
       is used.  Return 0 explicitly in that configuration. */
    return 0;
#endif
}
376

    
377
/* Wavelet metric, 8-wide block, type 1 (presumably the 5/3 wavelet — see snow.c). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}
380

    
381
/* Wavelet metric, 8-wide block, type 0 (presumably the 9/7 wavelet — see snow.c). */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}
384

    
385
/* Wavelet metric, 16-wide block, type 1 (presumably the 5/3 wavelet — see snow.c). */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
388

    
389
/* Wavelet metric, 16-wide block, type 0 (presumably the 9/7 wavelet — see snow.c). */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
392

    
393
/**
 * Copy an 8x8 block of pixels into a DCT coefficient array (8 coeffs/row).
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block += 8;
    }
}
411

    
412
/**
 * Store the per-pixel difference s1 - s2 of two 8x8 blocks into a DCT
 * coefficient array (8 coeffs/row).
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
431

    
432

    
433
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
434
                                 int line_size)
435
{
436
    int i;
437
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
438

    
439
    /* read the pixels */
440
    for(i=0;i<8;i++) {
441
        pixels[0] = cm[block[0]];
442
        pixels[1] = cm[block[1]];
443
        pixels[2] = cm[block[2]];
444
        pixels[3] = cm[block[3]];
445
        pixels[4] = cm[block[4]];
446
        pixels[5] = cm[block[5]];
447
        pixels[6] = cm[block[6]];
448
        pixels[7] = cm[block[7]];
449

    
450
        pixels += line_size;
451
        block += 8;
452
    }
453
}
454

    
455
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
456
                                 int line_size)
457
{
458
    int i;
459
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
460

    
461
    /* read the pixels */
462
    for(i=0;i<4;i++) {
463
        pixels[0] = cm[block[0]];
464
        pixels[1] = cm[block[1]];
465
        pixels[2] = cm[block[2]];
466
        pixels[3] = cm[block[3]];
467

    
468
        pixels += line_size;
469
        block += 8;
470
    }
471
}
472

    
473
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
474
                                 int line_size)
475
{
476
    int i;
477
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
478

    
479
    /* read the pixels */
480
    for(i=0;i<2;i++) {
481
        pixels[0] = cm[block[0]];
482
        pixels[1] = cm[block[1]];
483

    
484
        pixels += line_size;
485
        block += 8;
486
    }
487
}
488

    
489
/**
 * Write an 8x8 block of signed coefficients as pixels, biasing by +128 and
 * clamping the result to [0,255].
 */
static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            int v = block[col] + 128;  /* <-128 -> 0, >127 -> 255 */
            if (v < 0)
                v = 0;
            else if (v > 255)
                v = 255;
            pixels[col] = (uint8_t)v;
        }
        block += 8;
        pixels += line_size;
    }
}
509

    
510
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
511
                          int line_size)
512
{
513
    int i;
514
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
515

    
516
    /* read the pixels */
517
    for(i=0;i<8;i++) {
518
        pixels[0] = cm[pixels[0] + block[0]];
519
        pixels[1] = cm[pixels[1] + block[1]];
520
        pixels[2] = cm[pixels[2] + block[2]];
521
        pixels[3] = cm[pixels[3] + block[3]];
522
        pixels[4] = cm[pixels[4] + block[4]];
523
        pixels[5] = cm[pixels[5] + block[5]];
524
        pixels[6] = cm[pixels[6] + block[6]];
525
        pixels[7] = cm[pixels[7] + block[7]];
526
        pixels += line_size;
527
        block += 8;
528
    }
529
}
530

    
531
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
532
                          int line_size)
533
{
534
    int i;
535
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
536

    
537
    /* read the pixels */
538
    for(i=0;i<4;i++) {
539
        pixels[0] = cm[pixels[0] + block[0]];
540
        pixels[1] = cm[pixels[1] + block[1]];
541
        pixels[2] = cm[pixels[2] + block[2]];
542
        pixels[3] = cm[pixels[3] + block[3]];
543
        pixels += line_size;
544
        block += 8;
545
    }
546
}
547

    
548
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
549
                          int line_size)
550
{
551
    int i;
552
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
553

    
554
    /* read the pixels */
555
    for(i=0;i<2;i++) {
556
        pixels[0] = cm[pixels[0] + block[0]];
557
        pixels[1] = cm[pixels[1] + block[1]];
558
        pixels += line_size;
559
        block += 8;
560
    }
561
}
562

    
563
/**
 * Add an 8x8 coefficient block to a pixel area without clamping
 * (wrap-around on uint8_t overflow, as in the original).
 */
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block += 8;
    }
}
579

    
580
/**
 * Add a 4x4 coefficient block to a pixel area without clamping.
 * NOTE: unlike the clamped 4x4 variant, this block is stored densely
 * (row stride 4, matching the original's block += 4).
 */
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 4; row++) {
        for (col = 0; col < 4; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block += 4;
    }
}
592

    
593
#if 0
594

595
#define PIXOP2(OPNAME, OP) \
596
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
597
{\
598
    int i;\
599
    for(i=0; i<h; i++){\
600
        OP(*((uint64_t*)block), LD64(pixels));\
601
        pixels+=line_size;\
602
        block +=line_size;\
603
    }\
604
}\
605
\
606
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
607
{\
608
    int i;\
609
    for(i=0; i<h; i++){\
610
        const uint64_t a= LD64(pixels  );\
611
        const uint64_t b= LD64(pixels+1);\
612
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
613
        pixels+=line_size;\
614
        block +=line_size;\
615
    }\
616
}\
617
\
618
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
619
{\
620
    int i;\
621
    for(i=0; i<h; i++){\
622
        const uint64_t a= LD64(pixels  );\
623
        const uint64_t b= LD64(pixels+1);\
624
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
625
        pixels+=line_size;\
626
        block +=line_size;\
627
    }\
628
}\
629
\
630
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
631
{\
632
    int i;\
633
    for(i=0; i<h; i++){\
634
        const uint64_t a= LD64(pixels          );\
635
        const uint64_t b= LD64(pixels+line_size);\
636
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
637
        pixels+=line_size;\
638
        block +=line_size;\
639
    }\
640
}\
641
\
642
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
643
{\
644
    int i;\
645
    for(i=0; i<h; i++){\
646
        const uint64_t a= LD64(pixels          );\
647
        const uint64_t b= LD64(pixels+line_size);\
648
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
649
        pixels+=line_size;\
650
        block +=line_size;\
651
    }\
652
}\
653
\
654
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
655
{\
656
        int i;\
657
        const uint64_t a= LD64(pixels  );\
658
        const uint64_t b= LD64(pixels+1);\
659
        uint64_t l0=  (a&0x0303030303030303ULL)\
660
                    + (b&0x0303030303030303ULL)\
661
                    + 0x0202020202020202ULL;\
662
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
663
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
664
        uint64_t l1,h1;\
665
\
666
        pixels+=line_size;\
667
        for(i=0; i<h; i+=2){\
668
            uint64_t a= LD64(pixels  );\
669
            uint64_t b= LD64(pixels+1);\
670
            l1=  (a&0x0303030303030303ULL)\
671
               + (b&0x0303030303030303ULL);\
672
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
673
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
674
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
675
            pixels+=line_size;\
676
            block +=line_size;\
677
            a= LD64(pixels  );\
678
            b= LD64(pixels+1);\
679
            l0=  (a&0x0303030303030303ULL)\
680
               + (b&0x0303030303030303ULL)\
681
               + 0x0202020202020202ULL;\
682
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
683
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
684
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
685
            pixels+=line_size;\
686
            block +=line_size;\
687
        }\
688
}\
689
\
690
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
691
{\
692
        int i;\
693
        const uint64_t a= LD64(pixels  );\
694
        const uint64_t b= LD64(pixels+1);\
695
        uint64_t l0=  (a&0x0303030303030303ULL)\
696
                    + (b&0x0303030303030303ULL)\
697
                    + 0x0101010101010101ULL;\
698
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
699
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
700
        uint64_t l1,h1;\
701
\
702
        pixels+=line_size;\
703
        for(i=0; i<h; i+=2){\
704
            uint64_t a= LD64(pixels  );\
705
            uint64_t b= LD64(pixels+1);\
706
            l1=  (a&0x0303030303030303ULL)\
707
               + (b&0x0303030303030303ULL);\
708
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
709
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
710
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
711
            pixels+=line_size;\
712
            block +=line_size;\
713
            a= LD64(pixels  );\
714
            b= LD64(pixels+1);\
715
            l0=  (a&0x0303030303030303ULL)\
716
               + (b&0x0303030303030303ULL)\
717
               + 0x0101010101010101ULL;\
718
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
719
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
720
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
721
            pixels+=line_size;\
722
            block +=line_size;\
723
        }\
724
}\
725
\
726
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
727
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
728
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
729
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
730
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
731
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
732
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
733

734
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
735
#else // 64 bit variant
736

    
737
#define PIXOP2(OPNAME, OP) \
738
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
739
    int i;\
740
    for(i=0; i<h; i++){\
741
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
742
        pixels+=line_size;\
743
        block +=line_size;\
744
    }\
745
}\
746
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
747
    int i;\
748
    for(i=0; i<h; i++){\
749
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
750
        pixels+=line_size;\
751
        block +=line_size;\
752
    }\
753
}\
754
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
755
    int i;\
756
    for(i=0; i<h; i++){\
757
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
758
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
759
        pixels+=line_size;\
760
        block +=line_size;\
761
    }\
762
}\
763
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
764
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
765
}\
766
\
767
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
768
                                                int src_stride1, int src_stride2, int h){\
769
    int i;\
770
    for(i=0; i<h; i++){\
771
        uint32_t a,b;\
772
        a= LD32(&src1[i*src_stride1  ]);\
773
        b= LD32(&src2[i*src_stride2  ]);\
774
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
775
        a= LD32(&src1[i*src_stride1+4]);\
776
        b= LD32(&src2[i*src_stride2+4]);\
777
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
778
    }\
779
}\
780
\
781
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
782
                                                int src_stride1, int src_stride2, int h){\
783
    int i;\
784
    for(i=0; i<h; i++){\
785
        uint32_t a,b;\
786
        a= LD32(&src1[i*src_stride1  ]);\
787
        b= LD32(&src2[i*src_stride2  ]);\
788
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
789
        a= LD32(&src1[i*src_stride1+4]);\
790
        b= LD32(&src2[i*src_stride2+4]);\
791
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
792
    }\
793
}\
794
\
795
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
796
                                                int src_stride1, int src_stride2, int h){\
797
    int i;\
798
    for(i=0; i<h; i++){\
799
        uint32_t a,b;\
800
        a= LD32(&src1[i*src_stride1  ]);\
801
        b= LD32(&src2[i*src_stride2  ]);\
802
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
803
    }\
804
}\
805
\
806
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
807
                                                int src_stride1, int src_stride2, int h){\
808
    int i;\
809
    for(i=0; i<h; i++){\
810
        uint32_t a,b;\
811
        a= LD16(&src1[i*src_stride1  ]);\
812
        b= LD16(&src2[i*src_stride2  ]);\
813
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
814
    }\
815
}\
816
\
817
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
818
                                                int src_stride1, int src_stride2, int h){\
819
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
820
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
821
}\
822
\
823
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
824
                                                int src_stride1, int src_stride2, int h){\
825
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
826
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
827
}\
828
\
829
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
830
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
831
}\
832
\
833
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
834
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
835
}\
836
\
837
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
838
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
839
}\
840
\
841
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
842
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
843
}\
844
\
845
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
846
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
847
    int i;\
848
    for(i=0; i<h; i++){\
849
        uint32_t a, b, c, d, l0, l1, h0, h1;\
850
        a= LD32(&src1[i*src_stride1]);\
851
        b= LD32(&src2[i*src_stride2]);\
852
        c= LD32(&src3[i*src_stride3]);\
853
        d= LD32(&src4[i*src_stride4]);\
854
        l0=  (a&0x03030303UL)\
855
           + (b&0x03030303UL)\
856
           + 0x02020202UL;\
857
        h0= ((a&0xFCFCFCFCUL)>>2)\
858
          + ((b&0xFCFCFCFCUL)>>2);\
859
        l1=  (c&0x03030303UL)\
860
           + (d&0x03030303UL);\
861
        h1= ((c&0xFCFCFCFCUL)>>2)\
862
          + ((d&0xFCFCFCFCUL)>>2);\
863
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
864
        a= LD32(&src1[i*src_stride1+4]);\
865
        b= LD32(&src2[i*src_stride2+4]);\
866
        c= LD32(&src3[i*src_stride3+4]);\
867
        d= LD32(&src4[i*src_stride4+4]);\
868
        l0=  (a&0x03030303UL)\
869
           + (b&0x03030303UL)\
870
           + 0x02020202UL;\
871
        h0= ((a&0xFCFCFCFCUL)>>2)\
872
          + ((b&0xFCFCFCFCUL)>>2);\
873
        l1=  (c&0x03030303UL)\
874
           + (d&0x03030303UL);\
875
        h1= ((c&0xFCFCFCFCUL)>>2)\
876
          + ((d&0xFCFCFCFCUL)>>2);\
877
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
878
    }\
879
}\
880
\
881
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
882
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
883
}\
884
\
885
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
886
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
887
}\
888
\
889
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
890
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
891
}\
892
\
893
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
894
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
895
}\
896
\
897
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
898
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
899
    int i;\
900
    for(i=0; i<h; i++){\
901
        uint32_t a, b, c, d, l0, l1, h0, h1;\
902
        a= LD32(&src1[i*src_stride1]);\
903
        b= LD32(&src2[i*src_stride2]);\
904
        c= LD32(&src3[i*src_stride3]);\
905
        d= LD32(&src4[i*src_stride4]);\
906
        l0=  (a&0x03030303UL)\
907
           + (b&0x03030303UL)\
908
           + 0x01010101UL;\
909
        h0= ((a&0xFCFCFCFCUL)>>2)\
910
          + ((b&0xFCFCFCFCUL)>>2);\
911
        l1=  (c&0x03030303UL)\
912
           + (d&0x03030303UL);\
913
        h1= ((c&0xFCFCFCFCUL)>>2)\
914
          + ((d&0xFCFCFCFCUL)>>2);\
915
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
916
        a= LD32(&src1[i*src_stride1+4]);\
917
        b= LD32(&src2[i*src_stride2+4]);\
918
        c= LD32(&src3[i*src_stride3+4]);\
919
        d= LD32(&src4[i*src_stride4+4]);\
920
        l0=  (a&0x03030303UL)\
921
           + (b&0x03030303UL)\
922
           + 0x01010101UL;\
923
        h0= ((a&0xFCFCFCFCUL)>>2)\
924
          + ((b&0xFCFCFCFCUL)>>2);\
925
        l1=  (c&0x03030303UL)\
926
           + (d&0x03030303UL);\
927
        h1= ((c&0xFCFCFCFCUL)>>2)\
928
          + ((d&0xFCFCFCFCUL)>>2);\
929
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
930
    }\
931
}\
932
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
933
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
934
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
935
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
936
}\
937
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
938
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
939
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
940
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
941
}\
942
\
943
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
944
{\
945
        int i, a0, b0, a1, b1;\
946
        a0= pixels[0];\
947
        b0= pixels[1] + 2;\
948
        a0 += b0;\
949
        b0 += pixels[2];\
950
\
951
        pixels+=line_size;\
952
        for(i=0; i<h; i+=2){\
953
            a1= pixels[0];\
954
            b1= pixels[1];\
955
            a1 += b1;\
956
            b1 += pixels[2];\
957
\
958
            block[0]= (a1+a0)>>2; /* FIXME non put */\
959
            block[1]= (b1+b0)>>2;\
960
\
961
            pixels+=line_size;\
962
            block +=line_size;\
963
\
964
            a0= pixels[0];\
965
            b0= pixels[1] + 2;\
966
            a0 += b0;\
967
            b0 += pixels[2];\
968
\
969
            block[0]= (a1+a0)>>2;\
970
            block[1]= (b1+b0)>>2;\
971
            pixels+=line_size;\
972
            block +=line_size;\
973
        }\
974
}\
975
\
976
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
977
{\
978
        int i;\
979
        const uint32_t a= LD32(pixels  );\
980
        const uint32_t b= LD32(pixels+1);\
981
        uint32_t l0=  (a&0x03030303UL)\
982
                    + (b&0x03030303UL)\
983
                    + 0x02020202UL;\
984
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
985
                   + ((b&0xFCFCFCFCUL)>>2);\
986
        uint32_t l1,h1;\
987
\
988
        pixels+=line_size;\
989
        for(i=0; i<h; i+=2){\
990
            uint32_t a= LD32(pixels  );\
991
            uint32_t b= LD32(pixels+1);\
992
            l1=  (a&0x03030303UL)\
993
               + (b&0x03030303UL);\
994
            h1= ((a&0xFCFCFCFCUL)>>2)\
995
              + ((b&0xFCFCFCFCUL)>>2);\
996
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
997
            pixels+=line_size;\
998
            block +=line_size;\
999
            a= LD32(pixels  );\
1000
            b= LD32(pixels+1);\
1001
            l0=  (a&0x03030303UL)\
1002
               + (b&0x03030303UL)\
1003
               + 0x02020202UL;\
1004
            h0= ((a&0xFCFCFCFCUL)>>2)\
1005
              + ((b&0xFCFCFCFCUL)>>2);\
1006
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1007
            pixels+=line_size;\
1008
            block +=line_size;\
1009
        }\
1010
}\
1011
\
1012
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1013
{\
1014
    int j;\
1015
    for(j=0; j<2; j++){\
1016
        int i;\
1017
        const uint32_t a= LD32(pixels  );\
1018
        const uint32_t b= LD32(pixels+1);\
1019
        uint32_t l0=  (a&0x03030303UL)\
1020
                    + (b&0x03030303UL)\
1021
                    + 0x02020202UL;\
1022
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1023
                   + ((b&0xFCFCFCFCUL)>>2);\
1024
        uint32_t l1,h1;\
1025
\
1026
        pixels+=line_size;\
1027
        for(i=0; i<h; i+=2){\
1028
            uint32_t a= LD32(pixels  );\
1029
            uint32_t b= LD32(pixels+1);\
1030
            l1=  (a&0x03030303UL)\
1031
               + (b&0x03030303UL);\
1032
            h1= ((a&0xFCFCFCFCUL)>>2)\
1033
              + ((b&0xFCFCFCFCUL)>>2);\
1034
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1035
            pixels+=line_size;\
1036
            block +=line_size;\
1037
            a= LD32(pixels  );\
1038
            b= LD32(pixels+1);\
1039
            l0=  (a&0x03030303UL)\
1040
               + (b&0x03030303UL)\
1041
               + 0x02020202UL;\
1042
            h0= ((a&0xFCFCFCFCUL)>>2)\
1043
              + ((b&0xFCFCFCFCUL)>>2);\
1044
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1045
            pixels+=line_size;\
1046
            block +=line_size;\
1047
        }\
1048
        pixels+=4-line_size*(h+1);\
1049
        block +=4-line_size*h;\
1050
    }\
1051
}\
1052
\
1053
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1054
{\
1055
    int j;\
1056
    for(j=0; j<2; j++){\
1057
        int i;\
1058
        const uint32_t a= LD32(pixels  );\
1059
        const uint32_t b= LD32(pixels+1);\
1060
        uint32_t l0=  (a&0x03030303UL)\
1061
                    + (b&0x03030303UL)\
1062
                    + 0x01010101UL;\
1063
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1064
                   + ((b&0xFCFCFCFCUL)>>2);\
1065
        uint32_t l1,h1;\
1066
\
1067
        pixels+=line_size;\
1068
        for(i=0; i<h; i+=2){\
1069
            uint32_t a= LD32(pixels  );\
1070
            uint32_t b= LD32(pixels+1);\
1071
            l1=  (a&0x03030303UL)\
1072
               + (b&0x03030303UL);\
1073
            h1= ((a&0xFCFCFCFCUL)>>2)\
1074
              + ((b&0xFCFCFCFCUL)>>2);\
1075
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1076
            pixels+=line_size;\
1077
            block +=line_size;\
1078
            a= LD32(pixels  );\
1079
            b= LD32(pixels+1);\
1080
            l0=  (a&0x03030303UL)\
1081
               + (b&0x03030303UL)\
1082
               + 0x01010101UL;\
1083
            h0= ((a&0xFCFCFCFCUL)>>2)\
1084
              + ((b&0xFCFCFCFCUL)>>2);\
1085
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1086
            pixels+=line_size;\
1087
            block +=line_size;\
1088
        }\
1089
        pixels+=4-line_size*(h+1);\
1090
        block +=4-line_size*h;\
1091
    }\
1092
}\
1093
\
1094
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1095
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1096
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1097
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1098
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1099
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1100
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1101
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1102

    
1103
#define op_avg(a, b) a = rnd_avg32(a, b)
1104
#endif
1105
#define op_put(a, b) a = b
1106

    
1107
PIXOP2(avg, op_avg)
1108
PIXOP2(put, op_put)
1109
#undef op_avg
1110
#undef op_put
1111

    
1112
/* Rounding averages of 2 and 4 values.  Arguments are fully parenthesized
 * so arbitrary expressions can be passed without precedence surprises. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/* Convenience wrapper: no-rounding average of two 16xh blocks, forwarding
 * to the PIXOP2-generated helper with one common stride for dst and both
 * sources. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
/* Convenience wrapper: no-rounding average of two 8xh blocks, forwarding
 * to the PIXOP2-generated helper with one common stride for dst and both
 * sources. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
/* One-warp-point GMC: 8-wide bilinear interpolation at a 1/16-pel
 * fractional position (x16, y16 in [0,16]).  The four bilinear weights
 * A..D sum to 256, so the accumulated value is renormalized with >>8
 * after adding 'rounder'.  Reads src rows [0,h] and columns [0,8]. */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
/* Global motion compensation over an 8-wide block of h rows.
 * (ox, oy) is the 16.16 fixed-point start position; (dxx, dyx) are the
 * per-column increments and (dxy, dyy) the per-row increments of the
 * affine transform.  'shift' selects the sub-pel precision (s = 1<<shift
 * positions per pel) and 'r' is the rounding bias applied before the
 * final >>(2*shift).  Source samples whose coordinates fall outside
 * [0,width) x [0,height) are clamped to the edge via clip() (project
 * helper defined elsewhere), degrading to 1-D or nearest interpolation. */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    /* convert to inclusive maximum coordinates for the range checks below */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            /* split 16.16 position into integer pel and sub-pel fraction */
            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: bilinear blend of the 4 neighbours */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* vertically outside: clamp row, horizontal-only blend */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: clamp column, vertical-only blend */
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* outside in both directions: nearest clamped sample */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
/* Integer-position tpel "MC": plain block copy, dispatching on width to the
 * PIXOP2-generated put_pixelsN_c helpers.  Widths other than 2/4/8/16 are
 * silently ignored. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
/* Tpel MC (x=1/3, y=0): dst = round((2*src[x] + src[x+1]) / 3), computed in
 * fixed point as (683*(2a+b+1))>>11 since 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/* Tpel MC (x=2/3, y=0): dst = round((src[x] + 2*src[x+1]) / 3), computed in
 * fixed point as (683*(a+2b+1))>>11 since 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/* Tpel MC (x=0, y=1/3): vertical third-position blend,
 * dst = round((2*src[x] + src[x+stride]) / 3) via the 683/2048 ~= 1/3
 * fixed-point approximation. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/* Tpel MC (x=1/3, y=1/3): 2-D blend of the four neighbours with weights
 * 4:3:3:2 (sum 12), computed as (2731*(...+6))>>15 since 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/* Tpel MC (x=1/3, y=2/3): 2-D blend of the four neighbours with weights
 * 3:2:4:3 (sum 12), computed as (2731*(...+6))>>15 since 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/* Tpel MC (x=0, y=2/3): vertical third-position blend,
 * dst = round((src[x] + 2*src[x+stride]) / 3) via the 683/2048 ~= 1/3
 * fixed-point approximation. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/* Tpel MC (x=2/3, y=1/3): 2-D blend of the four neighbours with weights
 * 3:4:2:3 (sum 12), computed as (2731*(...+6))>>15 since 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/* Tpel MC (x=2/3, y=2/3): 2-D blend of the four neighbours with weights
 * 2:3:3:4 (sum 12), computed as (2731*(...+6))>>15 since 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/* Integer-position tpel averaging "MC": averages src into dst, dispatching
 * on width to the PIXOP2-generated avg_pixelsN_c helpers.  Widths other
 * than 2/4/8/16 are silently ignored. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}
/* Tpel averaging MC (x=1/3, y=0): compute the same 1/3-position prediction
 * as put_tpel_pixels_mc10_c and average it with the existing dst value,
 * rounding up. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Tpel averaging MC (x=2/3, y=0): compute the same 2/3-position prediction
 * as put_tpel_pixels_mc20_c and average it with the existing dst value,
 * rounding up. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Tpel averaging MC (x=0, y=1/3): compute the same vertical 1/3-position
 * prediction as put_tpel_pixels_mc01_c and average it with the existing
 * dst value, rounding up. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Tpel averaging MC (x=1/3, y=1/3): compute the same 4:3:3:2-weighted
 * prediction as put_tpel_pixels_mc11_c and average it with the existing
 * dst value, rounding up. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Tpel averaging MC (x=1/3, y=2/3): compute the same 3:2:4:3-weighted
 * prediction as put_tpel_pixels_mc12_c and average it with the existing
 * dst value, rounding up. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Tpel averaging MC (x=0, y=2/3): compute the same vertical 2/3-position
 * prediction as put_tpel_pixels_mc02_c and average it with the existing
 * dst value, rounding up. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Tpel averaging MC (x=2/3, y=1/3): compute the same 3:4:2:3-weighted
 * prediction as put_tpel_pixels_mc21_c and average it with the existing
 * dst value, rounding up. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Tpel averaging MC (x=2/3, y=2/3): compute the same 2:3:3:4-weighted
 * prediction as put_tpel_pixels_mc22_c and average it with the existing
 * dst value, rounding up. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
#if 0
/* Dead code: would wrap the variable-width tpel helpers above into
 * fixed-width entry points.  Note the stray "void" before each forwarded
 * call — this block never compiled and is kept disabled, as in the
 * original. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
/* H.264-style chroma MC for widths 2/4/8: bilinear interpolation at a
 * 1/8-pel fractional position (x, y in [0,8)).  The four weights A..D sum
 * to 64; OP consumes the raw weighted sum and performs the >>6
 * renormalization (op_put) or renormalize-and-average (op_avg) below. */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}

/* >>6 renormalizes the 6-bit fixed-point sum; op_avg additionally averages
 * with the existing dst value, rounding up. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
/* Copy a 4-byte-wide block of h rows from src to dst, using the project's
 * LD32/ST32 unaligned 32-bit load/store macros. */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
}
/* Copy an 8-byte-wide block of h rows from src to dst, using the project's
 * LD32/ST32 unaligned 32-bit load/store macros. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst+=dstStride;
        src+=srcStride;
    }
}
/* Copy a 16-byte-wide block of h rows from src to dst, using the project's
 * LD32/ST32 unaligned 32-bit load/store macros. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst+=dstStride;
        src+=srcStride;
    }
}
/* Copy a 17-byte-wide block of h rows (16+1 edge column, as needed by the
 * 16-wide qpel filters) from src to dst: four 32-bit stores plus the odd
 * trailing byte. */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst[16]= src[16];
        dst+=dstStride;
        src+=srcStride;
    }
}
/* Copy a 9-byte-wide block of h rows (8+1 edge column, as needed by the
 * 8-wide qpel filters) from src to dst: two 32-bit stores plus the odd
 * trailing byte. */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst[8]= src[8];
        dst+=dstStride;
        src+=srcStride;
    }
}
#define QPEL_MC(r, OPNAME, RND, OP) \
1557
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1558
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1559
    int i;\
1560
    for(i=0; i<h; i++)\
1561
    {\
1562
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1563
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1564
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1565
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1566
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1567
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1568
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1569
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1570
        dst+=dstStride;\
1571
        src+=srcStride;\
1572
    }\
1573
}\
1574
\
1575
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1576
    const int w=8;\
1577
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1578
    int i;\
1579
    for(i=0; i<w; i++)\
1580
    {\
1581
        const int src0= src[0*srcStride];\
1582
        const int src1= src[1*srcStride];\
1583
        const int src2= src[2*srcStride];\
1584
        const int src3= src[3*srcStride];\
1585
        const int src4= src[4*srcStride];\
1586
        const int src5= src[5*srcStride];\
1587
        const int src6= src[6*srcStride];\
1588
        const int src7= src[7*srcStride];\
1589
        const int src8= src[8*srcStride];\
1590
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1591
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1592
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1593
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1594
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1595
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1596
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1597
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1598
        dst++;\
1599
        src++;\
1600
    }\
1601
}\
1602
\
1603
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1604
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1605
    int i;\
1606
    \
1607
    for(i=0; i<h; i++)\
1608
    {\
1609
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1610
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1611
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1612
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1613
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1614
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1615
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1616
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1617
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1618
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1619
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1620
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1621
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1622
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1623
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1624
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1625
        dst+=dstStride;\
1626
        src+=srcStride;\
1627
    }\
1628
}\
1629
\
1630
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1631
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1632
    int i;\
1633
    const int w=16;\
1634
    for(i=0; i<w; i++)\
1635
    {\
1636
        const int src0= src[0*srcStride];\
1637
        const int src1= src[1*srcStride];\
1638
        const int src2= src[2*srcStride];\
1639
        const int src3= src[3*srcStride];\
1640
        const int src4= src[4*srcStride];\
1641
        const int src5= src[5*srcStride];\
1642
        const int src6= src[6*srcStride];\
1643
        const int src7= src[7*srcStride];\
1644
        const int src8= src[8*srcStride];\
1645
        const int src9= src[9*srcStride];\
1646
        const int src10= src[10*srcStride];\
1647
        const int src11= src[11*srcStride];\
1648
        const int src12= src[12*srcStride];\
1649
        const int src13= src[13*srcStride];\
1650
        const int src14= src[14*srcStride];\
1651
        const int src15= src[15*srcStride];\
1652
        const int src16= src[16*srcStride];\
1653
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1654
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1655
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1656
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1657
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1658
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1659
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1660
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1661
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1662
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1663
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1664
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1665
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1666
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1667
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1668
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1669
        dst++;\
1670
        src++;\
1671
    }\
1672
}\
1673
\
1674
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1675
    OPNAME ## pixels8_c(dst, src, stride, 8);\
1676
}\
1677
\
1678
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1679
    uint8_t half[64];\
1680
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1681
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1682
}\
1683
\
1684
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1685
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1686
}\
1687
\
1688
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1689
    uint8_t half[64];\
1690
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1691
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1692
}\
1693
\
1694
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1695
    uint8_t full[16*9];\
1696
    uint8_t half[64];\
1697
    copy_block9(full, src, 16, stride, 9);\
1698
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1699
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1700
}\
1701
\
1702
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1703
    uint8_t full[16*9];\
1704
    copy_block9(full, src, 16, stride, 9);\
1705
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1706
}\
1707
\
1708
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1709
    uint8_t full[16*9];\
1710
    uint8_t half[64];\
1711
    copy_block9(full, src, 16, stride, 9);\
1712
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1713
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1714
}\
1715
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1716
    uint8_t full[16*9];\
1717
    uint8_t halfH[72];\
1718
    uint8_t halfV[64];\
1719
    uint8_t halfHV[64];\
1720
    copy_block9(full, src, 16, stride, 9);\
1721
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1722
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1723
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1724
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1725
}\
1726
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1727
    uint8_t full[16*9];\
1728
    uint8_t halfH[72];\
1729
    uint8_t halfHV[64];\
1730
    copy_block9(full, src, 16, stride, 9);\
1731
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1732
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1733
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1734
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1735
}\
1736
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1737
    uint8_t full[16*9];\
1738
    uint8_t halfH[72];\
1739
    uint8_t halfV[64];\
1740
    uint8_t halfHV[64];\
1741
    copy_block9(full, src, 16, stride, 9);\
1742
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1743
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1744
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1745
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1746
}\
1747
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1748
    uint8_t full[16*9];\
1749
    uint8_t halfH[72];\
1750
    uint8_t halfHV[64];\
1751
    copy_block9(full, src, 16, stride, 9);\
1752
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1753
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1754
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1755
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1756
}\
1757
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1758
    uint8_t full[16*9];\
1759
    uint8_t halfH[72];\
1760
    uint8_t halfV[64];\
1761
    uint8_t halfHV[64];\
1762
    copy_block9(full, src, 16, stride, 9);\
1763
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1764
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1765
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1766
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1767
}\
1768
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1769
    uint8_t full[16*9];\
1770
    uint8_t halfH[72];\
1771
    uint8_t halfHV[64];\
1772
    copy_block9(full, src, 16, stride, 9);\
1773
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1774
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1775
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1776
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1777
}\
1778
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1779
    uint8_t full[16*9];\
1780
    uint8_t halfH[72];\
1781
    uint8_t halfV[64];\
1782
    uint8_t halfHV[64];\
1783
    copy_block9(full, src, 16, stride, 9);\
1784
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1785
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1786
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1787
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1788
}\
1789
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1790
    uint8_t full[16*9];\
1791
    uint8_t halfH[72];\
1792
    uint8_t halfHV[64];\
1793
    copy_block9(full, src, 16, stride, 9);\
1794
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1795
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1796
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1797
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1798
}\
1799
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1800
    uint8_t halfH[72];\
1801
    uint8_t halfHV[64];\
1802
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1803
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1804
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1805
}\
1806
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1807
    uint8_t halfH[72];\
1808
    uint8_t halfHV[64];\
1809
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1810
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1811
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1812
}\
1813
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1814
    uint8_t full[16*9];\
1815
    uint8_t halfH[72];\
1816
    uint8_t halfV[64];\
1817
    uint8_t halfHV[64];\
1818
    copy_block9(full, src, 16, stride, 9);\
1819
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1820
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1821
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1822
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1823
}\
1824
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1825
    uint8_t full[16*9];\
1826
    uint8_t halfH[72];\
1827
    copy_block9(full, src, 16, stride, 9);\
1828
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1829
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1830
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1831
}\
1832
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1833
    uint8_t full[16*9];\
1834
    uint8_t halfH[72];\
1835
    uint8_t halfV[64];\
1836
    uint8_t halfHV[64];\
1837
    copy_block9(full, src, 16, stride, 9);\
1838
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1839
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1840
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1841
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1842
}\
1843
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1844
    uint8_t full[16*9];\
1845
    uint8_t halfH[72];\
1846
    copy_block9(full, src, 16, stride, 9);\
1847
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1848
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1849
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1850
}\
1851
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1852
    uint8_t halfH[72];\
1853
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1854
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1855
}\
1856
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1857
    OPNAME ## pixels16_c(dst, src, stride, 16);\
1858
}\
1859
\
1860
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1861
    uint8_t half[256];\
1862
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1863
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1864
}\
1865
\
1866
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1867
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1868
}\
1869
\
1870
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1871
    uint8_t half[256];\
1872
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1873
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1874
}\
1875
\
1876
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1877
    uint8_t full[24*17];\
1878
    uint8_t half[256];\
1879
    copy_block17(full, src, 24, stride, 17);\
1880
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1881
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1882
}\
1883
\
1884
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1885
    uint8_t full[24*17];\
1886
    copy_block17(full, src, 24, stride, 17);\
1887
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1888
}\
1889
\
1890
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1891
    uint8_t full[24*17];\
1892
    uint8_t half[256];\
1893
    copy_block17(full, src, 24, stride, 17);\
1894
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1895
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1896
}\
1897
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1898
    uint8_t full[24*17];\
1899
    uint8_t halfH[272];\
1900
    uint8_t halfV[256];\
1901
    uint8_t halfHV[256];\
1902
    copy_block17(full, src, 24, stride, 17);\
1903
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1904
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1905
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1906
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1907
}\
1908
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1909
    uint8_t full[24*17];\
1910
    uint8_t halfH[272];\
1911
    uint8_t halfHV[256];\
1912
    copy_block17(full, src, 24, stride, 17);\
1913
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1914
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1915
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1916
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1917
}\
1918
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1919
    uint8_t full[24*17];\
1920
    uint8_t halfH[272];\
1921
    uint8_t halfV[256];\
1922
    uint8_t halfHV[256];\
1923
    copy_block17(full, src, 24, stride, 17);\
1924
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1925
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1926
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1927
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1928
}\
1929
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1930
    uint8_t full[24*17];\
1931
    uint8_t halfH[272];\
1932
    uint8_t halfHV[256];\
1933
    copy_block17(full, src, 24, stride, 17);\
1934
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1935
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1936
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1937
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1938
}\
1939
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1940
    uint8_t full[24*17];\
1941
    uint8_t halfH[272];\
1942
    uint8_t halfV[256];\
1943
    uint8_t halfHV[256];\
1944
    copy_block17(full, src, 24, stride, 17);\
1945
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1946
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1947
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1948
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1949
}\
1950
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1951
    uint8_t full[24*17];\
1952
    uint8_t halfH[272];\
1953
    uint8_t halfHV[256];\
1954
    copy_block17(full, src, 24, stride, 17);\
1955
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1956
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1957
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1958
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1959
}\
1960
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1961
    uint8_t full[24*17];\
1962
    uint8_t halfH[272];\
1963
    uint8_t halfV[256];\
1964
    uint8_t halfHV[256];\
1965
    copy_block17(full, src, 24, stride, 17);\
1966
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1967
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1968
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1969
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1970
}\
1971
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1972
    uint8_t full[24*17];\
1973
    uint8_t halfH[272];\
1974
    uint8_t halfHV[256];\
1975
    copy_block17(full, src, 24, stride, 17);\
1976
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1977
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1978
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1979
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1980
}\
1981
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1982
    uint8_t halfH[272];\
1983
    uint8_t halfHV[256];\
1984
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1985
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1986
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1987
}\
1988
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1989
    uint8_t halfH[272];\
1990
    uint8_t halfHV[256];\
1991
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1992
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1993
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1994
}\
1995
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1996
    uint8_t full[24*17];\
1997
    uint8_t halfH[272];\
1998
    uint8_t halfV[256];\
1999
    uint8_t halfHV[256];\
2000
    copy_block17(full, src, 24, stride, 17);\
2001
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2002
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2003
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2004
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2005
}\
2006
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2007
    uint8_t full[24*17];\
2008
    uint8_t halfH[272];\
2009
    copy_block17(full, src, 24, stride, 17);\
2010
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2011
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2012
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2013
}\
2014
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2015
    uint8_t full[24*17];\
2016
    uint8_t halfH[272];\
2017
    uint8_t halfV[256];\
2018
    uint8_t halfHV[256];\
2019
    copy_block17(full, src, 24, stride, 17);\
2020
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2021
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2022
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2023
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2024
}\
2025
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2026
    uint8_t full[24*17];\
2027
    uint8_t halfH[272];\
2028
    copy_block17(full, src, 24, stride, 17);\
2029
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2030
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2031
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2032
}\
2033
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2034
    uint8_t halfH[272];\
2035
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2036
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2037
}
2038

    
2039
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2040
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2041
#define op_put(a, b) a = cm[((b) + 16)>>5]
2042
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2043

    
2044
QPEL_MC(0, put_       , _       , op_put)
2045
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2046
QPEL_MC(0, avg_       , _       , op_avg)
2047
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
2048
#undef op_avg
2049
#undef op_avg_no_rnd
2050
#undef op_put
2051
#undef op_put_no_rnd
2052

    
2053
#if 1
2054
/* H.264 half-pel interpolation: 6-tap (1,-5,20,20,-5,1) lowpass filters for
 * 4-, 8- and 16-wide blocks, in h, v and combined hv forms. OP/OP2 are the
 * store operators supplied at instantiation (OP2 for the hv path, whose
 * intermediate sums are kept unclipped in the int16_t tmp buffer). */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

/**
 * Instantiates the 16 H.264 quarter-pel motion-compensation functions
 * OPNAME##h264_qpel##SIZE##_mcXY_c, one per quarter-pel position (X,Y),
 * for block size SIZE and output operation prefix OPNAME (put_/avg_).
 * Half-pel planes come from the _h/_v/_hv lowpass kernels; quarter-pel
 * positions average two planes with pixels##SIZE##_l2. "full" buffers hold
 * SIZE+5 rows so the 6-tap vertical filter has 2 rows above / 3 below.
 * NOTE: only block comments are used inside the macro — a // comment on a
 * backslash-continued line would swallow the rest of the macro.
 */
#define H264_MC(OPNAME, SIZE) \
/* (0,0): plain copy */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
/* (1,0): average of integer sample and horizontal half-pel */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
/* (2,0): horizontal half-pel */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
/* (3,0): average of right integer sample and horizontal half-pel */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
/* (0,1): average of integer sample and vertical half-pel */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
/* (0,2): vertical half-pel */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
/* (0,3): average of lower integer sample and vertical half-pel */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
/* diagonal quarter-pels: average of an H half-pel and a V half-pel plane */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
/* (2,2): 2-D half-pel via the combined hv kernel (16-bit intermediate) */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
/* remaining positions: average of a half-pel plane and the hv plane */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\


    
2393
/* Output operators plugged into H264_LOWPASS below.
 * op_put/op_avg consume single-pass intermediates (rounded by (b+16)>>5);
 * op2_put/op2_avg consume two-pass hv intermediates (rounded by (b+512)>>10).
 * 'cm' is the 0..255 clamping table in scope at each expansion site. */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

    
2399
/* Instantiate the qpel lowpass kernels and all 16 MC functions for each
 * block size, in both "put" (store) and "avg" (average with dst) flavours. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

    
2408
/* The output-operator macros are local to the instantiations above. */
#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif

    
2414
/* Scale-and-clamp operators for H.264 (bi)weighted prediction. */
#define op_scale1(x)  block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
/**
 * Instantiates weight_/biweight_h264_pixels##W##x##H##_c.
 * The fully-unrolled row body handles the widest case; narrower widths
 * bail out early via "if(W==N) continue" (W is a compile-time constant,
 * so the dead tail is removed by the compiler).
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int attribute_unused x, y; \
    /* pre-scale the offset and fold in the rounding term */ \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \
    int attribute_unused x, y; \
    /* combined offset of both references, plus rounding for the >>(log2_denom+1) */ \
    int offset = (offsets + offsetd + 1) >> 1; \
    offset = ((offset << 1) + 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

    
2470
/* Instantiate (bi)weighted prediction for every H.264 partition size. */
H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

    
2481
/* The weighting helper macros are not needed past this point. */
#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT

    
2485
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2486
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
2487
    int i;
2488

    
2489
    for(i=0; i<h; i++){
2490
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2491
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2492
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2493
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2494
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2495
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2496
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2497
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2498
        dst+=dstStride;
2499
        src+=srcStride;
2500
    }
2501
}
2502

    
2503
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2504
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
2505
    int i;
2506

    
2507
    for(i=0; i<w; i++){
2508
        const int src_1= src[ -srcStride];
2509
        const int src0 = src[0          ];
2510
        const int src1 = src[  srcStride];
2511
        const int src2 = src[2*srcStride];
2512
        const int src3 = src[3*srcStride];
2513
        const int src4 = src[4*srcStride];
2514
        const int src5 = src[5*srcStride];
2515
        const int src6 = src[6*srcStride];
2516
        const int src7 = src[7*srcStride];
2517
        const int src8 = src[8*srcStride];
2518
        const int src9 = src[9*srcStride];
2519
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2520
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2521
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2522
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2523
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2524
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2525
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2526
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2527
        src++;
2528
        dst++;
2529
    }
2530
}
2531

    
2532
/** mspel MC, integer position: plain 8x8 block copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2535

    
2536
/** mspel MC (1,0): average of the integer samples and the horizontal half-pel plane. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
2541

    
2542
/** mspel MC (2,0): horizontal half-pel, filtered straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2545

    
2546
/** mspel MC (3,0): average of the right-hand integer samples (src+1) and the horizontal half-pel plane. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
2551

    
2552
/** mspel MC (0,2): vertical half-pel, filtered straight into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2555

    
2556
/** mspel MC (1,2): average of the vertical half-pel plane and the HV plane. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];      /* 8x11: one extra row above, two below, for the vertical pass */
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);    /* halfH+8 skips the extra top row */
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
2565
/** mspel MC (3,2): like mc12 but the vertical plane is taken one pixel to the right (src+1). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];      /* 8x11 horizontally filtered area */
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
2574
/** mspel MC (2,2): 2-D half-pel — horizontal pass into halfH, vertical pass into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];      /* 8x11 horizontally filtered area */
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2579

    
2580
/**
 * In-place H.263 deblocking across a horizontal block edge.
 * src points at the first row below the edge; the two rows above and the
 * two rows below it (p0 p1 | p2 p3 per column) are filtered.  Filter
 * strength is looked up from the quantizer.
 */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;   /* gradient estimate across the edge */

        /* piecewise-linear ramp: full correction for small |d|,
           tapering back to 0 once |d| reaches 2*strength */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* branchless out-of-range fixup: negatives become 0, overflows
           become ~0 (i.e. 255 after the uint8_t store) */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= ABS(d1)>>1;

        /* weaker secondary correction of the outer pixels,
           clipped to half the inner correction */
        d2= clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
}
2614

    
2615
/**
 * In-place H.263 deblocking across a vertical block edge.
 * Same filter as h263_v_loop_filter_c but applied along each row:
 * src points at the first column right of the edge, and the pixels
 * p0 p1 | p2 p3 are the two columns either side of it.
 */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;   /* gradient estimate across the edge */

        /* piecewise-linear ramp, tapering to 0 beyond 2*strength */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* branchless out-of-range fixup: negatives -> 0, overflows -> 255 */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= ABS(d1)>>1;

        /* weaker secondary correction of the outer pixels */
        d2= clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
}
2649

    
2650
/**
 * In-place H.261 loop filter of an 8x8 block: a separable [1 2 1]/4
 * smoothing filter, applied vertically into tmp[] first, then
 * horizontally back into src.  Border rows/columns are passed through
 * (scaled so the final rounding shift leaves them unchanged).
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int tmp[64];
    int row, col;

    /* vertical pass: interior rows get [1 2 1], border rows 4*src so
       they survive the later >>2 untouched */
    for(col=0; col<8; col++){
        tmp[col]      = 4*src[col];
        tmp[56 + col] = 4*src[col + 7*stride];
    }
    for(row=1; row<7; row++){
        for(col=0; col<8; col++){
            const int s = row*stride + col;
            tmp[row*8 + col] = src[s - stride] + 2*src[s] + src[s + stride];
        }
    }

    /* horizontal pass: border columns rounded by >>2, interior by >>4
       (vertical and horizontal kernels combined) */
    for(row=0; row<8; row++){
        src[row*stride]     = (tmp[row*8]     + 2) >> 2;
        src[row*stride + 7] = (tmp[row*8 + 7] + 2) >> 2;
        for(col=1; col<7; col++){
            const int t = row*8 + col;
            src[row*stride + col] = (tmp[t-1] + 2*tmp[t] + tmp[t+1] + 8) >> 4;
        }
    }
}
2676

    
2677
/**
 * In-place H.264 luma deblocking of one 16-sample edge (normal, non-intra
 * strength).  xstride steps across the edge, ystride along it.  tc0 holds
 * one clipping threshold per group of 4 edge samples; a negative entry
 * skips that group entirely.
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            /* samples across the edge: p2 p1 p0 | q0 q1 q2 */
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            /* filter only genuine blocking artifacts, not real edges */
            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                /* optionally adjust p1/q1; each adjustment widens the
                   clip range for the p0/q0 delta by one */
                if( ABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( ABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
2717
/** Luma deblocking of a horizontal edge: step across with stride, along with 1. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
2721
/** Luma deblocking of a vertical edge: step across with 1, along with stride. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
2725

    
2726
/**
 * In-place H.264 chroma deblocking of one 8-sample edge (normal strength).
 * Like the luma filter but only p1 p0 | q0 q1 are examined, only p0/q0
 * are modified, and each tc0 entry covers a group of 2 samples
 * (entries <= 0 skip their group).
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                int delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
2754
/** Chroma deblocking of a horizontal edge. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
2758
/** Chroma deblocking of a vertical edge. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
2762

    
2763
/**
 * In-place H.264 chroma deblocking at intra (strongest) strength:
 * no tc clipping — p0/q0 are replaced outright by a [2 1 1]/4 average
 * whenever the edge qualifies.
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( ABS( p0 - q0 ) < alpha &&
            ABS( p1 - p0 ) < beta &&
            ABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
2782
/** Intra-strength chroma deblocking of a horizontal edge. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
2786
/** Intra-strength chroma deblocking of a vertical edge. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
2790

    
2791
/**
 * Sum of absolute differences between two 16-wide, h-tall pixel blocks.
 * The leading void* matches the me_cmp function-pointer signature and is
 * unused here.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<16; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2818

    
2819
/**
 * SAD of a 16-wide block against the horizontal half-pel interpolation of
 * pix2 (average of each pixel with its right neighbour).  Reads 17 columns
 * of pix2 per row.
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2846

    
2847
/**
 * SAD of a 16-wide block against the vertical half-pel interpolation of
 * pix2 (average of each pixel with the one directly below).  Reads h+1
 * rows of pix2.
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2876

    
2877
/**
 * SAD of a 16-wide block against the 2-D half-pel interpolation of pix2
 * (4-pixel average of the 2x2 neighbourhood).  Reads 17 columns and h+1
 * rows of pix2.
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<16; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2906

    
2907
/**
 * Sum of absolute differences between two 8-wide, h-tall pixel blocks.
 * The leading void* matches the me_cmp function-pointer signature and is
 * unused here.
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<8; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2926

    
2927
/**
 * SAD of an 8-wide block against the horizontal half-pel interpolation of
 * pix2.  Reads 9 columns of pix2 per row.
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2946

    
2947
/**
 * SAD of an 8-wide block against the vertical half-pel interpolation of
 * pix2.  Reads h+1 rows of pix2.
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2968

    
2969
/**
 * SAD of an 8-wide block against the 2-D half-pel interpolation of pix2
 * (2x2 neighbourhood average).  Reads 9 columns and h+1 rows of pix2.
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<8; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2990

    
2991
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2992
    MpegEncContext *c = v;
2993
    int score1=0;
2994
    int score2=0;
2995
    int x,y;
2996

    
2997
    for(y=0; y<h; y++){
2998
        for(x=0; x<16; x++){
2999
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3000
        }
3001
        if(y+1<h){
3002
            for(x=0; x<15; x++){
3003
                score2+= ABS(  s1[x  ] - s1[x  +stride]
3004
                             - s1[x+1] + s1[x+1+stride])
3005
                        -ABS(  s2[x  ] - s2[x  +stride]
3006
                             - s2[x+1] + s2[x+1+stride]);
3007
            }
3008
        }
3009
        s1+= stride;
3010
        s2+= stride;
3011
    }
3012

    
3013
    if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3014
    else  return score1 + ABS(score2)*8;
3015
}
3016

    
3017
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3018
    MpegEncContext *c = v;
3019
    int score1=0;
3020
    int score2=0;
3021
    int x,y;
3022

    
3023
    for(y=0; y<h; y++){
3024
        for(x=0; x<8; x++){
3025
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3026
        }
3027
        if(y+1<h){
3028
            for(x=0; x<7; x++){
3029
                score2+= ABS(  s1[x  ] - s1[x  +stride]
3030
                             - s1[x+1] + s1[x+1+stride])
3031
                        -ABS(  s2[x  ] - s2[x  +stride]
3032
                             - s2[x+1] + s2[x+1+stride]);
3033
            }
3034
        }
3035
        s1+= stride;
3036
        s2+= stride;
3037
    }
3038

    
3039
    if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3040
    else  return score1 + ABS(score2)*8;
3041
}
3042

    
3043
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3044
    int i;
3045
    unsigned int sum=0;
3046

    
3047
    for(i=0; i<8*8; i++){
3048
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3049
        int w= weight[i];
3050
        b>>= RECON_SHIFT;
3051
        assert(-512<b && b<512);
3052

    
3053
        sum += (w*b)*(w*b)>>4;
3054
    }
3055
    return sum>>2;
3056
}
3057

    
3058
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3059
    int i;
3060

    
3061
    for(i=0; i<8*8; i++){
3062
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3063
    }
3064
}
3065

    
3066
/**
3067
 * permutes an 8x8 block.
3068
 * @param block the block which will be permuted according to the given permutation vector
3069
 * @param permutation the permutation vector
3070
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3071
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3072
 *                  (inverse) permutated to scantable order!
3073
 */
3074
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3075
{
3076
    int i;
3077
    DCTELEM temp[64];
3078

    
3079
    if(last<=0) return;
3080
    //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
3081

    
3082
    for(i=0; i<=last; i++){
3083
        const int j= scantable[i];
3084
        temp[j]= block[j];
3085
        block[j]=0;
3086
    }
3087

    
3088
    for(i=0; i<=last; i++){
3089
        const int j= scantable[i];
3090
        const int perm_j= permutation[j];
3091
        block[perm_j]= temp[j];
3092
    }
3093
}
3094

    
3095
/* Dummy comparison function for FF_CMP_ZERO: every candidate scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
3098

    
3099
/**
 * Fill a 5-entry comparison-function table from the DSPContext according
 * to the FF_CMP_* type in the low byte of @p type.
 *
 * @param c    DSP context holding the candidate implementations
 * @param cmp  destination table (5 block sizes)
 * @param type FF_CMP_* selector (only the low 8 bits are used)
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    /* use the element size, not sizeof(void*): function pointers need not
       have the same size as object pointers */
    memset(cmp, 0, sizeof(cmp[0])*5);

    for (i = 0; i < 5; i++) {
        switch (type & 0xFF) {
        case FF_CMP_SAD:
            cmp[i] = c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i] = c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i] = c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i] = c->dct_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i] = c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i] = c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i] = c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i] = c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i] = c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i] = c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i] = zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i] = c->nsse[i];
            break;
        case FF_CMP_W53:
            cmp[i] = c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i] = c->w97[i];
            break;
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
3153

    
3154
/**
3155
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3156
 */
3157
static void clear_blocks_c(DCTELEM *blocks)
3158
{
3159
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
3160
}
3161

    
3162
/**
 * dst[i] += src[i] for i in [0, w); byte arithmetic wraps mod 256.
 * The main loop is unrolled by 8 for speed; a scalar tail handles the rest.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i = 0;
    for (; i + 7 < w; i += 8) {
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    for (; i < w; i++)
        dst[i] += src[i];
}
3177

    
3178
/**
 * dst[i] = src1[i] - src2[i] for i in [0, w); byte arithmetic wraps mod 256.
 * The main loop is unrolled by 8 for speed; a scalar tail handles the rest.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i = 0;
    for (; i + 7 < w; i += 8) {
        dst[i+0] = src1[i+0]-src2[i+0];
        dst[i+1] = src1[i+1]-src2[i+1];
        dst[i+2] = src1[i+2]-src2[i+2];
        dst[i+3] = src1[i+3]-src2[i+3];
        dst[i+4] = src1[i+4]-src2[i+4];
        dst[i+5] = src1[i+5]-src2[i+5];
        dst[i+6] = src1[i+6]-src2[i+6];
        dst[i+7] = src1[i+7]-src2[i+7];
    }
    for (; i < w; i++)
        dst[i] = src1[i]-src2[i];
}
3193

    
3194
/**
 * HuffYUV median prediction: for each byte, predict src2[i] from its left
 * neighbour (l), the value above (src1[i]) and the top-left value (lt)
 * using mid_pred(), and store the residual in dst.
 *
 * @param left     in/out: left-neighbour state carried across calls
 * @param left_top in/out: top-left-neighbour state carried across calls
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l  = *left;
    lt = *left_top;

    for (i = 0; i < w; i++) {
        /* median of left, top and the gradient estimate (l + top - lt) */
        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt = src1[i];      /* current top becomes next top-left */
        l  = src2[i];      /* current pixel becomes next left */
        dst[i] = l - pred; /* store the prediction residual */
    }

    *left     = l;
    *left_top = lt;
}
3211

    
3212
/* Butterfly writing sum/difference of i1,i2 into separate outputs o1,o2. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place butterfly: x <- x+y, y <- x-y (uses temporaries, so x and y
   may be arbitrary lvalue expressions). */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Sum of absolute values of a butterfly's outputs, without storing them. */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
3226

    
3227
/**
 * SATD of an 8x8 block: 2-D Hadamard transform of the pixel-wise
 * difference src - dst, then sum of absolute transform coefficients.
 *
 * @param s      unused context pointer (me_cmp_func signature)
 * @param dst    reference block
 * @param src    current block
 * @param stride line stride in bytes
 * @param h      must be 8
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum = 0;

    assert(h==8);

    /* horizontal 8-point Hadamard transform of each difference row */
    for (i = 0; i < 8; i++) {
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical transform of each column, accumulating |coefficients|
       via BUTTERFLYA on the final stage */
    for (i = 0; i < 8; i++) {
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}
3278

    
3279
/**
 * Intra SATD of an 8x8 block: 2-D Hadamard transform of the pixels
 * themselves, sum of absolute coefficients, minus the DC term so the
 * block mean does not dominate the score.
 *
 * @param s      unused context pointer (me_cmp_func signature)
 * @param src    block to score
 * @param dummy  unused (me_cmp_func signature)
 * @param stride line stride in bytes
 * @param h      must be 8
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum = 0;

    assert(h==8);

    /* horizontal 8-point Hadamard transform of each source row */
    for (i = 0; i < 8; i++) {
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical transform of each column, accumulating |coefficients| */
    for (i = 0; i < 8; i++) {
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= ABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3326

    
3327
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3328
    MpegEncContext * const s= (MpegEncContext *)c;
3329
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3330
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3331
    int sum=0, i;
3332

    
3333
    assert(h==8);
3334

    
3335
    s->dsp.diff_pixels(temp, src1, src2, stride);
3336
    s->dsp.fdct(temp);
3337

    
3338
    for(i=0; i<64; i++)
3339
        sum+= ABS(temp[i]);
3340

    
3341
    return sum;
3342
}
3343

    
3344
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3345
    MpegEncContext * const s= (MpegEncContext *)c;
3346
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3347
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3348
    int sum=0, i;
3349

    
3350
    assert(h==8);
3351

    
3352
    s->dsp.diff_pixels(temp, src1, src2, stride);
3353
    s->dsp.fdct(temp);
3354

    
3355
    for(i=0; i<64; i++)
3356
        sum= FFMAX(sum, ABS(temp[i]));
3357

    
3358
    return sum;
3359
}
3360

    
3361
void simple_idct(DCTELEM *block); //FIXME
3362

    
3363
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3364
    MpegEncContext * const s= (MpegEncContext *)c;
3365
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
3366
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3367
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3368
    int sum=0, i;
3369

    
3370
    assert(h==8);
3371
    s->mb_intra=0;
3372

    
3373
    s->dsp.diff_pixels(temp, src1, src2, stride);
3374

    
3375
    memcpy(bak, temp, 64*sizeof(DCTELEM));
3376

    
3377
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3378
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
3379
    simple_idct(temp); //FIXME
3380

    
3381
    for(i=0; i<64; i++)
3382
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3383

    
3384
    return sum;
3385
}
3386

    
3387
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3388
    MpegEncContext * const s= (MpegEncContext *)c;
3389
    const uint8_t *scantable= s->intra_scantable.permutated;
3390
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3391
    uint64_t __align8 aligned_bak[stride];
3392
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3393
    uint8_t * const bak= (uint8_t*)aligned_bak;
3394
    int i, last, run, bits, level, distoration, start_i;
3395
    const int esc_length= s->ac_esc_length;
3396
    uint8_t * length;
3397
    uint8_t * last_length;
3398

    
3399
    assert(h==8);
3400

    
3401
    for(i=0; i<8; i++){
3402
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3403
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3404
    }
3405

    
3406
    s->dsp.diff_pixels(temp, src1, src2, stride);
3407

    
3408
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3409

    
3410
    bits=0;
3411

    
3412
    if (s->mb_intra) {
3413
        start_i = 1;
3414
        length     = s->intra_ac_vlc_length;
3415
        last_length= s->intra_ac_vlc_last_length;
3416
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3417
    } else {
3418
        start_i = 0;
3419
        length     = s->inter_ac_vlc_length;
3420
        last_length= s->inter_ac_vlc_last_length;
3421
    }
3422

    
3423
    if(last>=start_i){
3424
        run=0;
3425
        for(i=start_i; i<last; i++){
3426
            int j= scantable[i];
3427
            level= temp[j];
3428

    
3429
            if(level){
3430
                level+=64;
3431
                if((level&(~127)) == 0){
3432
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
3433
                }else
3434
                    bits+= esc_length;
3435
                run=0;
3436
            }else
3437
                run++;
3438
        }
3439
        i= scantable[last];
3440

    
3441
        level= temp[i] + 64;
3442

    
3443
        assert(level - 64);
3444

    
3445
        if((level&(~127)) == 0){
3446
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3447
        }else
3448
            bits+= esc_length;
3449

    
3450
    }
3451

    
3452
    if(last>=0){
3453
        if(s->mb_intra)
3454
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
3455
        else
3456
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
3457
    }
3458

    
3459
    s->dsp.idct_add(bak, stride, temp);
3460

    
3461
    distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3462

    
3463
    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3464
}
3465

    
3466
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3467
    MpegEncContext * const s= (MpegEncContext *)c;
3468
    const uint8_t *scantable= s->intra_scantable.permutated;
3469
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3470
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3471
    int i, last, run, bits, level, start_i;
3472
    const int esc_length= s->ac_esc_length;
3473
    uint8_t * length;
3474
    uint8_t * last_length;
3475

    
3476
    assert(h==8);
3477

    
3478
    s->dsp.diff_pixels(temp, src1, src2, stride);
3479

    
3480
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3481

    
3482
    bits=0;
3483

    
3484
    if (s->mb_intra) {
3485
        start_i = 1;
3486
        length     = s->intra_ac_vlc_length;
3487
        last_length= s->intra_ac_vlc_last_length;
3488
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3489
    } else {
3490
        start_i = 0;
3491
        length     = s->inter_ac_vlc_length;
3492
        last_length= s->inter_ac_vlc_last_length;
3493
    }
3494

    
3495
    if(last>=start_i){
3496
        run=0;
3497
        for(i=start_i; i<last; i++){
3498
            int j= scantable[i];
3499
            level= temp[j];
3500

    
3501
            if(level){
3502
                level+=64;
3503
                if((level&(~127)) == 0){
3504
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
3505
                }else
3506
                    bits+= esc_length;
3507
                run=0;
3508
            }else
3509
                run++;
3510
        }
3511
        i= scantable[last];
3512

    
3513
        level= temp[i] + 64;
3514

    
3515
        assert(level - 64);
3516

    
3517
        if((level&(~127)) == 0){
3518
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3519
        }else
3520
            bits+= esc_length;
3521
    }
3522

    
3523
    return bits;
3524
}
3525

    
3526
/**
 * Vertical-gradient SAD of a 16-wide block against itself: sum of
 * |s[x] - s[x+stride]| over all vertically adjacent pixel pairs.
 * Measures vertical activity of a single block (intra variant).
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score = 0;
    int x, y;

    for (y = 1; y < h; y++) { /* h-1 row pairs */
        for (x = 0; x < 16; x += 4) { /* unrolled by 4 */
            score += ABS(s[x  ] - s[x  +stride]) + ABS(s[x+1] - s[x+1+stride])
                    +ABS(s[x+2] - s[x+2+stride]) + ABS(s[x+3] - s[x+3+stride]);
        }
        s += stride;
    }

    return score;
}
3540

    
3541
/**
 * Vertical-gradient SAD between two 16-wide blocks: sum of absolute
 * differences of the vertical gradients of s1 and s2.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score = 0;
    int x, y;

    for (y = 1; y < h; y++) { /* h-1 row pairs */
        for (x = 0; x < 16; x++) {
            score += ABS(s1[x] - s2[x] - s1[x+stride] + s2[x+stride]);
        }
        s1 += stride;
        s2 += stride;
    }

    return score;
}
3555

    
3556
/* Square of a value; argument is evaluated twice, so pass side-effect-free
   expressions only. */
#define SQ(a) ((a)*(a))

/**
 * Vertical-gradient SSE of a 16-wide block against itself: sum of
 * (s[x] - s[x+stride])^2 over all vertically adjacent pixel pairs.
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score = 0;
    int x, y;

    for (y = 1; y < h; y++) { /* h-1 row pairs */
        for (x = 0; x < 16; x += 4) { /* unrolled by 4 */
            score += SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])
                    +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
        }
        s += stride;
    }

    return score;
}
3571

    
3572
/**
 * Vertical-gradient SSE between two 16-wide blocks: sum of squared
 * differences of the vertical gradients of s1 and s2.
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score = 0;
    int x, y;

    for (y = 1; y < h; y++) { /* h-1 row pairs */
        for (x = 0; x < 16; x++) {
            score += SQ(s1[x] - s2[x] - s1[x+stride] + s2[x+stride]);
        }
        s1 += stride;
        s2 += stride;
    }

    return score;
}
3586

    
3587
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3588
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3589
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3590
WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3591
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3592
WARPER8_16_SQ(rd8x8_c, rd16_c)
3593
WARPER8_16_SQ(bit8x8_c, bit16_c)
3594

    
3595
/* XXX: those functions should be suppressed ASAP when all IDCTs are
3596
 converted */
3597
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3598
{
3599
    j_rev_dct (block);
3600
    put_pixels_clamped_c(block, dest, line_size);
3601
}
3602
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3603
{
3604
    j_rev_dct (block);
3605
    add_pixels_clamped_c(block, dest, line_size);
3606
}
3607

    
3608
/* 4x4 (lowres==1) IDCT-and-store wrapper. */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/* 4x4 (lowres==1) IDCT-and-add wrapper. */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
3618

    
3619
/* 2x2 (lowres==2) IDCT-and-store wrapper. */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/* 2x2 (lowres==2) IDCT-and-add wrapper. */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
3629

    
3630
/* 1x1 (lowres==3) "IDCT": only the DC term survives; scale it down with
   rounding and store the clamped result. */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = cropTbl + MAX_NEG_CROP; /* clamp-to-[0,255] lookup */

    dest[0] = cm[(block[0] + 4)>>3];
}
/* 1x1 (lowres==3) "IDCT": add the scaled DC term onto dest, clamped. */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = cropTbl + MAX_NEG_CROP; /* clamp-to-[0,255] lookup */

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
3642

    
3643
/* init static data */
3644
void dsputil_static_init(void)
3645
{
3646
    int i;
3647

    
3648
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3649
    for(i=0;i<MAX_NEG_CROP;i++) {
3650
        cropTbl[i] = 0;
3651
        cropTbl[i + MAX_NEG_CROP + 256] = 255;
3652
    }
3653

    
3654
    for(i=0;i<512;i++) {
3655
        squareTbl[i] = (i - 256) * (i - 256);
3656
    }
3657

    
3658
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3659
}
3660

    
3661

    
3662
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3663
{
3664
    int i;
3665

    
3666
#ifdef CONFIG_ENCODERS
3667
    if(avctx->dct_algo==FF_DCT_FASTINT) {
3668
        c->fdct = fdct_ifast;
3669
        c->fdct248 = fdct_ifast248;
3670
    }
3671
    else if(avctx->dct_algo==FF_DCT_FAAN) {
3672
        c->fdct = ff_faandct;
3673
        c->fdct248 = ff_faandct248;
3674
    }
3675
    else {
3676
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3677
        c->fdct248 = ff_fdct248_islow;
3678
    }
3679
#endif //CONFIG_ENCODERS
3680

    
3681
    if(avctx->lowres==1){
3682
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3683
            c->idct_put= ff_jref_idct4_put;
3684
            c->idct_add= ff_jref_idct4_add;
3685
        }else{
3686
            c->idct_put= ff_h264_lowres_idct_put_c;
3687
            c->idct_add= ff_h264_lowres_idct_add_c;
3688
        }
3689
        c->idct    = j_rev_dct4;
3690
        c->idct_permutation_type= FF_NO_IDCT_PERM;
3691
    }else if(avctx->lowres==2){
3692
        c->idct_put= ff_jref_idct2_put;
3693
        c->idct_add= ff_jref_idct2_add;
3694
        c->idct    = j_rev_dct2;
3695
        c->idct_permutation_type= FF_NO_IDCT_PERM;
3696
    }else if(avctx->lowres==3){
3697
        c->idct_put= ff_jref_idct1_put;
3698
        c->idct_add= ff_jref_idct1_add;
3699
        c->idct    = j_rev_dct1;
3700
        c->idct_permutation_type= FF_NO_IDCT_PERM;
3701
    }else{
3702
        if(avctx->idct_algo==FF_IDCT_INT){
3703
            c->idct_put= ff_jref_idct_put;
3704
            c->idct_add= ff_jref_idct_add;
3705
            c->idct    = j_rev_dct;
3706
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3707
        }else if(avctx->idct_algo==FF_IDCT_VP3){
3708
            c->idct_put= ff_vp3_idct_put_c;
3709
            c->idct_add= ff_vp3_idct_add_c;
3710
            c->idct    = ff_vp3_idct_c;
3711
            c->idct_permutation_type= FF_NO_IDCT_PERM;
3712
        }else{ //accurate/default
3713
            c->idct_put= simple_idct_put;
3714
            c->idct_add= simple_idct_add;
3715
            c->idct    = simple_idct;
3716
            c->idct_permutation_type= FF_NO_IDCT_PERM;
3717
        }
3718
    }
3719

    
3720
    c->h264_idct_add= ff_h264_idct_add_c;
3721
    c->h264_idct8_add= ff_h264_idct8_add_c;
3722

    
3723
    c->get_pixels = get_pixels_c;
3724
    c->diff_pixels = diff_pixels_c;
3725
    c->put_pixels_clamped = put_pixels_clamped_c;
3726
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3727
    c->add_pixels_clamped = add_pixels_clamped_c;
3728
    c->add_pixels8 = add_pixels8_c;
3729
    c->add_pixels4 = add_pixels4_c;
3730
    c->gmc1 = gmc1_c;
3731
    c->gmc = gmc_c;
3732
    c->clear_blocks = clear_blocks_c;
3733
    c->pix_sum = pix_sum_c;
3734
    c->pix_norm1 = pix_norm1_c;
3735

    
3736
    /* TODO [0] 16  [1] 8 */
3737
    c->pix_abs[0][0] = pix_abs16_c;
3738
    c->pix_abs[0][1] = pix_abs16_x2_c;
3739
    c->pix_abs[0][2] = pix_abs16_y2_c;
3740
    c->pix_abs[0][3] = pix_abs16_xy2_c;
3741
    c->pix_abs[1][0] = pix_abs8_c;
3742
    c->pix_abs[1][1] = pix_abs8_x2_c;
3743
    c->pix_abs[1][2] = pix_abs8_y2_c;
3744
    c->pix_abs[1][3] = pix_abs8_xy2_c;
3745

    
3746
#define dspfunc(PFX, IDX, NUM) \
3747
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
3748
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
3749
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
3750
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3751

    
3752
    dspfunc(put, 0, 16);
3753
    dspfunc(put_no_rnd, 0, 16);
3754
    dspfunc(put, 1, 8);
3755
    dspfunc(put_no_rnd, 1, 8);
3756
    dspfunc(put, 2, 4);
3757
    dspfunc(put, 3, 2);
3758

    
3759
    dspfunc(avg, 0, 16);
3760
    dspfunc(avg_no_rnd, 0, 16);
3761
    dspfunc(avg, 1, 8);
3762
    dspfunc(avg_no_rnd, 1, 8);
3763
    dspfunc(avg, 2, 4);
3764
    dspfunc(avg, 3, 2);
3765
#undef dspfunc
3766

    
3767
    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3768
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3769

    
3770
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3771
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3772
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3773
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3774
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3775
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3776
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3777
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3778
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3779

    
3780
    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3781
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3782
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3783
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3784
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3785
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3786
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3787
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3788
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3789

    
3790
#define dspfunc(PFX, IDX, NUM) \
3791
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3792
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3793
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3794
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3795
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3796
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3797
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3798
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3799
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3800
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3801
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3802
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3803
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3804
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3805
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3806
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3807

    
3808
    dspfunc(put_qpel, 0, 16);
3809
    dspfunc(put_no_rnd_qpel, 0, 16);
3810

    
3811
    dspfunc(avg_qpel, 0, 16);
3812
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3813

    
3814
    dspfunc(put_qpel, 1, 8);
3815
    dspfunc(put_no_rnd_qpel, 1, 8);
3816

    
3817
    dspfunc(avg_qpel, 1, 8);
3818
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3819

    
3820
    dspfunc(put_h264_qpel, 0, 16);
3821
    dspfunc(put_h264_qpel, 1, 8);
3822
    dspfunc(put_h264_qpel, 2, 4);
3823
    dspfunc(avg_h264_qpel, 0, 16);
3824
    dspfunc(avg_h264_qpel, 1, 8);
3825
    dspfunc(avg_h264_qpel, 2, 4);
3826

    
3827
#undef dspfunc
3828
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3829
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3830
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3831
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3832
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3833
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3834

    
3835
    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
3836
    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
3837
    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
3838
    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
3839
    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
3840
    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
3841
    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
3842
    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
3843
    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
3844
    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
3845
    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
3846
    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
3847
    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
3848
    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
3849
    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
3850
    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
3851
    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
3852
    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
3853
    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
3854
    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
3855

    
3856
    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3857
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3858
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3859
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3860
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3861
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3862
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3863
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3864

    
3865
#define SET_CMP_FUNC(name) \
3866
    c->name[0]= name ## 16_c;\
3867
    c->name[1]= name ## 8x8_c;
3868

    
3869
    SET_CMP_FUNC(hadamard8_diff)
3870
    c->hadamard8_diff[4]= hadamard8_intra16_c;
3871
    SET_CMP_FUNC(dct_sad)
3872
    SET_CMP_FUNC(dct_max)
3873
    c->sad[0]= pix_abs16_c;
3874
    c->sad[1]= pix_abs8_c;
3875
    c->sse[0]= sse16_c;
3876
    c->sse[1]= sse8_c;
3877
    c->sse[2]= sse4_c;
3878
    SET_CMP_FUNC(quant_psnr)
3879
    SET_CMP_FUNC(rd)
3880
    SET_CMP_FUNC(bit)
3881
    c->vsad[0]= vsad16_c;
3882
    c->vsad[4]= vsad_intra16_c;
3883
    c->vsse[0]= vsse16_c;
3884
    c->vsse[4]= vsse_intra16_c;
3885
    c->nsse[0]= nsse16_c;
3886
    c->nsse[1]= nsse8_c;
3887
    c->w53[0]= w53_16_c;
3888
    c->w53[1]= w53_8_c;
3889
    c->w97[0]= w97_16_c;
3890
    c->w97[1]= w97_8_c;
3891

    
3892
    c->add_bytes= add_bytes_c;
3893
    c->diff_bytes= diff_bytes_c;
3894
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3895
    c->bswap_buf= bswap_buf;
3896

    
3897
    c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
3898
    c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
3899
    c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
3900
    c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
3901
    c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
3902
    c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
3903

    
3904
    c->h263_h_loop_filter= h263_h_loop_filter_c;
3905
    c->h263_v_loop_filter= h263_v_loop_filter_c;
3906

    
3907
    c->h261_loop_filter= h261_loop_filter_c;
3908

    
3909
    c->try_8x8basis= try_8x8basis_c;
3910
    c->add_8x8basis= add_8x8basis_c;
3911

    
3912
#ifdef HAVE_MMX
3913
    dsputil_init_mmx(c, avctx);
3914
#endif
3915
#ifdef ARCH_ARMV4L
3916
    dsputil_init_armv4l(c, avctx);
3917
#endif
3918
#ifdef HAVE_MLIB
3919
    dsputil_init_mlib(c, avctx);
3920
#endif
3921
#ifdef ARCH_SPARC
3922
   dsputil_init_vis(c,avctx);
3923
#endif
3924
#ifdef ARCH_ALPHA
3925
    dsputil_init_alpha(c, avctx);
3926
#endif
3927
#ifdef ARCH_POWERPC
3928
    dsputil_init_ppc(c, avctx);
3929
#endif
3930
#ifdef HAVE_MMI
3931
    dsputil_init_mmi(c, avctx);
3932
#endif
3933
#ifdef ARCH_SH4
3934
    dsputil_init_sh4(c,avctx);
3935
#endif
3936

    
3937
    switch(c->idct_permutation_type){
3938
    case FF_NO_IDCT_PERM:
3939
        for(i=0; i<64; i++)
3940
            c->idct_permutation[i]= i;
3941
        break;
3942
    case FF_LIBMPEG2_IDCT_PERM:
3943
        for(i=0; i<64; i++)
3944
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3945
        break;
3946
    case FF_SIMPLE_IDCT_PERM:
3947
        for(i=0; i<64; i++)
3948
            c->idct_permutation[i]= simple_mmx_permutation[i];
3949
        break;
3950
    case FF_TRANSPOSE_IDCT_PERM:
3951
        for(i=0; i<64; i++)
3952
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3953
        break;
3954
    case FF_PARTTRANS_IDCT_PERM:
3955
        for(i=0; i<64; i++)
3956
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
3957
        break;
3958
    default:
3959
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
3960
    }
3961
}
3962