Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 5cf08f23

History | View | Annotate | Download (139 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * This library is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2 of the License, or (at your option) any later version.
10
 *
11
 * This library is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with this library; if not, write to the Free Software
18
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19
 *
20
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
21
 */
22
 
23
/**
24
 * @file dsputil.c
25
 * DSP utils
26
 */
27
 
28
#include "avcodec.h"
29
#include "dsputil.h"
30
#include "mpegvideo.h"
31
#include "simple_idct.h"
32
#include "faandct.h"
33

    
34
/* snow.c */
35
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
36

    
37
/* Runtime-filled lookup tables (zeroed here). cropTbl is indexed with a
 * MAX_NEG_CROP bias and squareTbl with a +256 bias by their users below;
 * presumably populated during DSP init — confirm at the init site. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t squareTbl[512] = {0, };
39

    
40
/* Classic 8x8 zigzag scan: maps scan position -> raster-order index. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
50

    
51
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
63

    
64
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
65
uint16_t __align8 inv_zigzag_direct16[64] = {0, };
66

    
67
/* Alternate (horizontal-priority) 8x8 scan order. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
77

    
78
/* Alternate (vertical-priority) 8x8 scan order. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
88

    
89
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
124

    
125
/* Input permutation for the simple_idct_mmx (permutation of 0..63). */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
136

    
137
/**
 * Sum all pixel values of a 16x16 block.
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size byte stride between successive rows
 * @return sum of the 256 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
158

    
159
static int pix_norm1_c(uint8_t * pix, int line_size)
160
{
161
    int s, i, j;
162
    uint32_t *sq = squareTbl + 256;
163

    
164
    s = 0;
165
    for (i = 0; i < 16; i++) {
166
        for (j = 0; j < 16; j += 8) {
167
#if 0
168
            s += sq[pix[0]];
169
            s += sq[pix[1]];
170
            s += sq[pix[2]];
171
            s += sq[pix[3]];
172
            s += sq[pix[4]];
173
            s += sq[pix[5]];
174
            s += sq[pix[6]];
175
            s += sq[pix[7]];
176
#else
177
#if LONG_MAX > 2147483647
178
            register uint64_t x=*(uint64_t*)pix;
179
            s += sq[x&0xff];
180
            s += sq[(x>>8)&0xff];
181
            s += sq[(x>>16)&0xff];
182
            s += sq[(x>>24)&0xff];
183
            s += sq[(x>>32)&0xff];
184
            s += sq[(x>>40)&0xff];
185
            s += sq[(x>>48)&0xff];
186
            s += sq[(x>>56)&0xff];
187
#else
188
            register uint32_t x=*(uint32_t*)pix;
189
            s += sq[x&0xff];
190
            s += sq[(x>>8)&0xff];
191
            s += sq[(x>>16)&0xff];
192
            s += sq[(x>>24)&0xff];
193
            x=*(uint32_t*)(pix+4);
194
            s += sq[x&0xff];
195
            s += sq[(x>>8)&0xff];
196
            s += sq[(x>>16)&0xff];
197
            s += sq[(x>>24)&0xff];
198
#endif
199
#endif
200
            pix += 8;
201
        }
202
        pix += line_size - 16;
203
    }
204
    return s;
205
}
206

    
207
/* Byte-swap w 32-bit words from src into dst (dst may equal src). */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i = 0;

    /* bulk: eight words per iteration */
    while (i + 8 <= w) {
        dst[i    ] = bswap_32(src[i    ]);
        dst[i + 1] = bswap_32(src[i + 1]);
        dst[i + 2] = bswap_32(src[i + 2]);
        dst[i + 3] = bswap_32(src[i + 3]);
        dst[i + 4] = bswap_32(src[i + 4]);
        dst[i + 5] = bswap_32(src[i + 5]);
        dst[i + 6] = bswap_32(src[i + 6]);
        dst[i + 7] = bswap_32(src[i + 7]);
        i += 8;
    }
    /* tail: remaining words one at a time */
    for (; i < w; i++)
        dst[i] = bswap_32(src[i]);
}
224

    
225
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
226
{
227
    int s, i;
228
    uint32_t *sq = squareTbl + 256;
229

    
230
    s = 0;
231
    for (i = 0; i < h; i++) {
232
        s += sq[pix1[0] - pix2[0]];
233
        s += sq[pix1[1] - pix2[1]];
234
        s += sq[pix1[2] - pix2[2]];
235
        s += sq[pix1[3] - pix2[3]];
236
        pix1 += line_size;
237
        pix2 += line_size;
238
    }
239
    return s;
240
}
241

    
242
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
243
{
244
    int s, i;
245
    uint32_t *sq = squareTbl + 256;
246

    
247
    s = 0;
248
    for (i = 0; i < h; i++) {
249
        s += sq[pix1[0] - pix2[0]];
250
        s += sq[pix1[1] - pix2[1]];
251
        s += sq[pix1[2] - pix2[2]];
252
        s += sq[pix1[3] - pix2[3]];
253
        s += sq[pix1[4] - pix2[4]];
254
        s += sq[pix1[5] - pix2[5]];
255
        s += sq[pix1[6] - pix2[6]];
256
        s += sq[pix1[7] - pix2[7]];
257
        pix1 += line_size;
258
        pix2 += line_size;
259
    }
260
    return s;
261
}
262

    
263
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
264
{
265
    int s, i;
266
    uint32_t *sq = squareTbl + 256;
267

    
268
    s = 0;
269
    for (i = 0; i < h; i++) {
270
        s += sq[pix1[ 0] - pix2[ 0]];
271
        s += sq[pix1[ 1] - pix2[ 1]];
272
        s += sq[pix1[ 2] - pix2[ 2]];
273
        s += sq[pix1[ 3] - pix2[ 3]];
274
        s += sq[pix1[ 4] - pix2[ 4]];
275
        s += sq[pix1[ 5] - pix2[ 5]];
276
        s += sq[pix1[ 6] - pix2[ 6]];
277
        s += sq[pix1[ 7] - pix2[ 7]];
278
        s += sq[pix1[ 8] - pix2[ 8]];
279
        s += sq[pix1[ 9] - pix2[ 9]];
280
        s += sq[pix1[10] - pix2[10]];
281
        s += sq[pix1[11] - pix2[11]];
282
        s += sq[pix1[12] - pix2[12]];
283
        s += sq[pix1[13] - pix2[13]];
284
        s += sq[pix1[14] - pix2[14]];
285
        s += sq[pix1[15] - pix2[15]];
286

    
287
        pix1 += line_size;
288
        pix2 += line_size;
289
    }
290
    return s;
291
}
292

    
293

    
294
/**
 * Wavelet-domain distortion metric (snow encoder).
 * Takes the difference of two blocks, scales it up by 16, runs the
 * spatial DWT from snow.c over it (type selects 5/3 vs 9/7 per the
 * callers below), and returns the sum of absolute transform
 * coefficients scaled down by 4.
 * Only compiled when CONFIG_SNOW_ENCODER is defined (the idwt lives
 * in snow.c); without it the function body is empty.
 * NOTE(review): a disabled (#if 0) per-subband weighting table that
 * was never compiled has been removed.
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
#ifdef CONFIG_SNOW_ENCODER //idwt is in snow.c
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[16*16];

    /* difference image, <<4 to keep precision through the transform */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j++)
            tmp[16*i + j] = (pix1[j] - pix2[j]) << 4;
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 16, type, dec_count);

    /* unweighted sum of absolute coefficients */
    s = 0;
    for (i = 0; i < h; i++)
        for (j = 0; j < w; j++)
            s += ABS(tmp[16*i + j]);

    assert(s >= 0);

    return s >> 2;
#endif
}
376

    
377
/* 5/3 wavelet score of an 8-wide block (type=1). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}
380

    
381
/* 9/7 wavelet score of an 8-wide block (type=0). */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}
384

    
385
/* 5/3 wavelet score of a 16-wide block (type=1). */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
388

    
389
/* 9/7 wavelet score of a 16-wide block (type=0). */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
392

    
393
/* Copy an 8x8 block of pixels into a row-major DCT coefficient array. */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block += 8;
    }
}
411

    
412
/* Store the 8x8 difference s1 - s2 into a DCT coefficient array. */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
431

    
432

    
433
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
434
                                 int line_size)
435
{
436
    int i;
437
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
438
    
439
    /* read the pixels */
440
    for(i=0;i<8;i++) {
441
        pixels[0] = cm[block[0]];
442
        pixels[1] = cm[block[1]];
443
        pixels[2] = cm[block[2]];
444
        pixels[3] = cm[block[3]];
445
        pixels[4] = cm[block[4]];
446
        pixels[5] = cm[block[5]];
447
        pixels[6] = cm[block[6]];
448
        pixels[7] = cm[block[7]];
449

    
450
        pixels += line_size;
451
        block += 8;
452
    }
453
}
454

    
455
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
456
                                 int line_size)
457
{
458
    int i;
459
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
460
    
461
    /* read the pixels */
462
    for(i=0;i<4;i++) {
463
        pixels[0] = cm[block[0]];
464
        pixels[1] = cm[block[1]];
465
        pixels[2] = cm[block[2]];
466
        pixels[3] = cm[block[3]];
467

    
468
        pixels += line_size;
469
        block += 8;
470
    }
471
}
472

    
473
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
474
                                 int line_size)
475
{
476
    int i;
477
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
478
    
479
    /* read the pixels */
480
    for(i=0;i<2;i++) {
481
        pixels[0] = cm[block[0]];
482
        pixels[1] = cm[block[1]];
483

    
484
        pixels += line_size;
485
        block += 8;
486
    }
487
}
488

    
489
/* Store an 8x8 signed coefficient block as pixels: add a +128 bias and
 * clamp the result to 0..255 (coefficients < -128 -> 0, > 127 -> 255). */
static void put_signed_pixels_clamped_c(const DCTELEM *block, 
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            int v = block[8 * row + col] + 128;

            if (v < 0)
                v = 0;
            else if (v > 255)
                v = 255;
            pixels[col] = (uint8_t)v;
        }
        pixels += line_size;
    }
}
509

    
510
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
511
                          int line_size)
512
{
513
    int i;
514
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
515
    
516
    /* read the pixels */
517
    for(i=0;i<8;i++) {
518
        pixels[0] = cm[pixels[0] + block[0]];
519
        pixels[1] = cm[pixels[1] + block[1]];
520
        pixels[2] = cm[pixels[2] + block[2]];
521
        pixels[3] = cm[pixels[3] + block[3]];
522
        pixels[4] = cm[pixels[4] + block[4]];
523
        pixels[5] = cm[pixels[5] + block[5]];
524
        pixels[6] = cm[pixels[6] + block[6]];
525
        pixels[7] = cm[pixels[7] + block[7]];
526
        pixels += line_size;
527
        block += 8;
528
    }
529
}
530

    
531
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
532
                          int line_size)
533
{
534
    int i;
535
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
536
    
537
    /* read the pixels */
538
    for(i=0;i<4;i++) {
539
        pixels[0] = cm[pixels[0] + block[0]];
540
        pixels[1] = cm[pixels[1] + block[1]];
541
        pixels[2] = cm[pixels[2] + block[2]];
542
        pixels[3] = cm[pixels[3] + block[3]];
543
        pixels += line_size;
544
        block += 8;
545
    }
546
}
547

    
548
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
549
                          int line_size)
550
{
551
    int i;
552
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
553
    
554
    /* read the pixels */
555
    for(i=0;i<2;i++) {
556
        pixels[0] = cm[pixels[0] + block[0]];
557
        pixels[1] = cm[pixels[1] + block[1]];
558
        pixels += line_size;
559
        block += 8;
560
    }
561
}
562
#if 0
563

564
#define PIXOP2(OPNAME, OP) \
565
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
566
{\
567
    int i;\
568
    for(i=0; i<h; i++){\
569
        OP(*((uint64_t*)block), LD64(pixels));\
570
        pixels+=line_size;\
571
        block +=line_size;\
572
    }\
573
}\
574
\
575
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
576
{\
577
    int i;\
578
    for(i=0; i<h; i++){\
579
        const uint64_t a= LD64(pixels  );\
580
        const uint64_t b= LD64(pixels+1);\
581
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
582
        pixels+=line_size;\
583
        block +=line_size;\
584
    }\
585
}\
586
\
587
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
588
{\
589
    int i;\
590
    for(i=0; i<h; i++){\
591
        const uint64_t a= LD64(pixels  );\
592
        const uint64_t b= LD64(pixels+1);\
593
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
594
        pixels+=line_size;\
595
        block +=line_size;\
596
    }\
597
}\
598
\
599
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
600
{\
601
    int i;\
602
    for(i=0; i<h; i++){\
603
        const uint64_t a= LD64(pixels          );\
604
        const uint64_t b= LD64(pixels+line_size);\
605
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
606
        pixels+=line_size;\
607
        block +=line_size;\
608
    }\
609
}\
610
\
611
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
612
{\
613
    int i;\
614
    for(i=0; i<h; i++){\
615
        const uint64_t a= LD64(pixels          );\
616
        const uint64_t b= LD64(pixels+line_size);\
617
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
618
        pixels+=line_size;\
619
        block +=line_size;\
620
    }\
621
}\
622
\
623
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
624
{\
625
        int i;\
626
        const uint64_t a= LD64(pixels  );\
627
        const uint64_t b= LD64(pixels+1);\
628
        uint64_t l0=  (a&0x0303030303030303ULL)\
629
                    + (b&0x0303030303030303ULL)\
630
                    + 0x0202020202020202ULL;\
631
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
632
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
633
        uint64_t l1,h1;\
634
\
635
        pixels+=line_size;\
636
        for(i=0; i<h; i+=2){\
637
            uint64_t a= LD64(pixels  );\
638
            uint64_t b= LD64(pixels+1);\
639
            l1=  (a&0x0303030303030303ULL)\
640
               + (b&0x0303030303030303ULL);\
641
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
642
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
643
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
644
            pixels+=line_size;\
645
            block +=line_size;\
646
            a= LD64(pixels  );\
647
            b= LD64(pixels+1);\
648
            l0=  (a&0x0303030303030303ULL)\
649
               + (b&0x0303030303030303ULL)\
650
               + 0x0202020202020202ULL;\
651
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
652
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
653
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
654
            pixels+=line_size;\
655
            block +=line_size;\
656
        }\
657
}\
658
\
659
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
660
{\
661
        int i;\
662
        const uint64_t a= LD64(pixels  );\
663
        const uint64_t b= LD64(pixels+1);\
664
        uint64_t l0=  (a&0x0303030303030303ULL)\
665
                    + (b&0x0303030303030303ULL)\
666
                    + 0x0101010101010101ULL;\
667
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
668
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
669
        uint64_t l1,h1;\
670
\
671
        pixels+=line_size;\
672
        for(i=0; i<h; i+=2){\
673
            uint64_t a= LD64(pixels  );\
674
            uint64_t b= LD64(pixels+1);\
675
            l1=  (a&0x0303030303030303ULL)\
676
               + (b&0x0303030303030303ULL);\
677
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
678
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
679
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
680
            pixels+=line_size;\
681
            block +=line_size;\
682
            a= LD64(pixels  );\
683
            b= LD64(pixels+1);\
684
            l0=  (a&0x0303030303030303ULL)\
685
               + (b&0x0303030303030303ULL)\
686
               + 0x0101010101010101ULL;\
687
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
688
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
689
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
690
            pixels+=line_size;\
691
            block +=line_size;\
692
        }\
693
}\
694
\
695
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
696
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
697
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
698
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
699
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
700
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
701
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
702

703
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
704
#else // 64 bit variant
705

    
706
#define PIXOP2(OPNAME, OP) \
707
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
708
    int i;\
709
    for(i=0; i<h; i++){\
710
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
711
        pixels+=line_size;\
712
        block +=line_size;\
713
    }\
714
}\
715
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
716
    int i;\
717
    for(i=0; i<h; i++){\
718
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
719
        pixels+=line_size;\
720
        block +=line_size;\
721
    }\
722
}\
723
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
724
    int i;\
725
    for(i=0; i<h; i++){\
726
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
727
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
728
        pixels+=line_size;\
729
        block +=line_size;\
730
    }\
731
}\
732
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
733
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
734
}\
735
\
736
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
737
                                                int src_stride1, int src_stride2, int h){\
738
    int i;\
739
    for(i=0; i<h; i++){\
740
        uint32_t a,b;\
741
        a= LD32(&src1[i*src_stride1  ]);\
742
        b= LD32(&src2[i*src_stride2  ]);\
743
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
744
        a= LD32(&src1[i*src_stride1+4]);\
745
        b= LD32(&src2[i*src_stride2+4]);\
746
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
747
    }\
748
}\
749
\
750
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
751
                                                int src_stride1, int src_stride2, int h){\
752
    int i;\
753
    for(i=0; i<h; i++){\
754
        uint32_t a,b;\
755
        a= LD32(&src1[i*src_stride1  ]);\
756
        b= LD32(&src2[i*src_stride2  ]);\
757
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
758
        a= LD32(&src1[i*src_stride1+4]);\
759
        b= LD32(&src2[i*src_stride2+4]);\
760
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
761
    }\
762
}\
763
\
764
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
765
                                                int src_stride1, int src_stride2, int h){\
766
    int i;\
767
    for(i=0; i<h; i++){\
768
        uint32_t a,b;\
769
        a= LD32(&src1[i*src_stride1  ]);\
770
        b= LD32(&src2[i*src_stride2  ]);\
771
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
772
    }\
773
}\
774
\
775
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
776
                                                int src_stride1, int src_stride2, int h){\
777
    int i;\
778
    for(i=0; i<h; i++){\
779
        uint32_t a,b;\
780
        a= LD16(&src1[i*src_stride1  ]);\
781
        b= LD16(&src2[i*src_stride2  ]);\
782
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
783
    }\
784
}\
785
\
786
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
787
                                                int src_stride1, int src_stride2, int h){\
788
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
789
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
790
}\
791
\
792
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
793
                                                int src_stride1, int src_stride2, int h){\
794
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
795
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
796
}\
797
\
798
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
799
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
800
}\
801
\
802
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
803
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
804
}\
805
\
806
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
807
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
808
}\
809
\
810
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
811
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
812
}\
813
\
814
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
815
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
816
    int i;\
817
    for(i=0; i<h; i++){\
818
        uint32_t a, b, c, d, l0, l1, h0, h1;\
819
        a= LD32(&src1[i*src_stride1]);\
820
        b= LD32(&src2[i*src_stride2]);\
821
        c= LD32(&src3[i*src_stride3]);\
822
        d= LD32(&src4[i*src_stride4]);\
823
        l0=  (a&0x03030303UL)\
824
           + (b&0x03030303UL)\
825
           + 0x02020202UL;\
826
        h0= ((a&0xFCFCFCFCUL)>>2)\
827
          + ((b&0xFCFCFCFCUL)>>2);\
828
        l1=  (c&0x03030303UL)\
829
           + (d&0x03030303UL);\
830
        h1= ((c&0xFCFCFCFCUL)>>2)\
831
          + ((d&0xFCFCFCFCUL)>>2);\
832
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
833
        a= LD32(&src1[i*src_stride1+4]);\
834
        b= LD32(&src2[i*src_stride2+4]);\
835
        c= LD32(&src3[i*src_stride3+4]);\
836
        d= LD32(&src4[i*src_stride4+4]);\
837
        l0=  (a&0x03030303UL)\
838
           + (b&0x03030303UL)\
839
           + 0x02020202UL;\
840
        h0= ((a&0xFCFCFCFCUL)>>2)\
841
          + ((b&0xFCFCFCFCUL)>>2);\
842
        l1=  (c&0x03030303UL)\
843
           + (d&0x03030303UL);\
844
        h1= ((c&0xFCFCFCFCUL)>>2)\
845
          + ((d&0xFCFCFCFCUL)>>2);\
846
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
847
    }\
848
}\
849
\
850
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
851
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
852
}\
853
\
854
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
855
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
856
}\
857
\
858
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
859
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
860
}\
861
\
862
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
863
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
864
}\
865
\
866
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
867
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
868
    int i;\
869
    for(i=0; i<h; i++){\
870
        uint32_t a, b, c, d, l0, l1, h0, h1;\
871
        a= LD32(&src1[i*src_stride1]);\
872
        b= LD32(&src2[i*src_stride2]);\
873
        c= LD32(&src3[i*src_stride3]);\
874
        d= LD32(&src4[i*src_stride4]);\
875
        l0=  (a&0x03030303UL)\
876
           + (b&0x03030303UL)\
877
           + 0x01010101UL;\
878
        h0= ((a&0xFCFCFCFCUL)>>2)\
879
          + ((b&0xFCFCFCFCUL)>>2);\
880
        l1=  (c&0x03030303UL)\
881
           + (d&0x03030303UL);\
882
        h1= ((c&0xFCFCFCFCUL)>>2)\
883
          + ((d&0xFCFCFCFCUL)>>2);\
884
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
885
        a= LD32(&src1[i*src_stride1+4]);\
886
        b= LD32(&src2[i*src_stride2+4]);\
887
        c= LD32(&src3[i*src_stride3+4]);\
888
        d= LD32(&src4[i*src_stride4+4]);\
889
        l0=  (a&0x03030303UL)\
890
           + (b&0x03030303UL)\
891
           + 0x01010101UL;\
892
        h0= ((a&0xFCFCFCFCUL)>>2)\
893
          + ((b&0xFCFCFCFCUL)>>2);\
894
        l1=  (c&0x03030303UL)\
895
           + (d&0x03030303UL);\
896
        h1= ((c&0xFCFCFCFCUL)>>2)\
897
          + ((d&0xFCFCFCFCUL)>>2);\
898
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
899
    }\
900
}\
901
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
902
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
903
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
904
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
905
}\
906
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
907
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
908
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
909
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
910
}\
911
\
912
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
913
{\
914
        int i, a0, b0, a1, b1;\
915
        a0= pixels[0];\
916
        b0= pixels[1] + 2;\
917
        a0 += b0;\
918
        b0 += pixels[2];\
919
\
920
        pixels+=line_size;\
921
        for(i=0; i<h; i+=2){\
922
            a1= pixels[0];\
923
            b1= pixels[1];\
924
            a1 += b1;\
925
            b1 += pixels[2];\
926
\
927
            block[0]= (a1+a0)>>2; /* FIXME non put */\
928
            block[1]= (b1+b0)>>2;\
929
\
930
            pixels+=line_size;\
931
            block +=line_size;\
932
\
933
            a0= pixels[0];\
934
            b0= pixels[1] + 2;\
935
            a0 += b0;\
936
            b0 += pixels[2];\
937
\
938
            block[0]= (a1+a0)>>2;\
939
            block[1]= (b1+b0)>>2;\
940
            pixels+=line_size;\
941
            block +=line_size;\
942
        }\
943
}\
944
\
945
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
946
{\
947
        int i;\
948
        const uint32_t a= LD32(pixels  );\
949
        const uint32_t b= LD32(pixels+1);\
950
        uint32_t l0=  (a&0x03030303UL)\
951
                    + (b&0x03030303UL)\
952
                    + 0x02020202UL;\
953
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
954
                   + ((b&0xFCFCFCFCUL)>>2);\
955
        uint32_t l1,h1;\
956
\
957
        pixels+=line_size;\
958
        for(i=0; i<h; i+=2){\
959
            uint32_t a= LD32(pixels  );\
960
            uint32_t b= LD32(pixels+1);\
961
            l1=  (a&0x03030303UL)\
962
               + (b&0x03030303UL);\
963
            h1= ((a&0xFCFCFCFCUL)>>2)\
964
              + ((b&0xFCFCFCFCUL)>>2);\
965
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
966
            pixels+=line_size;\
967
            block +=line_size;\
968
            a= LD32(pixels  );\
969
            b= LD32(pixels+1);\
970
            l0=  (a&0x03030303UL)\
971
               + (b&0x03030303UL)\
972
               + 0x02020202UL;\
973
            h0= ((a&0xFCFCFCFCUL)>>2)\
974
              + ((b&0xFCFCFCFCUL)>>2);\
975
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
976
            pixels+=line_size;\
977
            block +=line_size;\
978
        }\
979
}\
980
\
981
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
982
{\
983
    int j;\
984
    for(j=0; j<2; j++){\
985
        int i;\
986
        const uint32_t a= LD32(pixels  );\
987
        const uint32_t b= LD32(pixels+1);\
988
        uint32_t l0=  (a&0x03030303UL)\
989
                    + (b&0x03030303UL)\
990
                    + 0x02020202UL;\
991
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
992
                   + ((b&0xFCFCFCFCUL)>>2);\
993
        uint32_t l1,h1;\
994
\
995
        pixels+=line_size;\
996
        for(i=0; i<h; i+=2){\
997
            uint32_t a= LD32(pixels  );\
998
            uint32_t b= LD32(pixels+1);\
999
            l1=  (a&0x03030303UL)\
1000
               + (b&0x03030303UL);\
1001
            h1= ((a&0xFCFCFCFCUL)>>2)\
1002
              + ((b&0xFCFCFCFCUL)>>2);\
1003
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1004
            pixels+=line_size;\
1005
            block +=line_size;\
1006
            a= LD32(pixels  );\
1007
            b= LD32(pixels+1);\
1008
            l0=  (a&0x03030303UL)\
1009
               + (b&0x03030303UL)\
1010
               + 0x02020202UL;\
1011
            h0= ((a&0xFCFCFCFCUL)>>2)\
1012
              + ((b&0xFCFCFCFCUL)>>2);\
1013
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1014
            pixels+=line_size;\
1015
            block +=line_size;\
1016
        }\
1017
        pixels+=4-line_size*(h+1);\
1018
        block +=4-line_size*h;\
1019
    }\
1020
}\
1021
\
1022
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1023
{\
1024
    int j;\
1025
    for(j=0; j<2; j++){\
1026
        int i;\
1027
        const uint32_t a= LD32(pixels  );\
1028
        const uint32_t b= LD32(pixels+1);\
1029
        uint32_t l0=  (a&0x03030303UL)\
1030
                    + (b&0x03030303UL)\
1031
                    + 0x01010101UL;\
1032
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1033
                   + ((b&0xFCFCFCFCUL)>>2);\
1034
        uint32_t l1,h1;\
1035
\
1036
        pixels+=line_size;\
1037
        for(i=0; i<h; i+=2){\
1038
            uint32_t a= LD32(pixels  );\
1039
            uint32_t b= LD32(pixels+1);\
1040
            l1=  (a&0x03030303UL)\
1041
               + (b&0x03030303UL);\
1042
            h1= ((a&0xFCFCFCFCUL)>>2)\
1043
              + ((b&0xFCFCFCFCUL)>>2);\
1044
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1045
            pixels+=line_size;\
1046
            block +=line_size;\
1047
            a= LD32(pixels  );\
1048
            b= LD32(pixels+1);\
1049
            l0=  (a&0x03030303UL)\
1050
               + (b&0x03030303UL)\
1051
               + 0x01010101UL;\
1052
            h0= ((a&0xFCFCFCFCUL)>>2)\
1053
              + ((b&0xFCFCFCFCUL)>>2);\
1054
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1055
            pixels+=line_size;\
1056
            block +=line_size;\
1057
        }\
1058
        pixels+=4-line_size*(h+1);\
1059
        block +=4-line_size*h;\
1060
    }\
1061
}\
1062
\
1063
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1064
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1065
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1066
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1067
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1068
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1069
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1070
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1071

    
1072
#define op_avg(a, b) a = rnd_avg32(a, b)
1073
#endif
1074
#define op_put(a, b) a = b
1075

    
1076
PIXOP2(avg, op_avg)
1077
PIXOP2(put, op_put)
1078
#undef op_avg
1079
#undef op_put
1080

    
1081
#define avg2(a,b) ((a+b+1)>>1)
1082
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1083

    
1084
/** No-rounding average of two 16-wide sources; all three strides equal. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1087

    
1088
/** No-rounding average of two 8-wide sources; all three strides equal. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1091

    
1092
/**
 * 1/16-pel bilinear interpolation over an 8-pixel-wide block ("GMC1").
 *
 * dst[i] = (A*p00 + B*p01 + C*p10 + D*p11 + rounder) >> 8, where A..D are
 * the bilinear weights derived from the fractional offsets (x16, y16);
 * A+B+C+D == 256, hence the final >>8.
 *
 * @param dst     destination, h rows of 8 pixels, row pitch `stride`
 * @param src     source; rows i read src[i*stride .. i*stride+8] plus the row below
 * @param h       number of rows
 * @param x16,y16 fractional position in 1/16-pel units
 * @param rounder rounding constant added before the shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        /* unrolled over the 8 output pixels of the row */
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
1114

    
1115
/**
 * Global motion compensation of an 8-pixel-wide block with a per-pixel
 * affine displacement: the sampling position advances by (dxx,dyx) per
 * output column and (dxy,dyy) per output row, in 1/(1<<16)-pel units;
 * `shift` gives the sub-pel precision of the bilinear interpolation and
 * `r` its rounding constant. Positions outside width x height are clamped
 * to the edge (the clip() helper), degrading to 1-D or nearest sampling.
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, 
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;
    
    /* convert to largest valid coordinate for the (unsigned) range tests below */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;
  
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2-D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* vertically clamped: horizontal-only interpolation */
                    index= src_x + clip(src_y, 0, height)*stride;                    
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x) 
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally clamped: vertical-only interpolation */
                    index= clip(src_x, 0, width) + src_y*stride;                    
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y) 
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* clamped on both axes: nearest edge pixel */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;                    
                    dst[y*stride + x]=    src[index         ];
                }
            }
            
            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1172

    
1173
/** Thirdpel MC, zero offset: plain copy, dispatched on block width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
1181

    
1182
/** Thirdpel MC, horizontal 1/3 offset: dst = (683*(2*a + b + 1)) >> 11,
 *  a fixed-point rounding of (2*a + b)/3 over horizontal neighbours. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
1192

    
1193
/** Thirdpel MC, horizontal 2/3 offset: dst = (683*(a + 2*b + 1)) >> 11. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
1203
    
1204
/** Thirdpel MC, vertical 1/3 offset: dst = (683*(2*a + below + 1)) >> 11. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
1214
    
1215
/** Thirdpel MC, (1/3,1/3) offset: bilinear blend of the 2x2 neighbourhood
 *  with weights 4/3/3/2, fixed-point via (2731*(... + 6)) >> 15. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
1225

    
1226
/** Thirdpel MC, (1/3,2/3) offset: 2x2 bilinear blend with weights 3/2/4/3. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
1236

    
1237
/** Thirdpel MC, vertical 2/3 offset: dst = (683*(a + 2*below + 1)) >> 11. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
1247

    
1248
/** Thirdpel MC, (2/3,1/3) offset: 2x2 bilinear blend with weights 3/4/2/3. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
1258

    
1259
/** Thirdpel MC, (2/3,2/3) offset: 2x2 bilinear blend with weights 2/3/3/4. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
1269

    
1270
/** Thirdpel MC with averaging, zero offset: dispatched on block width. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}
1278

    
1279
/** Thirdpel MC, horizontal 1/3 offset, rounded-average with existing dst. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1289

    
1290
/** Thirdpel MC, horizontal 2/3 offset, rounded-average with existing dst. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1300
    
1301
/** Thirdpel MC, vertical 1/3 offset, rounded-average with existing dst. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1311
    
1312
/** Thirdpel MC, (1/3,1/3) offset, rounded-average with existing dst. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1322

    
1323
/** Thirdpel MC, (1/3,2/3) offset, rounded-average with existing dst. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1333

    
1334
/** Thirdpel MC, vertical 2/3 offset, rounded-average with existing dst. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1344

    
1345
/** Thirdpel MC, (2/3,1/3) offset, rounded-average with existing dst. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1355

    
1356
/** Thirdpel MC, (2/3,2/3) offset, rounded-average with existing dst. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1366
#if 0
/* Dead code: fixed-width thirdpel wrappers, kept disabled. NOTE(review):
 * would not compile if enabled — the wrapper bodies say
 * `void put_tpel_pixels_mcXX_c(...)` where a plain call was intended. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1387

    
1388
/**
 * Generator for H.264 chroma MC functions of widths 2, 4 and 8:
 * per-pixel bilinear interpolation of the 2x2 neighbourhood with
 * eighth-pel weights A..D (A+B+C+D == 64); OP supplies the final
 * rounding/shift and optional averaging with dst.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1450

    
1451
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1452
#define op_put(a, b) a = (((b) + 32)>>6)
1453

    
1454
H264_CHROMA_MC(put_       , op_put)
1455
H264_CHROMA_MC(avg_       , op_avg)
1456
#undef op_avg
1457
#undef op_put
1458

    
1459
/** Copy a 4-wide block of h rows using 32-bit loads/stores (LD32/ST32). */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
}
1469

    
1470
/** Copy an 8-wide block of h rows using two 32-bit loads/stores per row. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst+=dstStride;
        src+=srcStride;
    }
}
1481

    
1482
/** Copy a 16-wide block of h rows using four 32-bit loads/stores per row. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst+=dstStride;
        src+=srcStride;
    }
}
1495

    
1496
/** Copy a 17-wide block (16+1 edge pixel, as needed by qpel filters). */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst[16]= src[16];   /* trailing byte copied individually */
        dst+=dstStride;
        src+=srcStride;
    }
}
1510

    
1511
/** Copy a 9-wide block (8+1 edge pixel, as needed by qpel filters). */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst[8]= src[8];   /* trailing byte copied individually */
        dst+=dstStride;
        src+=srcStride;
    }
}
1523

    
1524

    
1525
#define QPEL_MC(r, OPNAME, RND, OP) \
1526
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1527
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1528
    int i;\
1529
    for(i=0; i<h; i++)\
1530
    {\
1531
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1532
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1533
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1534
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1535
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1536
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1537
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1538
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1539
        dst+=dstStride;\
1540
        src+=srcStride;\
1541
    }\
1542
}\
1543
\
1544
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1545
    const int w=8;\
1546
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1547
    int i;\
1548
    for(i=0; i<w; i++)\
1549
    {\
1550
        const int src0= src[0*srcStride];\
1551
        const int src1= src[1*srcStride];\
1552
        const int src2= src[2*srcStride];\
1553
        const int src3= src[3*srcStride];\
1554
        const int src4= src[4*srcStride];\
1555
        const int src5= src[5*srcStride];\
1556
        const int src6= src[6*srcStride];\
1557
        const int src7= src[7*srcStride];\
1558
        const int src8= src[8*srcStride];\
1559
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1560
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1561
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1562
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1563
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1564
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1565
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1566
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1567
        dst++;\
1568
        src++;\
1569
    }\
1570
}\
1571
\
1572
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1573
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1574
    int i;\
1575
    \
1576
    for(i=0; i<h; i++)\
1577
    {\
1578
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1579
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1580
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1581
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1582
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1583
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1584
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1585
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1586
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1587
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1588
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1589
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1590
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1591
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1592
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1593
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1594
        dst+=dstStride;\
1595
        src+=srcStride;\
1596
    }\
1597
}\
1598
\
1599
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1600
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1601
    int i;\
1602
    const int w=16;\
1603
    for(i=0; i<w; i++)\
1604
    {\
1605
        const int src0= src[0*srcStride];\
1606
        const int src1= src[1*srcStride];\
1607
        const int src2= src[2*srcStride];\
1608
        const int src3= src[3*srcStride];\
1609
        const int src4= src[4*srcStride];\
1610
        const int src5= src[5*srcStride];\
1611
        const int src6= src[6*srcStride];\
1612
        const int src7= src[7*srcStride];\
1613
        const int src8= src[8*srcStride];\
1614
        const int src9= src[9*srcStride];\
1615
        const int src10= src[10*srcStride];\
1616
        const int src11= src[11*srcStride];\
1617
        const int src12= src[12*srcStride];\
1618
        const int src13= src[13*srcStride];\
1619
        const int src14= src[14*srcStride];\
1620
        const int src15= src[15*srcStride];\
1621
        const int src16= src[16*srcStride];\
1622
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1623
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1624
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1625
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1626
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1627
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1628
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1629
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1630
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1631
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1632
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1633
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1634
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1635
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1636
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1637
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1638
        dst++;\
1639
        src++;\
1640
    }\
1641
}\
1642
\
1643
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1644
    OPNAME ## pixels8_c(dst, src, stride, 8);\
1645
}\
1646
\
1647
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1648
    uint8_t half[64];\
1649
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1650
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1651
}\
1652
\
1653
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1654
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1655
}\
1656
\
1657
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1658
    uint8_t half[64];\
1659
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1660
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1661
}\
1662
\
1663
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1664
    uint8_t full[16*9];\
1665
    uint8_t half[64];\
1666
    copy_block9(full, src, 16, stride, 9);\
1667
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1668
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1669
}\
1670
\
1671
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1672
    uint8_t full[16*9];\
1673
    copy_block9(full, src, 16, stride, 9);\
1674
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1675
}\
1676
\
1677
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1678
    uint8_t full[16*9];\
1679
    uint8_t half[64];\
1680
    copy_block9(full, src, 16, stride, 9);\
1681
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1682
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1683
}\
1684
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1685
    uint8_t full[16*9];\
1686
    uint8_t halfH[72];\
1687
    uint8_t halfV[64];\
1688
    uint8_t halfHV[64];\
1689
    copy_block9(full, src, 16, stride, 9);\
1690
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1691
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1692
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1693
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1694
}\
1695
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1696
    uint8_t full[16*9];\
1697
    uint8_t halfH[72];\
1698
    uint8_t halfHV[64];\
1699
    copy_block9(full, src, 16, stride, 9);\
1700
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1701
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1702
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1703
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1704
}\
1705
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1706
    uint8_t full[16*9];\
1707
    uint8_t halfH[72];\
1708
    uint8_t halfV[64];\
1709
    uint8_t halfHV[64];\
1710
    copy_block9(full, src, 16, stride, 9);\
1711
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1712
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1713
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1714
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1715
}\
1716
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1717
    uint8_t full[16*9];\
1718
    uint8_t halfH[72];\
1719
    uint8_t halfHV[64];\
1720
    copy_block9(full, src, 16, stride, 9);\
1721
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1722
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1723
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1724
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1725
}\
1726
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1727
    uint8_t full[16*9];\
1728
    uint8_t halfH[72];\
1729
    uint8_t halfV[64];\
1730
    uint8_t halfHV[64];\
1731
    copy_block9(full, src, 16, stride, 9);\
1732
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1733
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1734
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1735
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1736
}\
1737
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1738
    uint8_t full[16*9];\
1739
    uint8_t halfH[72];\
1740
    uint8_t halfHV[64];\
1741
    copy_block9(full, src, 16, stride, 9);\
1742
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1743
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1744
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1745
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1746
}\
1747
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1748
    uint8_t full[16*9];\
1749
    uint8_t halfH[72];\
1750
    uint8_t halfV[64];\
1751
    uint8_t halfHV[64];\
1752
    copy_block9(full, src, 16, stride, 9);\
1753
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1754
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1755
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1756
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1757
}\
1758
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1759
    uint8_t full[16*9];\
1760
    uint8_t halfH[72];\
1761
    uint8_t halfHV[64];\
1762
    copy_block9(full, src, 16, stride, 9);\
1763
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1764
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1765
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1766
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1767
}\
1768
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1769
    uint8_t halfH[72];\
1770
    uint8_t halfHV[64];\
1771
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1772
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1773
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1774
}\
1775
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1776
    uint8_t halfH[72];\
1777
    uint8_t halfHV[64];\
1778
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1779
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1780
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1781
}\
1782
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1783
    uint8_t full[16*9];\
1784
    uint8_t halfH[72];\
1785
    uint8_t halfV[64];\
1786
    uint8_t halfHV[64];\
1787
    copy_block9(full, src, 16, stride, 9);\
1788
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1789
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1790
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1791
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1792
}\
1793
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1794
    uint8_t full[16*9];\
1795
    uint8_t halfH[72];\
1796
    copy_block9(full, src, 16, stride, 9);\
1797
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1798
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1799
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1800
}\
1801
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1802
    uint8_t full[16*9];\
1803
    uint8_t halfH[72];\
1804
    uint8_t halfV[64];\
1805
    uint8_t halfHV[64];\
1806
    copy_block9(full, src, 16, stride, 9);\
1807
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1808
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1809
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1810
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1811
}\
1812
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1813
    uint8_t full[16*9];\
1814
    uint8_t halfH[72];\
1815
    copy_block9(full, src, 16, stride, 9);\
1816
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1817
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1818
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1819
}\
1820
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1821
    uint8_t halfH[72];\
1822
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1823
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1824
}\
1825
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1826
    OPNAME ## pixels16_c(dst, src, stride, 16);\
1827
}\
1828
\
1829
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1830
    uint8_t half[256];\
1831
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1832
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1833
}\
1834
\
1835
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1836
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1837
}\
1838
\
1839
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1840
    uint8_t half[256];\
1841
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1842
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1843
}\
1844
\
1845
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1846
    uint8_t full[24*17];\
1847
    uint8_t half[256];\
1848
    copy_block17(full, src, 24, stride, 17);\
1849
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1850
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1851
}\
1852
\
1853
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1854
    uint8_t full[24*17];\
1855
    copy_block17(full, src, 24, stride, 17);\
1856
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1857
}\
1858
\
1859
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1860
    uint8_t full[24*17];\
1861
    uint8_t half[256];\
1862
    copy_block17(full, src, 24, stride, 17);\
1863
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1864
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1865
}\
1866
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1867
    uint8_t full[24*17];\
1868
    uint8_t halfH[272];\
1869
    uint8_t halfV[256];\
1870
    uint8_t halfHV[256];\
1871
    copy_block17(full, src, 24, stride, 17);\
1872
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1873
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1874
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1875
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1876
}\
1877
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1878
    uint8_t full[24*17];\
1879
    uint8_t halfH[272];\
1880
    uint8_t halfHV[256];\
1881
    copy_block17(full, src, 24, stride, 17);\
1882
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1883
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1884
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1885
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1886
}\
1887
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1888
    uint8_t full[24*17];\
1889
    uint8_t halfH[272];\
1890
    uint8_t halfV[256];\
1891
    uint8_t halfHV[256];\
1892
    copy_block17(full, src, 24, stride, 17);\
1893
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1894
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1895
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1896
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1897
}\
1898
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1899
    uint8_t full[24*17];\
1900
    uint8_t halfH[272];\
1901
    uint8_t halfHV[256];\
1902
    copy_block17(full, src, 24, stride, 17);\
1903
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1904
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1905
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1906
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1907
}\
1908
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1909
    uint8_t full[24*17];\
1910
    uint8_t halfH[272];\
1911
    uint8_t halfV[256];\
1912
    uint8_t halfHV[256];\
1913
    copy_block17(full, src, 24, stride, 17);\
1914
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1915
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1916
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1917
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1918
}\
1919
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1920
    uint8_t full[24*17];\
1921
    uint8_t halfH[272];\
1922
    uint8_t halfHV[256];\
1923
    copy_block17(full, src, 24, stride, 17);\
1924
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1925
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1926
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1927
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1928
}\
1929
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1930
    uint8_t full[24*17];\
1931
    uint8_t halfH[272];\
1932
    uint8_t halfV[256];\
1933
    uint8_t halfHV[256];\
1934
    copy_block17(full, src, 24, stride, 17);\
1935
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1936
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1937
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1938
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1939
}\
1940
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1941
    uint8_t full[24*17];\
1942
    uint8_t halfH[272];\
1943
    uint8_t halfHV[256];\
1944
    copy_block17(full, src, 24, stride, 17);\
1945
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1946
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1947
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1948
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1949
}\
1950
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1951
    uint8_t halfH[272];\
1952
    uint8_t halfHV[256];\
1953
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1954
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1955
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1956
}\
1957
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1958
    uint8_t halfH[272];\
1959
    uint8_t halfHV[256];\
1960
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1961
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1962
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1963
}\
1964
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1965
    uint8_t full[24*17];\
1966
    uint8_t halfH[272];\
1967
    uint8_t halfV[256];\
1968
    uint8_t halfHV[256];\
1969
    copy_block17(full, src, 24, stride, 17);\
1970
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1971
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1972
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1973
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1974
}\
1975
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1976
    uint8_t full[24*17];\
1977
    uint8_t halfH[272];\
1978
    copy_block17(full, src, 24, stride, 17);\
1979
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1980
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1981
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1982
}\
1983
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1984
    uint8_t full[24*17];\
1985
    uint8_t halfH[272];\
1986
    uint8_t halfV[256];\
1987
    uint8_t halfHV[256];\
1988
    copy_block17(full, src, 24, stride, 17);\
1989
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1990
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1991
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1992
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1993
}\
1994
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1995
    uint8_t full[24*17];\
1996
    uint8_t halfH[272];\
1997
    copy_block17(full, src, 24, stride, 17);\
1998
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1999
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2000
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2001
}\
2002
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2003
    uint8_t halfH[272];\
2004
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2005
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2006
}
2007

    
2008
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2009
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2010
#define op_put(a, b) a = cm[((b) + 16)>>5]
2011
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2012

    
2013
QPEL_MC(0, put_       , _       , op_put)
2014
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2015
QPEL_MC(0, avg_       , _       , op_avg)
2016
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
2017
#undef op_avg
2018
#undef op_avg_no_rnd
2019
#undef op_put
2020
#undef op_put_no_rnd
2021

    
2022
#if 1
2023
#define H264_LOWPASS(OPNAME, OP, OP2) \
2024
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2025
    const int h=4;\
2026
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2027
    int i;\
2028
    for(i=0; i<h; i++)\
2029
    {\
2030
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2031
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2032
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2033
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2034
        dst+=dstStride;\
2035
        src+=srcStride;\
2036
    }\
2037
}\
2038
\
2039
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2040
    const int w=4;\
2041
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2042
    int i;\
2043
    for(i=0; i<w; i++)\
2044
    {\
2045
        const int srcB= src[-2*srcStride];\
2046
        const int srcA= src[-1*srcStride];\
2047
        const int src0= src[0 *srcStride];\
2048
        const int src1= src[1 *srcStride];\
2049
        const int src2= src[2 *srcStride];\
2050
        const int src3= src[3 *srcStride];\
2051
        const int src4= src[4 *srcStride];\
2052
        const int src5= src[5 *srcStride];\
2053
        const int src6= src[6 *srcStride];\
2054
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2055
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2056
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2057
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2058
        dst++;\
2059
        src++;\
2060
    }\
2061
}\
2062
\
2063
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2064
    const int h=4;\
2065
    const int w=4;\
2066
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2067
    int i;\
2068
    src -= 2*srcStride;\
2069
    for(i=0; i<h+5; i++)\
2070
    {\
2071
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2072
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2073
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2074
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2075
        tmp+=tmpStride;\
2076
        src+=srcStride;\
2077
    }\
2078
    tmp -= tmpStride*(h+5-2);\
2079
    for(i=0; i<w; i++)\
2080
    {\
2081
        const int tmpB= tmp[-2*tmpStride];\
2082
        const int tmpA= tmp[-1*tmpStride];\
2083
        const int tmp0= tmp[0 *tmpStride];\
2084
        const int tmp1= tmp[1 *tmpStride];\
2085
        const int tmp2= tmp[2 *tmpStride];\
2086
        const int tmp3= tmp[3 *tmpStride];\
2087
        const int tmp4= tmp[4 *tmpStride];\
2088
        const int tmp5= tmp[5 *tmpStride];\
2089
        const int tmp6= tmp[6 *tmpStride];\
2090
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2091
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2092
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2093
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2094
        dst++;\
2095
        tmp++;\
2096
    }\
2097
}\
2098
\
2099
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2100
    const int h=8;\
2101
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2102
    int i;\
2103
    for(i=0; i<h; i++)\
2104
    {\
2105
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2106
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2107
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2108
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2109
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2110
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2111
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2112
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2113
        dst+=dstStride;\
2114
        src+=srcStride;\
2115
    }\
2116
}\
2117
\
2118
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2119
    const int w=8;\
2120
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2121
    int i;\
2122
    for(i=0; i<w; i++)\
2123
    {\
2124
        const int srcB= src[-2*srcStride];\
2125
        const int srcA= src[-1*srcStride];\
2126
        const int src0= src[0 *srcStride];\
2127
        const int src1= src[1 *srcStride];\
2128
        const int src2= src[2 *srcStride];\
2129
        const int src3= src[3 *srcStride];\
2130
        const int src4= src[4 *srcStride];\
2131
        const int src5= src[5 *srcStride];\
2132
        const int src6= src[6 *srcStride];\
2133
        const int src7= src[7 *srcStride];\
2134
        const int src8= src[8 *srcStride];\
2135
        const int src9= src[9 *srcStride];\
2136
        const int src10=src[10*srcStride];\
2137
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2138
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2139
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2140
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2141
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2142
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2143
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2144
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2145
        dst++;\
2146
        src++;\
2147
    }\
2148
}\
2149
\
2150
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2151
    const int h=8;\
2152
    const int w=8;\
2153
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2154
    int i;\
2155
    src -= 2*srcStride;\
2156
    for(i=0; i<h+5; i++)\
2157
    {\
2158
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2159
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2160
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2161
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2162
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2163
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2164
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2165
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2166
        tmp+=tmpStride;\
2167
        src+=srcStride;\
2168
    }\
2169
    tmp -= tmpStride*(h+5-2);\
2170
    for(i=0; i<w; i++)\
2171
    {\
2172
        const int tmpB= tmp[-2*tmpStride];\
2173
        const int tmpA= tmp[-1*tmpStride];\
2174
        const int tmp0= tmp[0 *tmpStride];\
2175
        const int tmp1= tmp[1 *tmpStride];\
2176
        const int tmp2= tmp[2 *tmpStride];\
2177
        const int tmp3= tmp[3 *tmpStride];\
2178
        const int tmp4= tmp[4 *tmpStride];\
2179
        const int tmp5= tmp[5 *tmpStride];\
2180
        const int tmp6= tmp[6 *tmpStride];\
2181
        const int tmp7= tmp[7 *tmpStride];\
2182
        const int tmp8= tmp[8 *tmpStride];\
2183
        const int tmp9= tmp[9 *tmpStride];\
2184
        const int tmp10=tmp[10*tmpStride];\
2185
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2186
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2187
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2188
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2189
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2190
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2191
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2192
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2193
        dst++;\
2194
        tmp++;\
2195
    }\
2196
}\
2197
\
2198
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2199
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2200
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2201
    src += 8*srcStride;\
2202
    dst += 8*dstStride;\
2203
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2204
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2205
}\
2206
\
2207
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2208
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2209
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2210
    src += 8*srcStride;\
2211
    dst += 8*dstStride;\
2212
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2213
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2214
}\
2215
\
2216
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2217
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2218
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2219
    src += 8*srcStride;\
2220
    dst += 8*dstStride;\
2221
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2222
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2223
}\
2224

    
2225
/* Generates the 16 H.264 quarter-pel motion-compensation functions
   (mc00..mc33) for one output op (put_/avg_) and one block SIZE.
   The fractional position is encoded in the name: mcXY = X quarter
   pels horizontally, Y vertically.  Intermediate half-pel planes are
   produced with the 6-tap lowpass helpers and blended with the
   pixelsN_l2 averaging helpers.
   NOTE: extraction artifacts (interleaved line numbers) were removed
   from this span; the code itself is unchanged. */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2363
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2364
#define op_put(a, b)  a = cm[((b) + 16)>>5]
2365
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2366
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
2367

    
2368
H264_LOWPASS(put_       , op_put, op2_put)
2369
H264_LOWPASS(avg_       , op_avg, op2_avg)
2370
H264_MC(put_, 4)
2371
H264_MC(put_, 8)
2372
H264_MC(put_, 16)
2373
H264_MC(avg_, 4)
2374
H264_MC(avg_, 8)
2375
H264_MC(avg_, 16)
2376

    
2377
#undef op_avg
2378
#undef op_put
2379
#undef op2_avg
2380
#undef op2_put
2381
#endif
2382

    
2383
/* Generates the H.264 weighted-prediction functions for one WxH block:
   weight_h264_pixelsWxH_c   : in-place scale block[] by 'weight' with
                               rounding offset, >> log2_denom, clipped to 8 bit
   biweight_h264_pixelsWxH_c : blend src[]/dst[] with two weights, combined
                               offset, >> (log2_denom+1), clipped to 8 bit
   The 'if(W==...) continue;' lines let one loop body serve every width;
   the compiler removes the dead tail for narrow blocks. */
#define op_scale1(x)  block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int attribute_unused x, y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \
    int attribute_unused x, y; \
    int offset = (offsets + offsetd + 1) >> 1; \
    offset = ((offset << 1) + 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}
/* Instantiate weighted prediction for every H.264 partition size. */
H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)
/* The weighting helpers are only needed by the instantiations above. */
#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
/**
 * WMV2 horizontal half-pel filter: 4-tap (-1,9,9,-1)/16 with rounding,
 * applied to 8 pixels per row for h rows.  Reads src[-1]..src[9], so the
 * caller must provide one pixel of left margin and two of right margin.
 * Results are clipped through the cropTbl lookup.
 */
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<h; i++){
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }
}
/**
 * WMV2 vertical half-pel filter: same 4-tap (-1,9,9,-1)/16 kernel as the
 * horizontal version, applied down each of w columns for 8 output rows.
 * Reads src[-srcStride]..src[9*srcStride], so the caller must provide one
 * row above and two rows below the 8 output rows.
 */
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<w; i++){
        const int src_1= src[ -srcStride];
        const int src0 = src[0          ];
        const int src1 = src[  srcStride];
        const int src2 = src[2*srcStride];
        const int src3 = src[3*srcStride];
        const int src4 = src[4*srcStride];
        const int src5 = src[5*srcStride];
        const int src6 = src[6*srcStride];
        const int src7 = src[7*srcStride];
        const int src8 = src[8*srcStride];
        const int src9 = src[9*srcStride];
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
        src++;
        dst++;
    }
}
/** WMV2 mspel MC, integer position: plain 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
/** WMV2 mspel MC, quarter-pel right: average of src and the horizontal half-pel plane. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
/** WMV2 mspel MC, horizontal half-pel: filter straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
/** WMV2 mspel MC, three-quarter-pel right: average of src+1 and the horizontal half-pel plane. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
/** WMV2 mspel MC, vertical half-pel: filter straight into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
/**
 * WMV2 mspel MC, quarter-pel right + half-pel down: average of the vertical
 * half-pel plane and the HV plane (vertical filter over the horizontal one).
 * halfH holds 11 filtered rows (one above, two below) feeding the V pass.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/**
 * WMV2 mspel MC, three-quarter-pel right + half-pel down: like mc12 but the
 * pure vertical plane is taken one pixel to the right (src+1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/** WMV2 mspel MC, half-pel both ways: horizontal pass then vertical pass into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
/**
 * H.263 deblocking filter across a horizontal block edge (vertical filtering).
 * src points at the first row below the edge; p0,p1 are above, p2,p3 below.
 * Filter strength is looked up from qscale.  The d1 ramp limits correction
 * near the strength threshold; the '&256' checks clamp p1/p2 to 0..255
 * (~(x>>31) yields 0 for negative x, 255 for overflow).
 */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear "deadzone" ramp for the main correction */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        /* secondary, weaker correction on the outer pixels */
        ad1= ABS(d1)>>1;

        d2= clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
}
/**
 * H.263 deblocking filter across a vertical block edge (horizontal filtering).
 * Mirror of h263_v_loop_filter_c: p0,p1 are left of the edge, p2,p3 right.
 */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear "deadzone" ramp for the main correction */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        /* secondary, weaker correction on the outer pixels */
        ad1= ABS(d1)>>1;

        d2= clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
}
/**
 * H.261 in-loop filter: separable (1,2,1)/4 smoothing of an 8x8 block.
 * Vertical pass into temp[] (edge rows are passed through, scaled by 4 so
 * every temp entry carries the same gain), then horizontal pass back into
 * src with rounding; edge columns are likewise passed through.
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
/**
 * H.264 normal (non-intra) luma deblocking of one 16-pixel edge, as four
 * 4-pixel groups with per-group strength tc0[i] (negative = skip group).
 * xstride steps across the edge, ystride along it, so one routine serves
 * both vertical and horizontal edges.  For each pixel line the edge is
 * filtered only when the |p0-q0|/|p1-p0|/|q1-q0| thresholds pass; p1/q1
 * get an extra clipped correction when the inner samples are smooth
 * (|p2-p0| / |q2-q0| < beta), which also widens the clip range tc.
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                if( ABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( ABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/** Luma deblocking of a horizontal edge (filter runs vertically). */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
/** Luma deblocking of a vertical edge (filter runs horizontally). */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
/**
 * H.264 normal chroma deblocking of one 8-pixel edge, as four 2-pixel
 * groups with per-group strength tc0[i] (<= 0 = skip group).  Only p0/q0
 * are modified, with a single clipped delta; thresholds as for luma.
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                int delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/** Chroma deblocking of a horizontal edge (filter runs vertically). */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
/** Chroma deblocking of a vertical edge (filter runs horizontally). */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
/**
 * H.264 strong (intra-edge) chroma deblocking of one 8-pixel edge.
 * When the thresholds pass, p0/q0 are replaced by fixed (2,1,1)/4
 * averages — no tc clipping in the intra case.
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( ABS( p0 - q0 ) < alpha &&
            ABS( p1 - p0 ) < beta &&
            ABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
/** Intra chroma deblocking of a horizontal edge (filter runs vertically). */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
/** Intra chroma deblocking of a vertical edge (filter runs horizontally). */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
/**
 * SAD of a 16-pixel-wide block: sum of |pix1[x] - pix2[x]| over 16 columns
 * and h rows.  v is an unused context pointer (kept for the common motion
 * estimation comparator signature); line_size is the stride of both planes.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * SAD of a 16-wide block against pix2 interpolated half a pixel to the
 * right: each reference sample is avg2() of two horizontally adjacent
 * pix2 pixels.  Reads pix2[0..16] per row (one extra column).
 * (avg2 is defined elsewhere in this file — presumably a rounded average.)
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * SAD of a 16-wide block against pix2 interpolated half a pixel down:
 * each reference sample is avg2() of vertically adjacent pix2 pixels
 * (pix3 is the next row).  Reads h+1 rows of pix2.
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * SAD of a 16-wide block against pix2 interpolated half a pixel right and
 * down: each reference sample is avg4() of a 2x2 neighbourhood.  Reads one
 * extra column and one extra row of pix2.
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * SAD of an 8-pixel-wide block: sum of |pix1[x] - pix2[x]| over 8 columns
 * and h rows.  v is an unused context pointer; line_size is the stride.
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * SAD of an 8-wide block against pix2 interpolated half a pixel to the
 * right via avg2() of adjacent columns.  Reads pix2[0..8] per row.
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * SAD of an 8-wide block against pix2 interpolated half a pixel down via
 * avg2() of vertically adjacent rows (pix3 = next row).  Reads h+1 rows.
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * SAD of an 8-wide block against pix2 interpolated half a pixel right and
 * down via avg4() of each 2x2 neighbourhood.  Reads one extra column and
 * one extra row of pix2.
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * Noise-shaped SSE for a 16-pixel-wide block: plain SSE plus a weighted
 * penalty for the difference in local 2x2 gradients between the two blocks.
 */
static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int sse  = 0;   /* sum of squared pixel errors */
    int grad = 0;   /* accumulated gradient mismatch */
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++) {
            const int d = s1[x] - s2[x];
            sse += d * d;
        }
        if (y + 1 < h) {  /* gradient term needs the next line */
            for (x = 0; x < 15; x++) {
                grad += ABS(  s1[x  ] - s1[x  + stride]
                            - s1[x+1] + s1[x+1 + stride])
                      - ABS(  s2[x  ] - s2[x  + stride]
                            - s2[x+1] + s2[x+1 + stride]);
            }
        }
        s1 += stride;
        s2 += stride;
    }

    /* weight defaults to 8 when no context is available */
    return sse + ABS(grad) * (c ? c->avctx->nsse_weight : 8);
}
2984

    
2985
/**
 * Noise-shaped SSE for an 8-pixel-wide block; see nsse16_c.
 */
static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int sse  = 0;   /* sum of squared pixel errors */
    int grad = 0;   /* accumulated gradient mismatch */
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++) {
            const int d = s1[x] - s2[x];
            sse += d * d;
        }
        if (y + 1 < h) {  /* gradient term needs the next line */
            for (x = 0; x < 7; x++) {
                grad += ABS(  s1[x  ] - s1[x  + stride]
                            - s1[x+1] + s1[x+1 + stride])
                      - ABS(  s2[x  ] - s2[x  + stride]
                            - s2[x+1] + s2[x+1 + stride]);
            }
        }
        s1 += stride;
        s2 += stride;
    }

    /* weight defaults to 8 when no context is available */
    return sse + ABS(grad) * (c ? c->avctx->nsse_weight : 8);
}
3009

    
3010
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3011
    int i;
3012
    unsigned int sum=0;
3013

    
3014
    for(i=0; i<8*8; i++){
3015
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3016
        int w= weight[i];
3017
        b>>= RECON_SHIFT;
3018
        assert(-512<b && b<512);
3019

    
3020
        sum += (w*b)*(w*b)>>4;
3021
    }
3022
    return sum>>2;
3023
}
3024

    
3025
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3026
    int i;
3027

    
3028
    for(i=0; i<8*8; i++){
3029
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3030
    }    
3031
}
3032

    
3033
/**
3034
 * permutes an 8x8 block.
3035
 * @param block the block which will be permuted according to the given permutation vector
3036
 * @param permutation the permutation vector
3037
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3038
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not 
3039
 *                  (inverse) permutated to scantable order!
3040
 */
3041
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3042
{
3043
    int i;
3044
    DCTELEM temp[64];
3045
    
3046
    if(last<=0) return;
3047
    //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
3048

    
3049
    for(i=0; i<=last; i++){
3050
        const int j= scantable[i];
3051
        temp[j]= block[j];
3052
        block[j]=0;
3053
    }
3054
    
3055
    for(i=0; i<=last; i++){
3056
        const int j= scantable[i];
3057
        const int perm_j= permutation[j];
3058
        block[perm_j]= temp[j];
3059
    }
3060
}
3061

    
3062
/* cmp function that ignores its arguments and always reports a cost of 0 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    (void)s; (void)a; (void)b; (void)stride; (void)h;
    return 0;
}
3065

    
3066
/**
 * Fill the five-entry comparison-function table 'cmp' with the functions
 * from 'c' selected by the low byte of 'type' (one of FF_CMP_*).
 * Entries for an unknown type are left zeroed and an error is logged.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*5);

    for(i=0; i<5; i++){
        me_cmp_func f;

        switch(type&0xFF){
        case FF_CMP_SAD:    f = c->sad[i];            break;
        case FF_CMP_SATD:   f = c->hadamard8_diff[i]; break;
        case FF_CMP_SSE:    f = c->sse[i];            break;
        case FF_CMP_DCT:    f = c->dct_sad[i];        break;
        case FF_CMP_DCTMAX: f = c->dct_max[i];        break;
        case FF_CMP_PSNR:   f = c->quant_psnr[i];     break;
        case FF_CMP_BIT:    f = c->bit[i];            break;
        case FF_CMP_RD:     f = c->rd[i];             break;
        case FF_CMP_VSAD:   f = c->vsad[i];           break;
        case FF_CMP_VSSE:   f = c->vsse[i];           break;
        case FF_CMP_ZERO:   f = zero_cmp;             break;
        case FF_CMP_NSSE:   f = c->nsse[i];           break;
        case FF_CMP_W53:    f = c->w53[i];            break;
        case FF_CMP_W97:    f = c->w97[i];            break;
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
            continue;   /* leave cmp[i] as the zero set by memset */
        }
        cmp[i] = f;
    }
}
3120

    
3121
/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    /* zero all six 64-coefficient blocks of one macroblock in a single call */
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
3128

    
3129
/**
 * dst[i] += src[i] for i in [0, w); byte arithmetic wraps modulo 256.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i = 0;

    /* main loop: eight additions per iteration */
    for (; i + 7 < w; i += 8) {
        int j;
        for (j = 0; j < 8; j++)
            dst[i + j] += src[i + j];
    }
    /* remaining 0..7 tail bytes */
    for (; i < w; i++)
        dst[i] += src[i];
}
3144

    
3145
/**
 * dst[i] = src1[i] - src2[i] for i in [0, w); byte arithmetic wraps modulo 256.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i = 0;

    /* main loop: eight subtractions per iteration */
    for (; i + 7 < w; i += 8) {
        int j;
        for (j = 0; j < 8; j++)
            dst[i + j] = src1[i + j] - src2[i + j];
    }
    /* remaining 0..7 tail bytes */
    for (; i < w; i++)
        dst[i] = src1[i] - src2[i];
}
3160

    
3161
/**
 * HuffYUV median prediction: dst[i] = src2[i] minus the median predictor
 * formed from the left sample, the sample above (src1[i]) and the
 * left+above-topleft gradient. *left / *left_top carry state across calls.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t left_val = *left;
    uint8_t topleft  = *left_top;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(left_val, src1[i],
                                  (left_val + src1[i] - topleft) & 0xFF);
        topleft  = src1[i];   /* above sample becomes next top-left */
        left_val = src2[i];   /* current sample becomes next left */
        dst[i]   = left_val - pred;
    }

    *left     = left_val;
    *left_top = topleft;
}
3178

    
3179
/* butterfly: o1/o2 receive the sum and difference of i1/i2 */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* in-place butterfly on the two lvalues x and y (each read exactly once) */
#define BUTTERFLY1(x,y) \
{\
    int _a, _b;\
    _a= x;\
    _b= y;\
    x= _a+_b;\
    y= _a-_b;\
}

/* |x+y| + |x-y| without storing anything back */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
3193

    
3194
/**
 * SATD of an 8x8 block: 2-D Hadamard transform of the residual (src - dst)
 * followed by the sum of absolute transform coefficients.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int row, col;
    int but[64];
    int sum = 0;

    assert(h==8);

    /* horizontal 8-point Hadamard transform of the residual, row by row */
    for (row = 0; row < 8; row++) {
        int *t = but + 8*row;
        const uint8_t *sp = src + stride*row;
        const uint8_t *dp = dst + stride*row;

        BUTTERFLY2(t[0], t[1], sp[0]-dp[0], sp[1]-dp[1]);
        BUTTERFLY2(t[2], t[3], sp[2]-dp[2], sp[3]-dp[3]);
        BUTTERFLY2(t[4], t[5], sp[4]-dp[4], sp[5]-dp[5]);
        BUTTERFLY2(t[6], t[7], sp[6]-dp[6], sp[7]-dp[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* vertical transform; the final butterfly stage is folded into the
       absolute-value accumulation via BUTTERFLYA */
    for (col = 0; col < 8; col++) {
        BUTTERFLY1(but[8*0+col], but[8*1+col]);
        BUTTERFLY1(but[8*2+col], but[8*3+col]);
        BUTTERFLY1(but[8*4+col], but[8*5+col]);
        BUTTERFLY1(but[8*6+col], but[8*7+col]);

        BUTTERFLY1(but[8*0+col], but[8*2+col]);
        BUTTERFLY1(but[8*1+col], but[8*3+col]);
        BUTTERFLY1(but[8*4+col], but[8*6+col]);
        BUTTERFLY1(but[8*5+col], but[8*7+col]);

        sum += BUTTERFLYA(but[8*0+col], but[8*4+col])
             + BUTTERFLYA(but[8*1+col], but[8*5+col])
             + BUTTERFLYA(but[8*2+col], but[8*6+col])
             + BUTTERFLYA(but[8*3+col], but[8*7+col]);
    }
    return sum;
}
3245

    
3246
/**
 * Intra SATD of an 8x8 block: 2-D Hadamard transform of the source pixels,
 * sum of absolute coefficients, with the DC term subtracted so the score
 * does not depend on the block mean.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int row, col;
    int but[64];
    int sum = 0;

    assert(h==8);

    /* horizontal 8-point Hadamard transform, row by row */
    for (row = 0; row < 8; row++) {
        int *t = but + 8*row;
        const uint8_t *sp = src + stride*row;

        BUTTERFLY2(t[0], t[1], sp[0], sp[1]);
        BUTTERFLY2(t[2], t[3], sp[2], sp[3]);
        BUTTERFLY2(t[4], t[5], sp[4], sp[5]);
        BUTTERFLY2(t[6], t[7], sp[6], sp[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* vertical transform; last butterfly stage folded into BUTTERFLYA */
    for (col = 0; col < 8; col++) {
        BUTTERFLY1(but[8*0+col], but[8*1+col]);
        BUTTERFLY1(but[8*2+col], but[8*3+col]);
        BUTTERFLY1(but[8*4+col], but[8*5+col]);
        BUTTERFLY1(but[8*6+col], but[8*7+col]);

        BUTTERFLY1(but[8*0+col], but[8*2+col]);
        BUTTERFLY1(but[8*1+col], but[8*3+col]);
        BUTTERFLY1(but[8*4+col], but[8*6+col]);
        BUTTERFLY1(but[8*5+col], but[8*7+col]);

        sum += BUTTERFLYA(but[8*0+col], but[8*4+col])
             + BUTTERFLYA(but[8*1+col], but[8*5+col])
             + BUTTERFLYA(but[8*2+col], but[8*6+col])
             + BUTTERFLYA(but[8*3+col], but[8*7+col]);
    }

    sum -= ABS(but[8*0] + but[8*4]); // -mean

    return sum;
}
3293

    
3294
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3295
    MpegEncContext * const s= (MpegEncContext *)c;
3296
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3297
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3298
    int sum=0, i;
3299
    
3300
    assert(h==8);
3301

    
3302
    s->dsp.diff_pixels(temp, src1, src2, stride);
3303
    s->dsp.fdct(temp);
3304

    
3305
    for(i=0; i<64; i++)
3306
        sum+= ABS(temp[i]);
3307
        
3308
    return sum;
3309
}
3310

    
3311
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3312
    MpegEncContext * const s= (MpegEncContext *)c;
3313
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3314
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3315
    int sum=0, i;
3316
    
3317
    assert(h==8);
3318

    
3319
    s->dsp.diff_pixels(temp, src1, src2, stride);
3320
    s->dsp.fdct(temp);
3321

    
3322
    for(i=0; i<64; i++)
3323
        sum= FFMAX(sum, ABS(temp[i]));
3324
        
3325
    return sum;
3326
}
3327

    
3328
void simple_idct(DCTELEM *block); //FIXME
3329

    
3330
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3331
    MpegEncContext * const s= (MpegEncContext *)c;
3332
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
3333
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3334
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3335
    int sum=0, i;
3336

    
3337
    assert(h==8);
3338
    s->mb_intra=0;
3339
    
3340
    s->dsp.diff_pixels(temp, src1, src2, stride);
3341
    
3342
    memcpy(bak, temp, 64*sizeof(DCTELEM));
3343
    
3344
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3345
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
3346
    simple_idct(temp); //FIXME 
3347
    
3348
    for(i=0; i<64; i++)
3349
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3350
        
3351
    return sum;
3352
}
3353

    
3354
/**
 * Rate-distortion score for one 8x8 block.
 * Quantizes the DCT of the residual, counts the VLC bits needed to code the
 * quantized coefficients, reconstructs the block, and returns the SSE
 * distortion plus a qscale-weighted bit-cost term.
 * NOTE(review): the 109 factor in the final cost (109/128 ~ 0.85) presumably
 * converts qscale^2 into a lambda value - confirm against the encoder.
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    /* VLA: stride*8 bytes, i.e. eight picture lines, used as the
       reconstruction target so idct_add can use the real stride */
    uint64_t __align8 aligned_bak[stride];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distoration, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* copy the 8x8 prediction (src2) into bak, 8 bytes per line */
    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    /* quantize; 'last' is the index of the last nonzero coeff in scan order */
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the VLC length tables; intra blocks also pay for the DC coeff */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* count bits for every (run, level) pair except the last coeff */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;  /* bias so levels -64..63 map to 0..127 */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;  /* outside table range: escape code */
                run=0;
            }else
                run++;
        }
        /* the last nonzero coeff uses the "last" length table */
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);  /* last coeff must be nonzero */

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* reconstruct: dequantize and add the IDCT onto the prediction in bak */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(bak, stride, temp);

    distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);

    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
3432

    
3433
/**
 * Bit-cost score for one 8x8 block: quantizes the DCT of the residual and
 * returns the number of VLC bits needed to code the quantized coefficients
 * (rate only - no distortion term; compare rd8x8_c).
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    /* quantize; 'last' is the index of the last nonzero coeff in scan order */
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the VLC length tables; intra blocks also pay for the DC coeff */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* count bits for every (run, level) pair except the last coeff */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;  /* bias so levels -64..63 map to 0..127 */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;  /* outside table range: escape code */
                run=0;
            }else
                run++;
        }
        /* the last nonzero coeff uses the "last" length table */
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);  /* last coeff must be nonzero */

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
3492

    
3493
/**
 * Sum of absolute vertical gradients of a 16-pixel-wide block
 * (intra variant: measures the source alone).
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int sum = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            sum += ABS(s[x] - s[x + stride]);
        s += stride;
    }

    return sum;
}
3507

    
3508
/**
 * Sum of absolute vertical gradient differences between two
 * 16-pixel-wide blocks.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int sum = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            sum += ABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
        s1 += stride;
        s2 += stride;
    }

    return sum;
}
3522

    
3523
/* squared value */
#define SQ(a) ((a)*(a))

/**
 * Sum of squared vertical gradients of a 16-pixel-wide block
 * (intra variant: measures the source alone).
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int sum = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            sum += SQ(s[x] - s[x + stride]);
        s += stride;
    }

    return sum;
}
3538

    
3539
/**
 * Sum of squared vertical gradient differences between two
 * 16-pixel-wide blocks.
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int sum = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            sum += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
        s1 += stride;
        s2 += stride;
    }

    return sum;
}
3553

    
3554
/* Instantiate the 16x16 variants of the 8x8 comparison functions via the
   WARPER8_16_SQ macro (defined elsewhere - presumably it sums the 8x8
   score over the four 8x8 quadrants; TODO confirm against dsputil.h). */
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WARPER8_16_SQ(rd8x8_c, rd16_c)
WARPER8_16_SQ(bit8x8_c, bit16_c)
3561

    
3562
/* XXX: those functions should be suppressed ASAP when all IDCTs are
 converted */
/* Full 8x8 reference JPEG IDCT, then clamped store into the destination. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* Full 8x8 reference JPEG IDCT, then clamped add onto the destination. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
3574

    
3575
/* 4x4 reduced IDCT (lowres 1), then clamped store into the destination. */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/* 4x4 reduced IDCT (lowres 1), then clamped add onto the destination. */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
3585

    
3586
/* 2x2 reduced IDCT (lowres 2), then clamped store into the destination. */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/* 2x2 reduced IDCT (lowres 2), then clamped add onto the destination. */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
3596

    
3597
/* 1x1 "IDCT" (lowres 3): only the DC coefficient survives.
   (block[0] + 4) >> 3 rescales DC with rounding; cm[] clamps to 0..255. */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
/* As above, but adds the rescaled DC onto the existing pixel, clamped. */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
3609

    
3610
/* init static data */
3611
void dsputil_static_init(void)
3612
{
3613
    int i;
3614

    
3615
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3616
    for(i=0;i<MAX_NEG_CROP;i++) {
3617
        cropTbl[i] = 0;
3618
        cropTbl[i + MAX_NEG_CROP + 256] = 255;
3619
    }
3620
    
3621
    for(i=0;i<512;i++) {
3622
        squareTbl[i] = (i - 256) * (i - 256);
3623
    }
3624
    
3625
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3626
}
3627

    
3628

    
3629
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3630
{
3631
    int i;
3632

    
3633
#ifdef CONFIG_ENCODERS
3634
    if(avctx->dct_algo==FF_DCT_FASTINT) {
3635
        c->fdct = fdct_ifast;
3636
        c->fdct248 = fdct_ifast248;
3637
    } 
3638
    else if(avctx->dct_algo==FF_DCT_FAAN) {
3639
        c->fdct = ff_faandct;
3640
        c->fdct248 = ff_faandct248; 
3641
    } 
3642
    else {
3643
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3644
        c->fdct248 = ff_fdct248_islow;
3645
    }
3646
#endif //CONFIG_ENCODERS
3647

    
3648
    if(avctx->lowres==1){
3649
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3650
            c->idct_put= ff_jref_idct4_put;
3651
            c->idct_add= ff_jref_idct4_add;
3652
        }else{
3653
            c->idct_put= ff_h264_lowres_idct_put_c;
3654
            c->idct_add= ff_h264_lowres_idct_add_c;
3655
        }
3656
        c->idct    = j_rev_dct4;
3657
        c->idct_permutation_type= FF_NO_IDCT_PERM;
3658
    }else if(avctx->lowres==2){
3659
        c->idct_put= ff_jref_idct2_put;
3660
        c->idct_add= ff_jref_idct2_add;
3661
        c->idct    = j_rev_dct2;
3662
        c->idct_permutation_type= FF_NO_IDCT_PERM;
3663
    }else if(avctx->lowres==3){
3664
        c->idct_put= ff_jref_idct1_put;
3665
        c->idct_add= ff_jref_idct1_add;
3666
        c->idct    = j_rev_dct1;
3667
        c->idct_permutation_type= FF_NO_IDCT_PERM;
3668
    }else{
3669
        if(avctx->idct_algo==FF_IDCT_INT){
3670
            c->idct_put= ff_jref_idct_put;
3671
            c->idct_add= ff_jref_idct_add;
3672
            c->idct    = j_rev_dct;
3673
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3674
        }else if(avctx->idct_algo==FF_IDCT_VP3){
3675
            c->idct_put= ff_vp3_idct_put_c;
3676
            c->idct_add= ff_vp3_idct_add_c;
3677
            c->idct    = ff_vp3_idct_c;
3678
            c->idct_permutation_type= FF_NO_IDCT_PERM;
3679
        }else{ //accurate/default
3680
            c->idct_put= simple_idct_put;
3681
            c->idct_add= simple_idct_add;
3682
            c->idct    = simple_idct;
3683
            c->idct_permutation_type= FF_NO_IDCT_PERM;
3684
        }
3685
    }
3686

    
3687
    c->h264_idct_add= ff_h264_idct_add_c;
3688

    
3689
    c->get_pixels = get_pixels_c;
3690
    c->diff_pixels = diff_pixels_c;
3691
    c->put_pixels_clamped = put_pixels_clamped_c;
3692
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3693
    c->add_pixels_clamped = add_pixels_clamped_c;
3694
    c->gmc1 = gmc1_c;
3695
    c->gmc = gmc_c;
3696
    c->clear_blocks = clear_blocks_c;
3697
    c->pix_sum = pix_sum_c;
3698
    c->pix_norm1 = pix_norm1_c;
3699

    
3700
    /* TODO [0] 16  [1] 8 */
3701
    c->pix_abs[0][0] = pix_abs16_c;
3702
    c->pix_abs[0][1] = pix_abs16_x2_c;
3703
    c->pix_abs[0][2] = pix_abs16_y2_c;
3704
    c->pix_abs[0][3] = pix_abs16_xy2_c;
3705
    c->pix_abs[1][0] = pix_abs8_c;
3706
    c->pix_abs[1][1] = pix_abs8_x2_c;
3707
    c->pix_abs[1][2] = pix_abs8_y2_c;
3708
    c->pix_abs[1][3] = pix_abs8_xy2_c;
3709

    
3710
#define dspfunc(PFX, IDX, NUM) \
3711
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
3712
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
3713
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
3714
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3715

    
3716
    dspfunc(put, 0, 16);
3717
    dspfunc(put_no_rnd, 0, 16);
3718
    dspfunc(put, 1, 8);
3719
    dspfunc(put_no_rnd, 1, 8);
3720
    dspfunc(put, 2, 4);
3721
    dspfunc(put, 3, 2);
3722

    
3723
    dspfunc(avg, 0, 16);
3724
    dspfunc(avg_no_rnd, 0, 16);
3725
    dspfunc(avg, 1, 8);
3726
    dspfunc(avg_no_rnd, 1, 8);
3727
    dspfunc(avg, 2, 4);
3728
    dspfunc(avg, 3, 2);
3729
#undef dspfunc
3730

    
3731
    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3732
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3733

    
3734
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3735
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3736
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3737
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3738
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3739
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3740
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3741
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3742
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3743

    
3744
    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3745
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3746
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3747
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3748
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3749
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3750
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3751
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3752
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3753

    
3754
#define dspfunc(PFX, IDX, NUM) \
3755
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3756
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3757
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3758
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3759
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3760
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3761
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3762
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3763
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3764
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3765
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3766
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3767
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3768
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3769
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3770
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3771

    
3772
    dspfunc(put_qpel, 0, 16);
3773
    dspfunc(put_no_rnd_qpel, 0, 16);
3774

    
3775
    dspfunc(avg_qpel, 0, 16);
3776
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3777

    
3778
    dspfunc(put_qpel, 1, 8);
3779
    dspfunc(put_no_rnd_qpel, 1, 8);
3780

    
3781
    dspfunc(avg_qpel, 1, 8);
3782
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3783

    
3784
    dspfunc(put_h264_qpel, 0, 16);
3785
    dspfunc(put_h264_qpel, 1, 8);
3786
    dspfunc(put_h264_qpel, 2, 4);
3787
    dspfunc(avg_h264_qpel, 0, 16);
3788
    dspfunc(avg_h264_qpel, 1, 8);
3789
    dspfunc(avg_h264_qpel, 2, 4);
3790

    
3791
#undef dspfunc
3792
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3793
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3794
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3795
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3796
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3797
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3798

    
3799
    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
3800
    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
3801
    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
3802
    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
3803
    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
3804
    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
3805
    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
3806
    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
3807
    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
3808
    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
3809
    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
3810
    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
3811
    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
3812
    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
3813
    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
3814
    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
3815
    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
3816
    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
3817
    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
3818
    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
3819

    
3820
    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3821
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3822
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3823
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3824
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3825
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3826
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3827
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3828
        
3829
#define SET_CMP_FUNC(name) \
3830
    c->name[0]= name ## 16_c;\
3831
    c->name[1]= name ## 8x8_c;
3832
    
3833
    SET_CMP_FUNC(hadamard8_diff)
3834
    c->hadamard8_diff[4]= hadamard8_intra16_c;
3835
    SET_CMP_FUNC(dct_sad)
3836
    SET_CMP_FUNC(dct_max)
3837
    c->sad[0]= pix_abs16_c;
3838
    c->sad[1]= pix_abs8_c;
3839
    c->sse[0]= sse16_c;
3840
    c->sse[1]= sse8_c;
3841
    c->sse[2]= sse4_c;
3842
    SET_CMP_FUNC(quant_psnr)
3843
    SET_CMP_FUNC(rd)
3844
    SET_CMP_FUNC(bit)
3845
    c->vsad[0]= vsad16_c;
3846
    c->vsad[4]= vsad_intra16_c;
3847
    c->vsse[0]= vsse16_c;
3848
    c->vsse[4]= vsse_intra16_c;
3849
    c->nsse[0]= nsse16_c;
3850
    c->nsse[1]= nsse8_c;
3851
    c->w53[0]= w53_16_c;
3852
    c->w53[1]= w53_8_c;
3853
    c->w97[0]= w97_16_c;
3854
    c->w97[1]= w97_8_c;
3855

    
3856
    c->add_bytes= add_bytes_c;
3857
    c->diff_bytes= diff_bytes_c;
3858
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3859
    c->bswap_buf= bswap_buf;
3860

    
3861
    c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
3862
    c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
3863
    c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
3864
    c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
3865
    c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
3866
    c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
3867
    
3868
    c->h263_h_loop_filter= h263_h_loop_filter_c;
3869
    c->h263_v_loop_filter= h263_v_loop_filter_c;
3870
    
3871
    c->h261_loop_filter= h261_loop_filter_c;
3872
    
3873
    c->try_8x8basis= try_8x8basis_c;
3874
    c->add_8x8basis= add_8x8basis_c;
3875

    
3876
#ifdef HAVE_MMX
3877
    dsputil_init_mmx(c, avctx);
3878
#endif
3879
#ifdef ARCH_ARMV4L
3880
    dsputil_init_armv4l(c, avctx);
3881
#endif
3882
#ifdef HAVE_MLIB
3883
    dsputil_init_mlib(c, avctx);
3884
#endif
3885
#ifdef ARCH_SPARC
3886
   dsputil_init_vis(c,avctx);
3887
#endif
3888
#ifdef ARCH_ALPHA
3889
    dsputil_init_alpha(c, avctx);
3890
#endif
3891
#ifdef ARCH_POWERPC
3892
    dsputil_init_ppc(c, avctx);
3893
#endif
3894
#ifdef HAVE_MMI
3895
    dsputil_init_mmi(c, avctx);
3896
#endif
3897
#ifdef ARCH_SH4
3898
    dsputil_init_sh4(c,avctx);
3899
#endif
3900

    
3901
    switch(c->idct_permutation_type){
3902
    case FF_NO_IDCT_PERM:
3903
        for(i=0; i<64; i++)
3904
            c->idct_permutation[i]= i;
3905
        break;
3906
    case FF_LIBMPEG2_IDCT_PERM:
3907
        for(i=0; i<64; i++)
3908
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3909
        break;
3910
    case FF_SIMPLE_IDCT_PERM:
3911
        for(i=0; i<64; i++)
3912
            c->idct_permutation[i]= simple_mmx_permutation[i];
3913
        break;
3914
    case FF_TRANSPOSE_IDCT_PERM:
3915
        for(i=0; i<64; i++)
3916
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3917
        break;
3918
    case FF_PARTTRANS_IDCT_PERM:
3919
        for(i=0; i<64; i++)
3920
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
3921
        break;
3922
    default:
3923
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
3924
    }
3925
}
3926