Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 9f2d1b4f

History | View | Annotate | Download (134 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * This library is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2 of the License, or (at your option) any later version.
10
 *
11
 * This library is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with this library; if not, write to the Free Software
18
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19
 *
20
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
21
 */
22
 
23
/**
24
 * @file dsputil.c
25
 * DSP utils
26
 */
27
 
28
#include "avcodec.h"
29
#include "dsputil.h"
30
#include "mpegvideo.h"
31
#include "simple_idct.h"
32
#include "faandct.h"
33

    
34
/* Clipping table, indexed with a MAX_NEG_CROP bias (see put/add_pixels_clamped).
   Zero-initialized here; presumably filled during DSP init — confirm against
   the dsputil init code. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Square lookup table, indexed with a +256 bias by pix_norm1/sse* so that
   negative differences index it safely. Zero-initialized here; presumably
   filled during DSP init — confirm. */
uint32_t squareTbl[512] = {0, };
36

    
37
/* Classic 8x8 zigzag scan: entry n is the row-major index (0..63) of the
   n-th coefficient visited. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
47

    
48
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields.
   Entries are row-major indices of the 8x8 block, like ff_zigzag_direct. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
60

    
61
/* not permutated inverse zigzag_direct + 1 for MMX quantizer.
   Zero-initialized here; presumably filled at runtime during DSP init —
   confirm against the init code. */
uint16_t __align8 inv_zigzag_direct16[64] = {0, };
63

    
64
/* Alternate horizontal scan order; entries are row-major indices (0..63)
   of the 8x8 block. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17, 
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33, 
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49, 
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59, 
    52, 53, 54, 55, 60, 61, 62, 63,
};
74

    
75
/* Alternate vertical scan order; entries are row-major indices (0..63)
   of the 8x8 block. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10, 
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12, 
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14, 
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31, 
    38, 46, 54, 62, 39, 47, 55, 63,
};
85

    
86
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
   (division by multiply-and-shift). Entries 0 and 1 are edge cases:
   division by 0 is undefined, inverse[1] is 2^32-1. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757, 
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154, 
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709, 
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333, 
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367, 
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283, 
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315, 
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085, 
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498, 
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675, 
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441, 
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183, 
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712, 
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400, 
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163, 
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641, 
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573, 
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737, 
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493, 
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373, 
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368, 
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671, 
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767, 
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740, 
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751, 
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635, 
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593, 
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944, 
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933, 
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575, 
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532, 
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
121

    
122
/* Input permutation for the simple_idct_mmx.
   Values are 6-bit coefficient indices (0x00..0x3F) into the 8x8 block. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, 
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, 
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, 
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F, 
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, 
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D, 
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, 
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
133

    
134
/**
 * Sum all pixel values of a 16x16 block.
 * @param pix       pointer to the top-left pixel
 * @param line_size byte stride between successive rows
 * @return sum of the 256 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
155

    
156
static int pix_norm1_c(uint8_t * pix, int line_size)
157
{
158
    int s, i, j;
159
    uint32_t *sq = squareTbl + 256;
160

    
161
    s = 0;
162
    for (i = 0; i < 16; i++) {
163
        for (j = 0; j < 16; j += 8) {
164
#if 0
165
            s += sq[pix[0]];
166
            s += sq[pix[1]];
167
            s += sq[pix[2]];
168
            s += sq[pix[3]];
169
            s += sq[pix[4]];
170
            s += sq[pix[5]];
171
            s += sq[pix[6]];
172
            s += sq[pix[7]];
173
#else
174
#if LONG_MAX > 2147483647
175
            register uint64_t x=*(uint64_t*)pix;
176
            s += sq[x&0xff];
177
            s += sq[(x>>8)&0xff];
178
            s += sq[(x>>16)&0xff];
179
            s += sq[(x>>24)&0xff];
180
            s += sq[(x>>32)&0xff];
181
            s += sq[(x>>40)&0xff];
182
            s += sq[(x>>48)&0xff];
183
            s += sq[(x>>56)&0xff];
184
#else
185
            register uint32_t x=*(uint32_t*)pix;
186
            s += sq[x&0xff];
187
            s += sq[(x>>8)&0xff];
188
            s += sq[(x>>16)&0xff];
189
            s += sq[(x>>24)&0xff];
190
            x=*(uint32_t*)(pix+4);
191
            s += sq[x&0xff];
192
            s += sq[(x>>8)&0xff];
193
            s += sq[(x>>16)&0xff];
194
            s += sq[(x>>24)&0xff];
195
#endif
196
#endif
197
            pix += 8;
198
        }
199
        pix += line_size - 16;
200
    }
201
    return s;
202
}
203

    
204
/**
 * Byte-swap each 32-bit word of src into dst.
 * The operation is element-wise, so dst may equal src.
 * @param dst destination word array
 * @param src source word array
 * @param w   number of 32-bit words
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i = 0;

    /* bulk: eight words per iteration */
    while (w - i >= 8) {
        dst[i    ] = bswap_32(src[i    ]);
        dst[i + 1] = bswap_32(src[i + 1]);
        dst[i + 2] = bswap_32(src[i + 2]);
        dst[i + 3] = bswap_32(src[i + 3]);
        dst[i + 4] = bswap_32(src[i + 4]);
        dst[i + 5] = bswap_32(src[i + 5]);
        dst[i + 6] = bswap_32(src[i + 6]);
        dst[i + 7] = bswap_32(src[i + 7]);
        i += 8;
    }
    /* tail: remaining words */
    while (i < w) {
        dst[i] = bswap_32(src[i]);
        i++;
    }
}
221

    
222
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
223
{
224
    int s, i;
225
    uint32_t *sq = squareTbl + 256;
226

    
227
    s = 0;
228
    for (i = 0; i < h; i++) {
229
        s += sq[pix1[0] - pix2[0]];
230
        s += sq[pix1[1] - pix2[1]];
231
        s += sq[pix1[2] - pix2[2]];
232
        s += sq[pix1[3] - pix2[3]];
233
        pix1 += line_size;
234
        pix2 += line_size;
235
    }
236
    return s;
237
}
238

    
239
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
240
{
241
    int s, i;
242
    uint32_t *sq = squareTbl + 256;
243

    
244
    s = 0;
245
    for (i = 0; i < h; i++) {
246
        s += sq[pix1[0] - pix2[0]];
247
        s += sq[pix1[1] - pix2[1]];
248
        s += sq[pix1[2] - pix2[2]];
249
        s += sq[pix1[3] - pix2[3]];
250
        s += sq[pix1[4] - pix2[4]];
251
        s += sq[pix1[5] - pix2[5]];
252
        s += sq[pix1[6] - pix2[6]];
253
        s += sq[pix1[7] - pix2[7]];
254
        pix1 += line_size;
255
        pix2 += line_size;
256
    }
257
    return s;
258
}
259

    
260
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
261
{
262
    int s, i;
263
    uint32_t *sq = squareTbl + 256;
264

    
265
    s = 0;
266
    for (i = 0; i < h; i++) {
267
        s += sq[pix1[ 0] - pix2[ 0]];
268
        s += sq[pix1[ 1] - pix2[ 1]];
269
        s += sq[pix1[ 2] - pix2[ 2]];
270
        s += sq[pix1[ 3] - pix2[ 3]];
271
        s += sq[pix1[ 4] - pix2[ 4]];
272
        s += sq[pix1[ 5] - pix2[ 5]];
273
        s += sq[pix1[ 6] - pix2[ 6]];
274
        s += sq[pix1[ 7] - pix2[ 7]];
275
        s += sq[pix1[ 8] - pix2[ 8]];
276
        s += sq[pix1[ 9] - pix2[ 9]];
277
        s += sq[pix1[10] - pix2[10]];
278
        s += sq[pix1[11] - pix2[11]];
279
        s += sq[pix1[12] - pix2[12]];
280
        s += sq[pix1[13] - pix2[13]];
281
        s += sq[pix1[14] - pix2[14]];
282
        s += sq[pix1[15] - pix2[15]];
283

    
284
        pix1 += line_size;
285
        pix2 += line_size;
286
    }
287
    return s;
288
}
289

    
290

    
291
/**
 * Wavelet-domain distortion metric between two blocks.
 *
 * Computes (pix1 - pix2) scaled by 16, runs ff_spatial_dwt on the
 * difference, then sums the absolute transformed coefficients and
 * returns the sum >> 2.
 *
 * @param v         unused context slot (kept for the common DSP signature)
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size byte stride of both blocks
 * @param w         block width, 8 or 16 (selects 3 or 4 decompositions)
 * @param h         block height
 * @param type      wavelet filter type, forwarded to ff_spatial_dwt
 *                  (per the disabled scale-table comment below, type 1
 *                  appears to be the 5/3 filter — confirm)
 * @return sum of absolute wavelet coefficients, divided by 4
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[16*16];
#if 0
    /* disabled per-subband weighting tables and loop (kept for reference) */
    int level, ori;
    static const int scale[2][2][4][4]={ 
      {
        {
            //8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            //16x16 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {//FIXME 5/3
            //8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            //16x16 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };
#endif

    /* build the scaled difference block; tmp rows always have stride 16 */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[16*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[16*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[16*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[16*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    /* in-place spatial wavelet transform of the difference */
    ff_spatial_dwt(tmp, w, h, 16, type, dec_count);

    s=0;
#if 0
    /* disabled subband-weighted accumulation (see scale[] above) */
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int sx= (ori&1) ? 1<<level: 0;
            int stride= 16<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;
            int size= 1<<level;
            
            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += ABS(v);
                }
            }
        }
    }
#endif
    /* unweighted sum of absolute transformed coefficients */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            s+= ABS(tmp[16*i+j+0]);
            s+= ABS(tmp[16*i+j+1]);
            s+= ABS(tmp[16*i+j+2]);
            s+= ABS(tmp[16*i+j+3]);
        }
    }
    assert(s>=0); 
    
    return s>>2;
}
370

    
371
/* 8-wide wavelet metric, filter type 1 (per the scale-table note in w_c,
   apparently the 5/3 filter — confirm). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}
374

    
375
/* 8-wide wavelet metric, filter type 0 (9/7 by naming convention — confirm). */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}
378

    
379
/* 16-wide wavelet metric, filter type 1 (per the scale-table note in w_c,
   apparently the 5/3 filter — confirm). */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
382

    
383
/* 16-wide wavelet metric, filter type 0 (9/7 by naming convention — confirm). */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
386

    
387
/**
 * Copy an 8x8 pixel block into a coefficient array.
 * @param block     output, 64 coefficients in row-major order (stride 8)
 * @param pixels    top-left source pixel
 * @param line_size byte stride between source rows
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block += 8;
    }
}
405

    
406
/**
 * Compute the 8x8 pixel difference s1 - s2 into a coefficient array.
 * @param block  output, 64 coefficients in row-major order (stride 8)
 * @param s1     minuend pixels
 * @param s2     subtrahend pixels
 * @param stride byte stride of both source blocks
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
425

    
426

    
427
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
428
                                 int line_size)
429
{
430
    int i;
431
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
432
    
433
    /* read the pixels */
434
    for(i=0;i<8;i++) {
435
        pixels[0] = cm[block[0]];
436
        pixels[1] = cm[block[1]];
437
        pixels[2] = cm[block[2]];
438
        pixels[3] = cm[block[3]];
439
        pixels[4] = cm[block[4]];
440
        pixels[5] = cm[block[5]];
441
        pixels[6] = cm[block[6]];
442
        pixels[7] = cm[block[7]];
443

    
444
        pixels += line_size;
445
        block += 8;
446
    }
447
}
448

    
449
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
450
                                 int line_size)
451
{
452
    int i;
453
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
454
    
455
    /* read the pixels */
456
    for(i=0;i<4;i++) {
457
        pixels[0] = cm[block[0]];
458
        pixels[1] = cm[block[1]];
459
        pixels[2] = cm[block[2]];
460
        pixels[3] = cm[block[3]];
461

    
462
        pixels += line_size;
463
        block += 8;
464
    }
465
}
466

    
467
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
468
                                 int line_size)
469
{
470
    int i;
471
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
472
    
473
    /* read the pixels */
474
    for(i=0;i<2;i++) {
475
        pixels[0] = cm[block[0]];
476
        pixels[1] = cm[block[1]];
477

    
478
        pixels += line_size;
479
        block += 8;
480
    }
481
}
482

    
483
/**
 * Store an 8x8 block of signed coefficients as pixels: each value is
 * shifted up by 128 and saturated to 0..255.
 * @param block     64 input coefficients in row-major order
 * @param pixels    top-left destination pixel
 * @param line_size byte stride between destination rows
 */
static void put_signed_pixels_clamped_c(const DCTELEM *block, 
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            int v = block[col] + 128;   /* bias to unsigned range */
            if (v < 0)
                v = 0;
            else if (v > 255)
                v = 255;
            pixels[col] = (uint8_t)v;
        }
        block  += 8;
        pixels += line_size;
    }
}
503

    
504
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
505
                          int line_size)
506
{
507
    int i;
508
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
509
    
510
    /* read the pixels */
511
    for(i=0;i<8;i++) {
512
        pixels[0] = cm[pixels[0] + block[0]];
513
        pixels[1] = cm[pixels[1] + block[1]];
514
        pixels[2] = cm[pixels[2] + block[2]];
515
        pixels[3] = cm[pixels[3] + block[3]];
516
        pixels[4] = cm[pixels[4] + block[4]];
517
        pixels[5] = cm[pixels[5] + block[5]];
518
        pixels[6] = cm[pixels[6] + block[6]];
519
        pixels[7] = cm[pixels[7] + block[7]];
520
        pixels += line_size;
521
        block += 8;
522
    }
523
}
524

    
525
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
526
                          int line_size)
527
{
528
    int i;
529
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
530
    
531
    /* read the pixels */
532
    for(i=0;i<4;i++) {
533
        pixels[0] = cm[pixels[0] + block[0]];
534
        pixels[1] = cm[pixels[1] + block[1]];
535
        pixels[2] = cm[pixels[2] + block[2]];
536
        pixels[3] = cm[pixels[3] + block[3]];
537
        pixels += line_size;
538
        block += 8;
539
    }
540
}
541

    
542
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
543
                          int line_size)
544
{
545
    int i;
546
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
547
    
548
    /* read the pixels */
549
    for(i=0;i<2;i++) {
550
        pixels[0] = cm[pixels[0] + block[0]];
551
        pixels[1] = cm[pixels[1] + block[1]];
552
        pixels += line_size;
553
        block += 8;
554
    }
555
}
556
#if 0
557

558
#define PIXOP2(OPNAME, OP) \
559
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
560
{\
561
    int i;\
562
    for(i=0; i<h; i++){\
563
        OP(*((uint64_t*)block), LD64(pixels));\
564
        pixels+=line_size;\
565
        block +=line_size;\
566
    }\
567
}\
568
\
569
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
570
{\
571
    int i;\
572
    for(i=0; i<h; i++){\
573
        const uint64_t a= LD64(pixels  );\
574
        const uint64_t b= LD64(pixels+1);\
575
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
576
        pixels+=line_size;\
577
        block +=line_size;\
578
    }\
579
}\
580
\
581
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
582
{\
583
    int i;\
584
    for(i=0; i<h; i++){\
585
        const uint64_t a= LD64(pixels  );\
586
        const uint64_t b= LD64(pixels+1);\
587
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
588
        pixels+=line_size;\
589
        block +=line_size;\
590
    }\
591
}\
592
\
593
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
594
{\
595
    int i;\
596
    for(i=0; i<h; i++){\
597
        const uint64_t a= LD64(pixels          );\
598
        const uint64_t b= LD64(pixels+line_size);\
599
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
600
        pixels+=line_size;\
601
        block +=line_size;\
602
    }\
603
}\
604
\
605
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
606
{\
607
    int i;\
608
    for(i=0; i<h; i++){\
609
        const uint64_t a= LD64(pixels          );\
610
        const uint64_t b= LD64(pixels+line_size);\
611
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
612
        pixels+=line_size;\
613
        block +=line_size;\
614
    }\
615
}\
616
\
617
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
618
{\
619
        int i;\
620
        const uint64_t a= LD64(pixels  );\
621
        const uint64_t b= LD64(pixels+1);\
622
        uint64_t l0=  (a&0x0303030303030303ULL)\
623
                    + (b&0x0303030303030303ULL)\
624
                    + 0x0202020202020202ULL;\
625
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
626
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
627
        uint64_t l1,h1;\
628
\
629
        pixels+=line_size;\
630
        for(i=0; i<h; i+=2){\
631
            uint64_t a= LD64(pixels  );\
632
            uint64_t b= LD64(pixels+1);\
633
            l1=  (a&0x0303030303030303ULL)\
634
               + (b&0x0303030303030303ULL);\
635
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
636
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
637
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
638
            pixels+=line_size;\
639
            block +=line_size;\
640
            a= LD64(pixels  );\
641
            b= LD64(pixels+1);\
642
            l0=  (a&0x0303030303030303ULL)\
643
               + (b&0x0303030303030303ULL)\
644
               + 0x0202020202020202ULL;\
645
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
646
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
647
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
648
            pixels+=line_size;\
649
            block +=line_size;\
650
        }\
651
}\
652
\
653
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
654
{\
655
        int i;\
656
        const uint64_t a= LD64(pixels  );\
657
        const uint64_t b= LD64(pixels+1);\
658
        uint64_t l0=  (a&0x0303030303030303ULL)\
659
                    + (b&0x0303030303030303ULL)\
660
                    + 0x0101010101010101ULL;\
661
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
662
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
663
        uint64_t l1,h1;\
664
\
665
        pixels+=line_size;\
666
        for(i=0; i<h; i+=2){\
667
            uint64_t a= LD64(pixels  );\
668
            uint64_t b= LD64(pixels+1);\
669
            l1=  (a&0x0303030303030303ULL)\
670
               + (b&0x0303030303030303ULL);\
671
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
672
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
673
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
674
            pixels+=line_size;\
675
            block +=line_size;\
676
            a= LD64(pixels  );\
677
            b= LD64(pixels+1);\
678
            l0=  (a&0x0303030303030303ULL)\
679
               + (b&0x0303030303030303ULL)\
680
               + 0x0101010101010101ULL;\
681
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
682
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
683
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
684
            pixels+=line_size;\
685
            block +=line_size;\
686
        }\
687
}\
688
\
689
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
690
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
691
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
692
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
693
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
694
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
695
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
696

697
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
698
#else // 64 bit variant
699

    
700
#define PIXOP2(OPNAME, OP) \
701
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
702
    int i;\
703
    for(i=0; i<h; i++){\
704
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
705
        pixels+=line_size;\
706
        block +=line_size;\
707
    }\
708
}\
709
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
710
    int i;\
711
    for(i=0; i<h; i++){\
712
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
713
        pixels+=line_size;\
714
        block +=line_size;\
715
    }\
716
}\
717
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
718
    int i;\
719
    for(i=0; i<h; i++){\
720
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
721
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
722
        pixels+=line_size;\
723
        block +=line_size;\
724
    }\
725
}\
726
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
727
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
728
}\
729
\
730
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
731
                                                int src_stride1, int src_stride2, int h){\
732
    int i;\
733
    for(i=0; i<h; i++){\
734
        uint32_t a,b;\
735
        a= LD32(&src1[i*src_stride1  ]);\
736
        b= LD32(&src2[i*src_stride2  ]);\
737
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
738
        a= LD32(&src1[i*src_stride1+4]);\
739
        b= LD32(&src2[i*src_stride2+4]);\
740
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
741
    }\
742
}\
743
\
744
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
745
                                                int src_stride1, int src_stride2, int h){\
746
    int i;\
747
    for(i=0; i<h; i++){\
748
        uint32_t a,b;\
749
        a= LD32(&src1[i*src_stride1  ]);\
750
        b= LD32(&src2[i*src_stride2  ]);\
751
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
752
        a= LD32(&src1[i*src_stride1+4]);\
753
        b= LD32(&src2[i*src_stride2+4]);\
754
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
755
    }\
756
}\
757
\
758
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
759
                                                int src_stride1, int src_stride2, int h){\
760
    int i;\
761
    for(i=0; i<h; i++){\
762
        uint32_t a,b;\
763
        a= LD32(&src1[i*src_stride1  ]);\
764
        b= LD32(&src2[i*src_stride2  ]);\
765
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
766
    }\
767
}\
768
\
769
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
770
                                                int src_stride1, int src_stride2, int h){\
771
    int i;\
772
    for(i=0; i<h; i++){\
773
        uint32_t a,b;\
774
        a= LD16(&src1[i*src_stride1  ]);\
775
        b= LD16(&src2[i*src_stride2  ]);\
776
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
777
    }\
778
}\
779
\
780
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
781
                                                int src_stride1, int src_stride2, int h){\
782
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
783
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
784
}\
785
\
786
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
787
                                                int src_stride1, int src_stride2, int h){\
788
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
789
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
790
}\
791
\
792
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
793
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
794
}\
795
\
796
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
797
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
798
}\
799
\
800
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
801
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
802
}\
803
\
804
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
805
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
806
}\
807
\
808
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
809
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
810
    int i;\
811
    for(i=0; i<h; i++){\
812
        uint32_t a, b, c, d, l0, l1, h0, h1;\
813
        a= LD32(&src1[i*src_stride1]);\
814
        b= LD32(&src2[i*src_stride2]);\
815
        c= LD32(&src3[i*src_stride3]);\
816
        d= LD32(&src4[i*src_stride4]);\
817
        l0=  (a&0x03030303UL)\
818
           + (b&0x03030303UL)\
819
           + 0x02020202UL;\
820
        h0= ((a&0xFCFCFCFCUL)>>2)\
821
          + ((b&0xFCFCFCFCUL)>>2);\
822
        l1=  (c&0x03030303UL)\
823
           + (d&0x03030303UL);\
824
        h1= ((c&0xFCFCFCFCUL)>>2)\
825
          + ((d&0xFCFCFCFCUL)>>2);\
826
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
827
        a= LD32(&src1[i*src_stride1+4]);\
828
        b= LD32(&src2[i*src_stride2+4]);\
829
        c= LD32(&src3[i*src_stride3+4]);\
830
        d= LD32(&src4[i*src_stride4+4]);\
831
        l0=  (a&0x03030303UL)\
832
           + (b&0x03030303UL)\
833
           + 0x02020202UL;\
834
        h0= ((a&0xFCFCFCFCUL)>>2)\
835
          + ((b&0xFCFCFCFCUL)>>2);\
836
        l1=  (c&0x03030303UL)\
837
           + (d&0x03030303UL);\
838
        h1= ((c&0xFCFCFCFCUL)>>2)\
839
          + ((d&0xFCFCFCFCUL)>>2);\
840
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
841
    }\
842
}\
843
\
844
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
845
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
846
}\
847
\
848
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
849
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
850
}\
851
\
852
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
853
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
854
}\
855
\
856
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
857
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
858
}\
859
\
860
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
861
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
862
    int i;\
863
    for(i=0; i<h; i++){\
864
        uint32_t a, b, c, d, l0, l1, h0, h1;\
865
        a= LD32(&src1[i*src_stride1]);\
866
        b= LD32(&src2[i*src_stride2]);\
867
        c= LD32(&src3[i*src_stride3]);\
868
        d= LD32(&src4[i*src_stride4]);\
869
        l0=  (a&0x03030303UL)\
870
           + (b&0x03030303UL)\
871
           + 0x01010101UL;\
872
        h0= ((a&0xFCFCFCFCUL)>>2)\
873
          + ((b&0xFCFCFCFCUL)>>2);\
874
        l1=  (c&0x03030303UL)\
875
           + (d&0x03030303UL);\
876
        h1= ((c&0xFCFCFCFCUL)>>2)\
877
          + ((d&0xFCFCFCFCUL)>>2);\
878
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
879
        a= LD32(&src1[i*src_stride1+4]);\
880
        b= LD32(&src2[i*src_stride2+4]);\
881
        c= LD32(&src3[i*src_stride3+4]);\
882
        d= LD32(&src4[i*src_stride4+4]);\
883
        l0=  (a&0x03030303UL)\
884
           + (b&0x03030303UL)\
885
           + 0x01010101UL;\
886
        h0= ((a&0xFCFCFCFCUL)>>2)\
887
          + ((b&0xFCFCFCFCUL)>>2);\
888
        l1=  (c&0x03030303UL)\
889
           + (d&0x03030303UL);\
890
        h1= ((c&0xFCFCFCFCUL)>>2)\
891
          + ((d&0xFCFCFCFCUL)>>2);\
892
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
893
    }\
894
}\
895
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
896
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
897
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
898
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
899
}\
900
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
901
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
902
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
903
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
904
}\
905
\
906
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
907
{\
908
        int i, a0, b0, a1, b1;\
909
        a0= pixels[0];\
910
        b0= pixels[1] + 2;\
911
        a0 += b0;\
912
        b0 += pixels[2];\
913
\
914
        pixels+=line_size;\
915
        for(i=0; i<h; i+=2){\
916
            a1= pixels[0];\
917
            b1= pixels[1];\
918
            a1 += b1;\
919
            b1 += pixels[2];\
920
\
921
            block[0]= (a1+a0)>>2; /* FIXME non put */\
922
            block[1]= (b1+b0)>>2;\
923
\
924
            pixels+=line_size;\
925
            block +=line_size;\
926
\
927
            a0= pixels[0];\
928
            b0= pixels[1] + 2;\
929
            a0 += b0;\
930
            b0 += pixels[2];\
931
\
932
            block[0]= (a1+a0)>>2;\
933
            block[1]= (b1+b0)>>2;\
934
            pixels+=line_size;\
935
            block +=line_size;\
936
        }\
937
}\
938
\
939
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
940
{\
941
        int i;\
942
        const uint32_t a= LD32(pixels  );\
943
        const uint32_t b= LD32(pixels+1);\
944
        uint32_t l0=  (a&0x03030303UL)\
945
                    + (b&0x03030303UL)\
946
                    + 0x02020202UL;\
947
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
948
                   + ((b&0xFCFCFCFCUL)>>2);\
949
        uint32_t l1,h1;\
950
\
951
        pixels+=line_size;\
952
        for(i=0; i<h; i+=2){\
953
            uint32_t a= LD32(pixels  );\
954
            uint32_t b= LD32(pixels+1);\
955
            l1=  (a&0x03030303UL)\
956
               + (b&0x03030303UL);\
957
            h1= ((a&0xFCFCFCFCUL)>>2)\
958
              + ((b&0xFCFCFCFCUL)>>2);\
959
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
960
            pixels+=line_size;\
961
            block +=line_size;\
962
            a= LD32(pixels  );\
963
            b= LD32(pixels+1);\
964
            l0=  (a&0x03030303UL)\
965
               + (b&0x03030303UL)\
966
               + 0x02020202UL;\
967
            h0= ((a&0xFCFCFCFCUL)>>2)\
968
              + ((b&0xFCFCFCFCUL)>>2);\
969
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
970
            pixels+=line_size;\
971
            block +=line_size;\
972
        }\
973
}\
974
\
975
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
976
{\
977
    int j;\
978
    for(j=0; j<2; j++){\
979
        int i;\
980
        const uint32_t a= LD32(pixels  );\
981
        const uint32_t b= LD32(pixels+1);\
982
        uint32_t l0=  (a&0x03030303UL)\
983
                    + (b&0x03030303UL)\
984
                    + 0x02020202UL;\
985
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
986
                   + ((b&0xFCFCFCFCUL)>>2);\
987
        uint32_t l1,h1;\
988
\
989
        pixels+=line_size;\
990
        for(i=0; i<h; i+=2){\
991
            uint32_t a= LD32(pixels  );\
992
            uint32_t b= LD32(pixels+1);\
993
            l1=  (a&0x03030303UL)\
994
               + (b&0x03030303UL);\
995
            h1= ((a&0xFCFCFCFCUL)>>2)\
996
              + ((b&0xFCFCFCFCUL)>>2);\
997
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
998
            pixels+=line_size;\
999
            block +=line_size;\
1000
            a= LD32(pixels  );\
1001
            b= LD32(pixels+1);\
1002
            l0=  (a&0x03030303UL)\
1003
               + (b&0x03030303UL)\
1004
               + 0x02020202UL;\
1005
            h0= ((a&0xFCFCFCFCUL)>>2)\
1006
              + ((b&0xFCFCFCFCUL)>>2);\
1007
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1008
            pixels+=line_size;\
1009
            block +=line_size;\
1010
        }\
1011
        pixels+=4-line_size*(h+1);\
1012
        block +=4-line_size*h;\
1013
    }\
1014
}\
1015
\
1016
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1017
{\
1018
    int j;\
1019
    for(j=0; j<2; j++){\
1020
        int i;\
1021
        const uint32_t a= LD32(pixels  );\
1022
        const uint32_t b= LD32(pixels+1);\
1023
        uint32_t l0=  (a&0x03030303UL)\
1024
                    + (b&0x03030303UL)\
1025
                    + 0x01010101UL;\
1026
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1027
                   + ((b&0xFCFCFCFCUL)>>2);\
1028
        uint32_t l1,h1;\
1029
\
1030
        pixels+=line_size;\
1031
        for(i=0; i<h; i+=2){\
1032
            uint32_t a= LD32(pixels  );\
1033
            uint32_t b= LD32(pixels+1);\
1034
            l1=  (a&0x03030303UL)\
1035
               + (b&0x03030303UL);\
1036
            h1= ((a&0xFCFCFCFCUL)>>2)\
1037
              + ((b&0xFCFCFCFCUL)>>2);\
1038
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1039
            pixels+=line_size;\
1040
            block +=line_size;\
1041
            a= LD32(pixels  );\
1042
            b= LD32(pixels+1);\
1043
            l0=  (a&0x03030303UL)\
1044
               + (b&0x03030303UL)\
1045
               + 0x01010101UL;\
1046
            h0= ((a&0xFCFCFCFCUL)>>2)\
1047
              + ((b&0xFCFCFCFCUL)>>2);\
1048
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1049
            pixels+=line_size;\
1050
            block +=line_size;\
1051
        }\
1052
        pixels+=4-line_size*(h+1);\
1053
        block +=4-line_size*h;\
1054
    }\
1055
}\
1056
\
1057
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1058
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1059
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1060
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1061
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1062
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1063
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1064
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1065

    
1066
/* Pixel store operators for the PIXOP2 kernel family: op_avg writes the
 * rounded average of the old and new 4-pixel words, op_put plainly stores
 * the new word.  (The #endif below closes a conditional opened above this
 * chunk — NOTE(review): presumably the 32-/64-bit variant selection.) */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

/* Instantiate the full set of avg_/put_ pixel primitives from the macro. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

/* Rounded scalar 2- and 4-tap averages used by the qpel code further down. */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1078
/* Convenience wrapper: no-rounding average of two 16-wide sources into dst,
 * with a single common stride for dst and both sources. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1081

    
1082
/* Convenience wrapper: no-rounding average of two 8-wide sources into dst,
 * with a single common stride for dst and both sources. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1085

    
1086
/*
 * One-warp-point global motion compensation of an 8-pixel-wide block.
 * Each output pixel is the bilinear blend of the four neighbouring source
 * pixels, weighted by the 1/16-pel fractional offsets x16, y16 (both in
 * [0,16]); `rounder` is added before the final >>8 (weights sum to 256).
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            dst[j] = (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1] + rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
1108

    
1109
/**
 * Global motion compensation with a full affine motion model.
 * Fills an 8-pixel-wide block of h rows of dst.
 *
 * (ox, oy) is the fixed-point source position of the first pixel: vx>>16
 * yields the position in 1/(1<<shift)-pel units, whose low `shift` bits are
 * the interpolation fraction.  (dxx, dyx) are the per-column increments and
 * (dxy, dyy) the per-row increments of that position.  `r` is the rounding
 * constant added before the final >>(shift*2); width/height bound the valid
 * source area and are used for edge clamping.
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, 
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;
    
    /* convert to inclusive maximum coordinates for the clamping below */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            /* split position into integer pixel and sub-pel fraction */
            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;
  
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside the source: 2D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* off the top/bottom edge: clamp y, interpolate in x only */
                    index= src_x + clip(src_y, 0, height)*stride;                    
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x) 
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* off the left/right edge: clamp x, interpolate in y only */
                    index= clip(src_x, 0, width) + src_y*stride;                    
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y) 
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* outside in both directions: nearest clamped sample */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;                    
                    dst[y*stride + x]=    src[index         ];
                }
            }
            
            vx+= dxx;
            vy+= dyx;
        }
        /* advance the row origin by the per-row motion increments */
        ox += dxy;
        oy += dyy;
    }
}
1166

    
1167
/* Full-pel copy for the thirdpel MC family: dispatch to the plain
 * put_pixels routine matching the block width.  Widths other than
 * 2/4/8/16 are silently ignored, as in the original switch. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c(dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c(dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c(dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1175

    
1176
/* Thirdpel interpolation at horizontal offset 1/3: weights 2 and 1 on the
 * two horizontal neighbours, scaled by 683/2048 ~= 1/3 with rounding. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, dst += stride, src += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (683*(2*src[col] + src[col+1] + 1)) >> 11;
    }
}
1186

    
1187
/* Thirdpel interpolation at horizontal offset 2/3: weights 1 and 2 on the
 * two horizontal neighbours, scaled by 683/2048 ~= 1/3 with rounding. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, dst += stride, src += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
    }
}
1197
    
1198
/* Thirdpel interpolation at vertical offset 1/3: weights 2 and 1 on the
 * two vertical neighbours, scaled by 683/2048 ~= 1/3 with rounding. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, dst += stride, src += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
    }
}
1208
    
1209
/* Thirdpel interpolation near offset (1/3, 1/3): non-separable weight set
 * 4/3/3/2 (sum 12), scaled by 2731/32768 ~= 1/12 with rounding. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, dst += stride, src += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15;
    }
}
1219

    
1220
/* Thirdpel interpolation near offset (1/3, 2/3): non-separable weight set
 * 3/2/4/3 (sum 12), scaled by 2731/32768 ~= 1/12 with rounding. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, dst += stride, src += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
    }
}
1230

    
1231
/* Thirdpel interpolation at vertical offset 2/3: weights 1 and 2 on the
 * two vertical neighbours, scaled by 683/2048 ~= 1/3 with rounding. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, dst += stride, src += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
    }
}
1241

    
1242
/* Thirdpel interpolation near offset (2/3, 1/3): non-separable weight set
 * 3/4/2/3 (sum 12), scaled by 2731/32768 ~= 1/12 with rounding. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, dst += stride, src += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
    }
}
1252

    
1253
/* Thirdpel interpolation near offset (2/3, 2/3): non-separable weight set
 * 2/3/3/4 (sum 12), scaled by 2731/32768 ~= 1/12 with rounding. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, dst += stride, src += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15;
    }
}
1263

    
1264
/* Full-pel averaging for the thirdpel MC family: dispatch to the plain
 * avg_pixels routine matching the block width.  Widths other than
 * 2/4/8/16 are silently ignored, as in the original switch. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c(dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c(dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c(dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1272

    
1273
/* Thirdpel interpolation at horizontal offset 1/3 (weights 2:1, 683/2048
 * ~= 1/3), rounded-averaged into the existing dst pixels. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, dst += stride, src += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((683*(2*src[col] + src[col+1] + 1)) >> 11) + 1) >> 1;
    }
}
1283

    
1284
/* Thirdpel interpolation at horizontal offset 2/3 (weights 1:2, 683/2048
 * ~= 1/3), rounded-averaged into the existing dst pixels. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, dst += stride, src += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((683*(src[col] + 2*src[col+1] + 1)) >> 11) + 1) >> 1;
    }
}
1294
    
1295
/* Thirdpel interpolation at vertical offset 1/3 (weights 2:1, 683/2048
 * ~= 1/3), rounded-averaged into the existing dst pixels. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, dst += stride, src += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((683*(2*src[col] + src[col+stride] + 1)) >> 11) + 1) >> 1;
    }
}
1305
    
1306
/* Thirdpel interpolation near (1/3, 1/3) with weight set 4/3/3/2 (sum 12,
 * 2731/32768 ~= 1/12), rounded-averaged into the existing dst pixels. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, dst += stride, src += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15) + 1) >> 1;
    }
}
1316

    
1317
/* Thirdpel interpolation near (1/3, 2/3) with weight set 3/2/4/3 (sum 12,
 * 2731/32768 ~= 1/12), rounded-averaged into the existing dst pixels. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, dst += stride, src += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15) + 1) >> 1;
    }
}
1327

    
1328
/* Thirdpel interpolation at vertical offset 2/3 (weights 1:2, 683/2048
 * ~= 1/3), rounded-averaged into the existing dst pixels. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, dst += stride, src += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((683*(src[col] + 2*src[col+stride] + 1)) >> 11) + 1) >> 1;
    }
}
1338

    
1339
/* Thirdpel interpolation near (2/3, 1/3) with weight set 3/4/2/3 (sum 12,
 * 2731/32768 ~= 1/12), rounded-averaged into the existing dst pixels. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, dst += stride, src += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15) + 1) >> 1;
    }
}
1349

    
1350
/* Thirdpel interpolation near (2/3, 2/3) with weight set 2/3/3/4 (sum 12,
 * 2731/32768 ~= 1/12), rounded-averaged into the existing dst pixels. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, dst += stride, src += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15) + 1) >> 1;
    }
}
1360
#if 0
1361
#define TPEL_WIDTH(width)\
1362
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1363
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1364
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1365
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1366
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1367
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1368
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1369
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1370
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1371
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1372
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1373
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1374
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1375
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1376
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1377
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1378
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1379
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1380
#endif
1381

    
1382
/*
 * H.264 chroma motion compensation: expands to the 2-, 4- and 8-wide
 * OPNAME##h264_chroma_mc{2,4,8}_c functions.  Each output pixel is the 2D
 * bilinear blend A*p00 + B*p01 + C*p10 + D*p11 with eighth-pel weights
 * derived from (x, y), both asserted to be in [0,8); the weights sum to 64.
 * OP performs the final rounding/downshift and either stores the value
 * (put) or averages it with the existing dst pixel (avg).
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1444

    
1445
/* The bilinear weights above sum to 64, so (b+32)>>6 rounds and removes the
 * 6 bits of weight scaling; op_put stores that value, op_avg additionally
 * takes the rounded average with the previous dst pixel. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

/* Instantiate the put_ and avg_ chroma MC function sets. */
H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1452

    
1453
/* Copy a 4-pixel-wide block of h rows using 32-bit unaligned loads/stores. */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
1463

    
1464
/* Copy an 8-pixel-wide block of h rows using 32-bit unaligned loads/stores. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst    , LD32(src    ));
        ST32(dst + 4, LD32(src + 4));
        dst += dstStride;
        src += srcStride;
    }
}
1475

    
1476
/* Copy a 16-pixel-wide block of h rows using 32-bit unaligned loads/stores. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst     , LD32(src     ));
        ST32(dst +  4, LD32(src +  4));
        ST32(dst +  8, LD32(src +  8));
        ST32(dst + 12, LD32(src + 12));
        dst += dstStride;
        src += srcStride;
    }
}
1489

    
1490
/* Copy a 17-pixel-wide block (16 via 32-bit loads/stores plus one trailing
 * byte per row) — the extra column is needed by the qpel filters. */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst     , LD32(src     ));
        ST32(dst +  4, LD32(src +  4));
        ST32(dst +  8, LD32(src +  8));
        ST32(dst + 12, LD32(src + 12));
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}
1504

    
1505
/* Copy a 9-pixel-wide block (8 via 32-bit loads/stores plus one trailing
 * byte per row) — the extra column is needed by the qpel filters. */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst    , LD32(src    ));
        ST32(dst + 4, LD32(src + 4));
        dst[8] = src[8];
        dst += dstStride;
        src += srcStride;
    }
}
1517

    
1518

    
1519
#define QPEL_MC(r, OPNAME, RND, OP) \
1520
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1521
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1522
    int i;\
1523
    for(i=0; i<h; i++)\
1524
    {\
1525
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1526
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1527
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1528
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1529
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1530
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1531
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1532
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1533
        dst+=dstStride;\
1534
        src+=srcStride;\
1535
    }\
1536
}\
1537
\
1538
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1539
    const int w=8;\
1540
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1541
    int i;\
1542
    for(i=0; i<w; i++)\
1543
    {\
1544
        const int src0= src[0*srcStride];\
1545
        const int src1= src[1*srcStride];\
1546
        const int src2= src[2*srcStride];\
1547
        const int src3= src[3*srcStride];\
1548
        const int src4= src[4*srcStride];\
1549
        const int src5= src[5*srcStride];\
1550
        const int src6= src[6*srcStride];\
1551
        const int src7= src[7*srcStride];\
1552
        const int src8= src[8*srcStride];\
1553
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1554
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1555
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1556
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1557
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1558
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1559
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1560
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1561
        dst++;\
1562
        src++;\
1563
    }\
1564
}\
1565
\
1566
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1567
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1568
    int i;\
1569
    \
1570
    for(i=0; i<h; i++)\
1571
    {\
1572
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1573
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1574
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1575
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1576
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1577
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1578
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1579
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1580
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1581
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1582
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1583
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1584
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1585
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1586
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1587
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1588
        dst+=dstStride;\
1589
        src+=srcStride;\
1590
    }\
1591
}\
1592
\
1593
/* 16-wide vertical MPEG-4 qpel lowpass: applies the (20,-6,3,-1) half-pel\
   filter down each of 16 columns; the last rows reuse mirrored edge samples\
   (matching the horizontal variant above).  Pagination-number lines from the\
   pasted listing were removed so the macro's backslash splicing works again. */\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;  /* clipping table used by OP() */\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
1636
\
1637
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1638
    OPNAME ## pixels8_c(dst, src, stride, 8);\
1639
}\
1640
\
1641
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1642
    uint8_t half[64];\
1643
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1644
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1645
}\
1646
\
1647
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1648
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1649
}\
1650
\
1651
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1652
    uint8_t half[64];\
1653
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1654
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1655
}\
1656
\
1657
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1658
    uint8_t full[16*9];\
1659
    uint8_t half[64];\
1660
    copy_block9(full, src, 16, stride, 9);\
1661
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1662
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1663
}\
1664
\
1665
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1666
    uint8_t full[16*9];\
1667
    copy_block9(full, src, 16, stride, 9);\
1668
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1669
}\
1670
\
1671
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1672
    uint8_t full[16*9];\
1673
    uint8_t half[64];\
1674
    copy_block9(full, src, 16, stride, 9);\
1675
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1676
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1677
}\
1678
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1679
    uint8_t full[16*9];\
1680
    uint8_t halfH[72];\
1681
    uint8_t halfV[64];\
1682
    uint8_t halfHV[64];\
1683
    copy_block9(full, src, 16, stride, 9);\
1684
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1685
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1686
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1687
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1688
}\
1689
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1690
    uint8_t full[16*9];\
1691
    uint8_t halfH[72];\
1692
    uint8_t halfHV[64];\
1693
    copy_block9(full, src, 16, stride, 9);\
1694
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1695
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1696
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1697
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1698
}\
1699
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1700
    uint8_t full[16*9];\
1701
    uint8_t halfH[72];\
1702
    uint8_t halfV[64];\
1703
    uint8_t halfHV[64];\
1704
    copy_block9(full, src, 16, stride, 9);\
1705
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1706
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1707
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1708
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1709
}\
1710
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1711
    uint8_t full[16*9];\
1712
    uint8_t halfH[72];\
1713
    uint8_t halfHV[64];\
1714
    copy_block9(full, src, 16, stride, 9);\
1715
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1716
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1717
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1718
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1719
}\
1720
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1721
    uint8_t full[16*9];\
1722
    uint8_t halfH[72];\
1723
    uint8_t halfV[64];\
1724
    uint8_t halfHV[64];\
1725
    copy_block9(full, src, 16, stride, 9);\
1726
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1727
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1728
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1729
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1730
}\
1731
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1732
    uint8_t full[16*9];\
1733
    uint8_t halfH[72];\
1734
    uint8_t halfHV[64];\
1735
    copy_block9(full, src, 16, stride, 9);\
1736
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1737
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1738
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1739
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1740
}\
1741
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1742
    uint8_t full[16*9];\
1743
    uint8_t halfH[72];\
1744
    uint8_t halfV[64];\
1745
    uint8_t halfHV[64];\
1746
    copy_block9(full, src, 16, stride, 9);\
1747
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1748
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1749
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1750
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1751
}\
1752
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1753
    uint8_t full[16*9];\
1754
    uint8_t halfH[72];\
1755
    uint8_t halfHV[64];\
1756
    copy_block9(full, src, 16, stride, 9);\
1757
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1758
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1759
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1760
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1761
}\
1762
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1763
    uint8_t halfH[72];\
1764
    uint8_t halfHV[64];\
1765
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1766
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1767
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1768
}\
1769
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1770
    uint8_t halfH[72];\
1771
    uint8_t halfHV[64];\
1772
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1773
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1774
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1775
}\
1776
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1777
    uint8_t full[16*9];\
1778
    uint8_t halfH[72];\
1779
    uint8_t halfV[64];\
1780
    uint8_t halfHV[64];\
1781
    copy_block9(full, src, 16, stride, 9);\
1782
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1783
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1784
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1785
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1786
}\
1787
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1788
    uint8_t full[16*9];\
1789
    uint8_t halfH[72];\
1790
    copy_block9(full, src, 16, stride, 9);\
1791
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1792
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1793
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1794
}\
1795
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1796
    uint8_t full[16*9];\
1797
    uint8_t halfH[72];\
1798
    uint8_t halfV[64];\
1799
    uint8_t halfHV[64];\
1800
    copy_block9(full, src, 16, stride, 9);\
1801
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1802
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1803
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1804
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1805
}\
1806
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1807
    uint8_t full[16*9];\
1808
    uint8_t halfH[72];\
1809
    copy_block9(full, src, 16, stride, 9);\
1810
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1811
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1812
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1813
}\
1814
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1815
    uint8_t halfH[72];\
1816
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1817
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1818
}\
1819
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1820
    OPNAME ## pixels16_c(dst, src, stride, 16);\
1821
}\
1822
\
1823
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1824
    uint8_t half[256];\
1825
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1826
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1827
}\
1828
\
1829
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1830
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1831
}\
1832
\
1833
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1834
    uint8_t half[256];\
1835
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1836
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1837
}\
1838
\
1839
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1840
    uint8_t full[24*17];\
1841
    uint8_t half[256];\
1842
    copy_block17(full, src, 24, stride, 17);\
1843
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1844
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1845
}\
1846
\
1847
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1848
    uint8_t full[24*17];\
1849
    copy_block17(full, src, 24, stride, 17);\
1850
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1851
}\
1852
\
1853
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1854
    uint8_t full[24*17];\
1855
    uint8_t half[256];\
1856
    copy_block17(full, src, 24, stride, 17);\
1857
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1858
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1859
}\
1860
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1861
    uint8_t full[24*17];\
1862
    uint8_t halfH[272];\
1863
    uint8_t halfV[256];\
1864
    uint8_t halfHV[256];\
1865
    copy_block17(full, src, 24, stride, 17);\
1866
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1867
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1868
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1869
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1870
}\
1871
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1872
    uint8_t full[24*17];\
1873
    uint8_t halfH[272];\
1874
    uint8_t halfHV[256];\
1875
    copy_block17(full, src, 24, stride, 17);\
1876
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1877
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1878
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1879
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1880
}\
1881
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1882
    uint8_t full[24*17];\
1883
    uint8_t halfH[272];\
1884
    uint8_t halfV[256];\
1885
    uint8_t halfHV[256];\
1886
    copy_block17(full, src, 24, stride, 17);\
1887
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1888
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1889
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1890
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1891
}\
1892
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1893
    uint8_t full[24*17];\
1894
    uint8_t halfH[272];\
1895
    uint8_t halfHV[256];\
1896
    copy_block17(full, src, 24, stride, 17);\
1897
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1898
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1899
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1900
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1901
}\
1902
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1903
    uint8_t full[24*17];\
1904
    uint8_t halfH[272];\
1905
    uint8_t halfV[256];\
1906
    uint8_t halfHV[256];\
1907
    copy_block17(full, src, 24, stride, 17);\
1908
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1909
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1910
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1911
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1912
}\
1913
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1914
    uint8_t full[24*17];\
1915
    uint8_t halfH[272];\
1916
    uint8_t halfHV[256];\
1917
    copy_block17(full, src, 24, stride, 17);\
1918
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1919
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1920
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1921
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1922
}\
1923
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1924
    uint8_t full[24*17];\
1925
    uint8_t halfH[272];\
1926
    uint8_t halfV[256];\
1927
    uint8_t halfHV[256];\
1928
    copy_block17(full, src, 24, stride, 17);\
1929
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1930
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1931
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1932
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1933
}\
1934
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1935
    uint8_t full[24*17];\
1936
    uint8_t halfH[272];\
1937
    uint8_t halfHV[256];\
1938
    copy_block17(full, src, 24, stride, 17);\
1939
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1940
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1941
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1942
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1943
}\
1944
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1945
    uint8_t halfH[272];\
1946
    uint8_t halfHV[256];\
1947
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1948
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1949
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1950
}\
1951
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1952
    uint8_t halfH[272];\
1953
    uint8_t halfHV[256];\
1954
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1955
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1956
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1957
}\
1958
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1959
    uint8_t full[24*17];\
1960
    uint8_t halfH[272];\
1961
    uint8_t halfV[256];\
1962
    uint8_t halfHV[256];\
1963
    copy_block17(full, src, 24, stride, 17);\
1964
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1965
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1966
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1967
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1968
}\
1969
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1970
    uint8_t full[24*17];\
1971
    uint8_t halfH[272];\
1972
    copy_block17(full, src, 24, stride, 17);\
1973
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1974
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1975
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1976
}\
1977
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1978
    uint8_t full[24*17];\
1979
    uint8_t halfH[272];\
1980
    uint8_t halfV[256];\
1981
    uint8_t halfHV[256];\
1982
    copy_block17(full, src, 24, stride, 17);\
1983
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1984
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1985
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1986
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1987
}\
1988
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1989
    uint8_t full[24*17];\
1990
    uint8_t halfH[272];\
1991
    copy_block17(full, src, 24, stride, 17);\
1992
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1993
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1994
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1995
}\
1996
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1997
    uint8_t halfH[272];\
1998
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1999
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2000
}
2001

    
2002
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2003
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2004
#define op_put(a, b) a = cm[((b) + 16)>>5]
2005
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2006

    
2007
QPEL_MC(0, put_       , _       , op_put)
2008
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2009
QPEL_MC(0, avg_       , _       , op_avg)
2010
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
2011
#undef op_avg
2012
#undef op_avg_no_rnd
2013
#undef op_put
2014
#undef op_put_no_rnd
2015

    
2016
#if 1
/* H.264 6-tap (1,-5,20,20,-5,1) half-pel lowpass filters, generated per
 * operator via OPNAME/OP/OP2.  _h filters horizontally, _v vertically, _hv
 * filters horizontally into a 16-bit tmp plane and then vertically (OP2
 * applies the wider-range final rounding/clip).
 * (Pagination artifacts from the pasted listing were removed so the macro's
 * backslash line-splicing is intact again.) */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
2092
\
2093
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2094
    const int h=8;\
2095
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2096
    int i;\
2097
    for(i=0; i<h; i++)\
2098
    {\
2099
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2100
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2101
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2102
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2103
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2104
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2105
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2106
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2107
        dst+=dstStride;\
2108
        src+=srcStride;\
2109
    }\
2110
}\
2111
\
2112
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2113
    const int w=8;\
2114
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2115
    int i;\
2116
    for(i=0; i<w; i++)\
2117
    {\
2118
        const int srcB= src[-2*srcStride];\
2119
        const int srcA= src[-1*srcStride];\
2120
        const int src0= src[0 *srcStride];\
2121
        const int src1= src[1 *srcStride];\
2122
        const int src2= src[2 *srcStride];\
2123
        const int src3= src[3 *srcStride];\
2124
        const int src4= src[4 *srcStride];\
2125
        const int src5= src[5 *srcStride];\
2126
        const int src6= src[6 *srcStride];\
2127
        const int src7= src[7 *srcStride];\
2128
        const int src8= src[8 *srcStride];\
2129
        const int src9= src[9 *srcStride];\
2130
        const int src10=src[10*srcStride];\
2131
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2132
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2133
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2134
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2135
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2136
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2137
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2138
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2139
        dst++;\
2140
        src++;\
2141
    }\
2142
}\
2143
\
2144
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2145
    const int h=8;\
2146
    const int w=8;\
2147
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2148
    int i;\
2149
    src -= 2*srcStride;\
2150
    for(i=0; i<h+5; i++)\
2151
    {\
2152
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2153
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2154
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2155
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2156
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2157
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2158
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2159
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2160
        tmp+=tmpStride;\
2161
        src+=srcStride;\
2162
    }\
2163
    tmp -= tmpStride*(h+5-2);\
2164
    for(i=0; i<w; i++)\
2165
    {\
2166
        const int tmpB= tmp[-2*tmpStride];\
2167
        const int tmpA= tmp[-1*tmpStride];\
2168
        const int tmp0= tmp[0 *tmpStride];\
2169
        const int tmp1= tmp[1 *tmpStride];\
2170
        const int tmp2= tmp[2 *tmpStride];\
2171
        const int tmp3= tmp[3 *tmpStride];\
2172
        const int tmp4= tmp[4 *tmpStride];\
2173
        const int tmp5= tmp[5 *tmpStride];\
2174
        const int tmp6= tmp[6 *tmpStride];\
2175
        const int tmp7= tmp[7 *tmpStride];\
2176
        const int tmp8= tmp[8 *tmpStride];\
2177
        const int tmp9= tmp[9 *tmpStride];\
2178
        const int tmp10=tmp[10*tmpStride];\
2179
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2180
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2181
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2182
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2183
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2184
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2185
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2186
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2187
        dst++;\
2188
        tmp++;\
2189
    }\
2190
}\
2191
\
2192
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2193
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2194
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2195
    src += 8*srcStride;\
2196
    dst += 8*dstStride;\
2197
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2198
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2199
}\
2200
\
2201
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2202
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2203
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2204
    src += 8*srcStride;\
2205
    dst += 8*dstStride;\
2206
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2207
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2208
}\
2209
\
2210
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2211
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2212
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2213
    src += 8*srcStride;\
2214
    dst += 8*dstStride;\
2215
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2216
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2217
}\
2218

    
2219
/**
 * H264_MC(OPNAME, SIZE) expands to the 16 quarter-pel motion compensation
 * functions OPNAME##h264_qpel##SIZE##_mcXY_c for one SIZExSIZE block,
 * where X and Y are the horizontal and vertical quarter-pel offsets (0..3).
 * Half-pel positions come straight from the 6-tap _h/_v/_hv lowpass
 * filters; quarter-pel positions combine two neighbouring predictions via
 * OPNAME##pixels##SIZE##_l2.  Vertical cases first copy SIZE+5 source rows
 * (two above, three below) into a temporary so the filter can read outside
 * the block.  OPNAME is "put_" or "avg_" (see the instantiations below).
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}
2355

    
2356
/* Rounding/store ops plugged into H264_LOWPASS: cm[] is the crop table
   clamping to 0..255.  One filter pass is normalised with (+16)>>5, the
   two-pass (hv) path with (+512)>>10.  op_put stores the result, op_avg
   rounds-up-averages it with the existing destination sample. */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Generate the filter kernels and the full set of quarter-pel MC
   functions for both the "put" and "avg" variants, at 4/8/16 block size. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif /* closes a conditional opened earlier in the file */
2376

    
2377
/* Clamp x to the 8-bit sample range [0, 255]. */
static inline uint8_t clip1(int x){
    return x < 0 ? 0 : (x > 255 ? 255 : x);
}
2382
/* Per-sample weighting ops: op_scale1 rescales in place (explicit
   weighted prediction), op_scale2 blends src and dst (bidirectional). */
#define op_scale1(x)  block[x] = clip1( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = clip( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1), 0, 255 )
/**
 * H264_WEIGHT(W,H) expands to the two weighted-prediction routines for a
 * WxH block:
 *  - weight_h264_pixels_WxH_c:   block[i] = clip1((block[i]*weight + offset') >> log2_denom)
 *    where offset' = (offset << log2_denom) + rounding term.
 *  - biweight_h264_pixels_WxH_c: dst[i] = clip((src[i]*weights + dst[i]*weightd + offset') >> (log2_denom+1))
 *    where offset' combines the two per-reference offsets with rounding.
 * The column positions are the literal arguments of op_scale*, so no
 * column counter is needed (the previously declared 'x' was unused).
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \
    int y; \
    int offset = (offsets + offsetd + 1) >> 1; \
    offset = ((offset << 1) + 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}
2437

    
2438
/* Instantiate the weighting routines for every H.264 block size used,
   from 16x16 down to 2x2. */
H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

/* The helpers are only meaningful inside H264_WEIGHT expansions. */
#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2452

    
2453
/**
 * WMV2 horizontal half-pel filter: 8 output columns per row, each
 * (9*(src[j]+src[j+1]) - (src[j-1]+src[j+2]) + 8) >> 4, clamped through
 * the crop table.  Reads one sample left and two right of the row.
 */
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
    int row, j;

    for(row = 0; row < h; row++){
        for(j = 0; j < 8; j++)
            dst[j] = cm[(9*(src[j] + src[j+1]) - (src[j-1] + src[j+2]) + 8)>>4];
        dst += dstStride;
        src += srcStride;
    }
}
2470

    
2471
/**
 * WMV2 vertical half-pel filter: 8 output rows per column, each
 * (9*(src[k]+src[k+1]) - (src[k-1]+src[k+2]) + 8) >> 4 along the column,
 * clamped through the crop table.  Reads one row above and two below.
 */
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
    int col, k;

    for(col = 0; col < w; col++){
        for(k = 0; k < 8; k++){
            dst[k*dstStride] = cm[(9*(src[ k   *srcStride] + src[(k+1)*srcStride])
                                  -  (src[(k-1)*srcStride] + src[(k+2)*srcStride]) + 8)>>4];
        }
        src++;
        dst++;
    }
}
2499

    
2500
/* (0,0): integer-pel position — plain 8x8 block copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2503

    
2504
/* (1/4,0): combines the full-pel block with the horizontal half-pel
   interpolation via put_pixels8_l2. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
2509

    
2510
/* (1/2,0): horizontal half-pel filter written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2513

    
2514
/* (3/4,0): combines the pixel to the right (src+1) with the horizontal
   half-pel interpolation via put_pixels8_l2. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
2519

    
2520
/* (0,1/2): vertical half-pel filter written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2523

    
2524
/* (1/4,1/2): combines the vertical half-pel block (halfV) with the
   separable H-then-V filtered block (halfHV).  halfH holds 11 filtered
   rows starting one row above the block; halfH+8 is the row aligned with
   src, so the vertical pass can read one row above and two below it. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
2533
/* (3/4,1/2): like mc12 but the vertical half-pel block is taken one
   pixel to the right (src+1). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
2542
/* (1/2,1/2): separable filter — horizontal pass into halfH (11 rows
   starting one row above the block), then vertical pass from halfH+8
   (the row aligned with src) straight into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2547

    
2548
/**
 * H.263 deblocking across a horizontal block edge.
 * src points at the first row below the edge; for 8 columns the two rows
 * on each side (src[-2*stride] .. src[+stride]) are adjusted.  The filter
 * strength is looked up from the quantiser.
 */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];
    
    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;  /* gradient across the edge */

        /* piecewise-linear ramp: full correction for small |d|, tapering
           to zero once |d| reaches 2*strength (real edges are preserved) */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;
        
        p1 += d1;
        p2 -= d1;
        /* branchless clamp to 0..255: bit 8 set means out of range;
           ~(p>>31) is 255 after positive overflow, 0 after underflow */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);
        
        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        /* the outer pixels receive at most half the inner correction */
        ad1= ABS(d1)>>1;
        
        d2= clip((p0-p3)/4, -ad1, ad1);
        
        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
}
2582

    
2583
/**
 * H.263 deblocking across a vertical block edge — same filter as
 * h263_v_loop_filter_c but transposed: src points at the first column
 * right of the edge and 8 rows are processed.
 */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];
    
    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;  /* gradient across the edge */

        /* piecewise-linear ramp, tapering off for strong edges */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;
        
        p1 += d1;
        p2 -= d1;
        /* branchless clamp to 0..255 (see h263_v_loop_filter_c) */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);
        
        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        /* outer pixels get at most half the inner correction */
        ad1= ABS(d1)>>1;
        
        d2= clip((p0-p3)/4, -ad1, ad1);
        
        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
}
2617

    
2618
/**
 * H.261 in-loop filter: separable [1 2 1]/4 smoothing of an 8x8 block in
 * place.  Border rows/columns are passed through unfiltered; intermediate
 * results are kept in a 4x-scaled temp so rounding happens only once.
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int temp[64];
    int row, col;

    /* vertical pass into temp[] (values scaled by 4); the top and bottom
       rows are copied through, times 4 */
    for(col=0; col<8; col++){
        temp[col      ] = 4*src[col           ];
        temp[col + 7*8] = 4*src[col + 7*stride];
    }
    for(row=1; row<7; row++){
        for(col=0; col<8; col++){
            temp[row*8 + col] = src[row*stride + col - stride]
                              + 2*src[row*stride + col]
                              +   src[row*stride + col + stride];
        }
    }

    /* horizontal pass back into src with rounding; the left and right
       columns are copied through */
    for(row=0; row<8; row++){
        src[  row*stride] = (temp[  row*8] + 2)>>2;
        src[7+row*stride] = (temp[7+row*8] + 2)>>2;
        for(col=1; col<7; col++){
            src[row*stride + col] = (temp[row*8 + col - 1]
                                   + 2*temp[row*8 + col]
                                   +   temp[row*8 + col + 1] + 8)>>4;
        }
    }
}
2644

    
2645
/* Sum of absolute differences between two 16-wide blocks of h rows. */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for(row = 0; row < h; row++){
        for(col = 0; col < 16; col++)
            total += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2672

    
2673
/* SAD of pix1 against pix2 interpolated half a pel to the right
   (avg2 of each sample and its right neighbour), 16 wide, h rows. */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for(row = 0; row < h; row++){
        for(col = 0; col < 16; col++)
            total += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2700

    
2701
/* SAD of pix1 against pix2 interpolated half a pel downwards
   (avg2 of vertically adjacent samples), 16 wide, h rows. */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, col;

    for(row = 0; row < h; row++){
        for(col = 0; col < 16; col++)
            total += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2730

    
2731
/* SAD of pix1 against pix2 interpolated half a pel right and down
   (avg4 of the 2x2 neighbourhood), 16 wide, h rows. */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, col;

    for(row = 0; row < h; row++){
        for(col = 0; col < 16; col++)
            total += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2760

    
2761
/* Sum of absolute differences between two 8-wide blocks of h rows. */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for(row = 0; row < h; row++){
        for(col = 0; col < 8; col++)
            total += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2780

    
2781
/* SAD of pix1 against pix2 interpolated half a pel to the right,
   8 wide, h rows. */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for(row = 0; row < h; row++){
        for(col = 0; col < 8; col++)
            total += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2800

    
2801
/* SAD of pix1 against pix2 interpolated half a pel downwards,
   8 wide, h rows. */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, col;

    for(row = 0; row < h; row++){
        for(col = 0; col < 8; col++)
            total += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2822

    
2823
/* SAD of pix1 against pix2 interpolated half a pel right and down
   (avg4 of the 2x2 neighbourhood), 8 wide, h rows. */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, col;

    for(row = 0; row < h; row++){
        for(col = 0; col < 8; col++)
            total += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2844

    
2845
/**
 * "Noise shaping" SSE for a 16-wide block: score1 is the plain SSE
 * between s1 and s2; score2 accumulates the difference between the 2x2
 * cross gradients of s1 and s2.  The gradient term is weighted by
 * avctx->nsse_weight, or by 8 when no context is available.
 */
static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){  /* gradient needs the next row, so skip the last one */
            for(x=0; x<15; x++){
                score2+= ABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -ABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
    else  return score1 + ABS(score2)*8;
}
2869

    
2870
/**
 * 8-wide variant of nsse16_c: SSE plus the weighted difference of the
 * 2x2 cross gradients of the two blocks.
 */
static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score1=0;
    int score2=0;
    int x,y;
    
    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){  /* gradient needs the next row, so skip the last one */
            for(x=0; x<7; x++){
                score2+= ABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -ABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }
    
    if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
    else  return score1 + ABS(score2)*8;
}
2894

    
2895
/**
 * Scores the effect of adding basis[]*scale to the residual rem[]:
 * for each of the 64 coefficients the scaled basis contribution is
 * rounded down from BASIS_SHIFT to RECON_SHIFT fixed-point precision,
 * added to rem, and the weighted square (w*b)^2 is accumulated.
 * Returns the accumulated weighted squared error, scaled down by >>2.
 */
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        /* rem + scale*basis, rounded to RECON_SHIFT precision */
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}
2909

    
2910
/**
 * rem[i] += basis[i]*scale for all 64 coefficients, with the product
 * rounded down from BASIS_SHIFT to RECON_SHIFT fixed-point precision.
 */
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }    
}
2917

    
2918
/**
 * permutes an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
 *                  (inverse) permuted to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];
    
    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms

    /* first pass: save the coefficients up to 'last' (in scan order) into
       temp[] and clear them in block[], so writes below cannot collide */
    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }
    
    /* second pass: write each saved coefficient back at its permuted
       position */
    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}
2946

    
2947
/* Comparison function that ignores its inputs and always returns 0;
   selected for FF_CMP_ZERO in ff_set_cmp(). */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
2950

    
2951
/**
 * Fills cmp[0..4] with the comparison function selected by the low byte
 * of type, one entry per block-size slot of the DSPContext tables.
 * Unknown types are reported via av_log and leave the slot zeroed.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;
    
    /* use sizeof(*cmp): function pointers are not guaranteed to have the
       same size as void* by ISO C */
    memset(cmp, 0, 5*sizeof(*cmp));
        
    for(i=0; i<5; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
3005

    
3006
/**
 * Zeroes 6 blocks of 64 DCT coefficients, i.e.
 * memset(blocks, 0, sizeof(DCTELEM)*6*64).
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
3013

    
3014
/* dst[i] += src[i] for 0 <= i < w; byte arithmetic wraps modulo 256. */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;
    for(i = 0; i < w; i++)
        dst[i] += src[i];
}
3029

    
3030
/* Byte-wise dst[i] = src1[i] - src2[i] for 0 <= i < w (wraps mod 256). */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int n = 0;

    /* main loop: eight differences per iteration */
    while (n + 7 < w) {
        dst[n + 0] = src1[n + 0] - src2[n + 0];
        dst[n + 1] = src1[n + 1] - src2[n + 1];
        dst[n + 2] = src1[n + 2] - src2[n + 2];
        dst[n + 3] = src1[n + 3] - src2[n + 3];
        dst[n + 4] = src1[n + 4] - src2[n + 4];
        dst[n + 5] = src1[n + 5] - src2[n + 5];
        dst[n + 6] = src1[n + 6] - src2[n + 6];
        dst[n + 7] = src1[n + 7] - src2[n + 7];
        n += 8;
    }
    /* tail: remaining 0..7 bytes */
    while (n < w) {
        dst[n] = src1[n] - src2[n];
        n++;
    }
}
3045

    
3046
/**
 * HuffYUV median-prediction residual: dst[i] = src2[i] minus the median
 * of (left, top, left + top - topleft), where "top" is src1 (previous
 * line) and the new "left" comes from src2 (current line).
 * *left and *left_top carry the running predictors across calls and are
 * updated on return.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l  = *left;      /* pixel to the left (on the output line) */
    uint8_t lt = *left_top;  /* pixel above-left */

    for (i = 0; i < w; i++) {
        const int top  = src1[i];
        const int grad = (l + top - lt) & 0xFF; /* gradient term, wrapped to 8 bits */
        /* median of three via max(min(a,b), min(max(a,b), c)) */
        const int lo   = l < top ? l : top;
        const int hi   = l < top ? top : l;
        const int mid  = hi < grad ? hi : grad;
        const int pred = lo > mid ? lo : mid;

        lt = src1[i];
        l  = src2[i];
        dst[i] = l - pred;
    }

    *left     = l;
    *left_top = lt;
}
3063

    
3064
/* Two-point butterfly: o1/o2 receive sum and difference of i1/i2.
 * NOTE: i1 and i2 are each evaluated twice -- pass side-effect-free args. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place two-point butterfly on the lvalues x and y. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Sum of absolute values of the butterfly outputs of x and y. */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
3078

    
3079
/**
 * 8x8 SATD: sum of absolute values of the 2D Hadamard transform of the
 * difference block src - dst.
 * Fix: removed the dead "#if 0 ... static int maxi ..." debug block that
 * printed a running maximum (dead code, and a static declaration placed
 * after statements).
 * @param s unused context pointer (kept for the me_cmp_func signature)
 * @param h must be 8; only full 8x8 blocks are supported
 * @return the SATD score (non-negative)
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: 1D Hadamard of each difference row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass: the last butterfly stage is folded into the abs-sum */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}
3130

    
3131
/**
 * 8x8 intra SATD: sum of absolute values of the 2D Hadamard transform of
 * the source block itself, with the DC-proportional term subtracted.
 * @param h must be 8
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int row, col;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: 1D Hadamard of each source row */
    for(row=0; row<8; row++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*row+0], temp[8*row+1], src[stride*row+0],src[stride*row+1]);
        BUTTERFLY2(temp[8*row+2], temp[8*row+3], src[stride*row+2],src[stride*row+3]);
        BUTTERFLY2(temp[8*row+4], temp[8*row+5], src[stride*row+4],src[stride*row+5]);
        BUTTERFLY2(temp[8*row+6], temp[8*row+7], src[stride*row+6],src[stride*row+7]);

        BUTTERFLY1(temp[8*row+0], temp[8*row+2]);
        BUTTERFLY1(temp[8*row+1], temp[8*row+3]);
        BUTTERFLY1(temp[8*row+4], temp[8*row+6]);
        BUTTERFLY1(temp[8*row+5], temp[8*row+7]);

        BUTTERFLY1(temp[8*row+0], temp[8*row+4]);
        BUTTERFLY1(temp[8*row+1], temp[8*row+5]);
        BUTTERFLY1(temp[8*row+2], temp[8*row+6]);
        BUTTERFLY1(temp[8*row+3], temp[8*row+7]);
    }

    /* vertical pass: last butterfly stage folded into the abs-sum */
    for(col=0; col<8; col++){
        BUTTERFLY1(temp[8*0+col], temp[8*1+col]);
        BUTTERFLY1(temp[8*2+col], temp[8*3+col]);
        BUTTERFLY1(temp[8*4+col], temp[8*5+col]);
        BUTTERFLY1(temp[8*6+col], temp[8*7+col]);

        BUTTERFLY1(temp[8*0+col], temp[8*2+col]);
        BUTTERFLY1(temp[8*1+col], temp[8*3+col]);
        BUTTERFLY1(temp[8*4+col], temp[8*6+col]);
        BUTTERFLY1(temp[8*5+col], temp[8*7+col]);

        sum +=
             BUTTERFLYA(temp[8*0+col], temp[8*4+col])
            +BUTTERFLYA(temp[8*1+col], temp[8*5+col])
            +BUTTERFLYA(temp[8*2+col], temp[8*6+col])
            +BUTTERFLYA(temp[8*3+col], temp[8*7+col]);
    }

    sum -= ABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3178

    
3179
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3180
    MpegEncContext * const s= (MpegEncContext *)c;
3181
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3182
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3183
    int sum=0, i;
3184
    
3185
    assert(h==8);
3186

    
3187
    s->dsp.diff_pixels(temp, src1, src2, stride);
3188
    s->dsp.fdct(temp);
3189

    
3190
    for(i=0; i<64; i++)
3191
        sum+= ABS(temp[i]);
3192
        
3193
    return sum;
3194
}
3195

    
3196
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3197
    MpegEncContext * const s= (MpegEncContext *)c;
3198
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3199
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3200
    int sum=0, i;
3201
    
3202
    assert(h==8);
3203

    
3204
    s->dsp.diff_pixels(temp, src1, src2, stride);
3205
    s->dsp.fdct(temp);
3206

    
3207
    for(i=0; i<64; i++)
3208
        sum= FFMAX(sum, ABS(temp[i]));
3209
        
3210
    return sum;
3211
}
3212

    
3213
void simple_idct(DCTELEM *block); //FIXME
3214

    
3215
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3216
    MpegEncContext * const s= (MpegEncContext *)c;
3217
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
3218
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3219
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3220
    int sum=0, i;
3221

    
3222
    assert(h==8);
3223
    s->mb_intra=0;
3224
    
3225
    s->dsp.diff_pixels(temp, src1, src2, stride);
3226
    
3227
    memcpy(bak, temp, 64*sizeof(DCTELEM));
3228
    
3229
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3230
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
3231
    simple_idct(temp); //FIXME 
3232
    
3233
    for(i=0; i<64; i++)
3234
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3235
        
3236
    return sum;
3237
}
3238

    
3239
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3240
    MpegEncContext * const s= (MpegEncContext *)c;
3241
    const uint8_t *scantable= s->intra_scantable.permutated;
3242
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3243
    uint64_t __align8 aligned_bak[stride];
3244
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3245
    uint8_t * const bak= (uint8_t*)aligned_bak;
3246
    int i, last, run, bits, level, distoration, start_i;
3247
    const int esc_length= s->ac_esc_length;
3248
    uint8_t * length;
3249
    uint8_t * last_length;
3250
    
3251
    assert(h==8);
3252

    
3253
    for(i=0; i<8; i++){
3254
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3255
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3256
    }
3257

    
3258
    s->dsp.diff_pixels(temp, src1, src2, stride);
3259

    
3260
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3261

    
3262
    bits=0;
3263
    
3264
    if (s->mb_intra) {
3265
        start_i = 1; 
3266
        length     = s->intra_ac_vlc_length;
3267
        last_length= s->intra_ac_vlc_last_length;
3268
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3269
    } else {
3270
        start_i = 0;
3271
        length     = s->inter_ac_vlc_length;
3272
        last_length= s->inter_ac_vlc_last_length;
3273
    }
3274
    
3275
    if(last>=start_i){
3276
        run=0;
3277
        for(i=start_i; i<last; i++){
3278
            int j= scantable[i];
3279
            level= temp[j];
3280
        
3281
            if(level){
3282
                level+=64;
3283
                if((level&(~127)) == 0){
3284
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
3285
                }else
3286
                    bits+= esc_length;
3287
                run=0;
3288
            }else
3289
                run++;
3290
        }
3291
        i= scantable[last];
3292
       
3293
        level= temp[i] + 64;
3294

    
3295
        assert(level - 64);
3296
        
3297
        if((level&(~127)) == 0){
3298
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3299
        }else
3300
            bits+= esc_length;
3301
    
3302
    }
3303

    
3304
    if(last>=0){
3305
        if(s->mb_intra)
3306
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
3307
        else
3308
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
3309
    }
3310
    
3311
    s->dsp.idct_add(bak, stride, temp);
3312
    
3313
    distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3314

    
3315
    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3316
}
3317

    
3318
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3319
    MpegEncContext * const s= (MpegEncContext *)c;
3320
    const uint8_t *scantable= s->intra_scantable.permutated;
3321
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3322
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3323
    int i, last, run, bits, level, start_i;
3324
    const int esc_length= s->ac_esc_length;
3325
    uint8_t * length;
3326
    uint8_t * last_length;
3327

    
3328
    assert(h==8);
3329
    
3330
    s->dsp.diff_pixels(temp, src1, src2, stride);
3331

    
3332
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3333

    
3334
    bits=0;
3335
    
3336
    if (s->mb_intra) {
3337
        start_i = 1; 
3338
        length     = s->intra_ac_vlc_length;
3339
        last_length= s->intra_ac_vlc_last_length;
3340
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3341
    } else {
3342
        start_i = 0;
3343
        length     = s->inter_ac_vlc_length;
3344
        last_length= s->inter_ac_vlc_last_length;
3345
    }
3346
    
3347
    if(last>=start_i){
3348
        run=0;
3349
        for(i=start_i; i<last; i++){
3350
            int j= scantable[i];
3351
            level= temp[j];
3352
        
3353
            if(level){
3354
                level+=64;
3355
                if((level&(~127)) == 0){
3356
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
3357
                }else
3358
                    bits+= esc_length;
3359
                run=0;
3360
            }else
3361
                run++;
3362
        }
3363
        i= scantable[last];
3364
                
3365
        level= temp[i] + 64;
3366
        
3367
        assert(level - 64);
3368
        
3369
        if((level&(~127)) == 0){
3370
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3371
        }else
3372
            bits+= esc_length;
3373
    }
3374

    
3375
    return bits;
3376
}
3377

    
3378
/* Vertical SAD within a 16-pixel-wide block: sum of |s[x,y] - s[x,y+1]|
 * over all 16 columns and h-1 row pairs. */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++) {
            const int d = s[x] - s[x + stride];
            score += d < 0 ? -d : d;
        }
        s += stride;
    }

    return score;
}
3392

    
3393
/* Vertical SAD of the residual s1-s2: penalizes vertical high-frequency
 * content of the difference signal over 16 columns and h-1 row pairs. */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++) {
            const int d = s1[x] - s2[x] - s1[x + stride] + s2[x + stride];
            score += d < 0 ? -d : d;
        }
        s1 += stride;
        s2 += stride;
    }

    return score;
}
3407

    
3408
#define SQ(a) ((a)*(a))

/* Vertical SSE within a 16-pixel-wide block: sum of squared differences
 * between vertically adjacent pixels, over 16 columns and h-1 row pairs. */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            score += SQ(s[x] - s[x + stride]);
        s += stride;
    }

    return score;
}
3423

    
3424
/* Vertical SSE of the residual s1-s2: squared vertical gradient of the
 * difference signal over 16 columns and h-1 row pairs. */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++) {
            const int d = s1[x] - s2[x] - s1[x + stride] + s2[x + stride];
            score += d * d;
        }
        s1 += stride;
        s2 += stride;
    }

    return score;
}
3438

    
3439
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3440
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3441
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3442
WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3443
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3444
WARPER8_16_SQ(rd8x8_c, rd16_c)
3445
WARPER8_16_SQ(bit8x8_c, bit16_c)
3446

    
3447
/* XXX: those functions should be suppressed ASAP when all IDCTs are
3448
 converted */
3449
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3450
{
3451
    j_rev_dct (block);
3452
    put_pixels_clamped_c(block, dest, line_size);
3453
}
3454
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3455
{
3456
    j_rev_dct (block);
3457
    add_pixels_clamped_c(block, dest, line_size);
3458
}
3459

    
3460
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3461
{
3462
    j_rev_dct4 (block);
3463
    put_pixels_clamped4_c(block, dest, line_size);
3464
}
3465
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3466
{
3467
    j_rev_dct4 (block);
3468
    add_pixels_clamped4_c(block, dest, line_size);
3469
}
3470

    
3471
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3472
{
3473
    j_rev_dct2 (block);
3474
    put_pixels_clamped2_c(block, dest, line_size);
3475
}
3476
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3477
{
3478
    j_rev_dct2 (block);
3479
    add_pixels_clamped2_c(block, dest, line_size);
3480
}
3481

    
3482
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3483
{
3484
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
3485

    
3486
    dest[0] = cm[(block[0] + 4)>>3];
3487
}
3488
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3489
{
3490
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
3491

    
3492
    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3493
}
3494

    
3495
/* init static data */
3496
void dsputil_static_init(void)
3497
{
3498
    int i;
3499

    
3500
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3501
    for(i=0;i<MAX_NEG_CROP;i++) {
3502
        cropTbl[i] = 0;
3503
        cropTbl[i + MAX_NEG_CROP + 256] = 255;
3504
    }
3505
    
3506
    for(i=0;i<512;i++) {
3507
        squareTbl[i] = (i - 256) * (i - 256);
3508
    }
3509
    
3510
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3511
}
3512

    
3513

    
3514
/**
 * Fill a DSPContext with the portable C reference implementations, then
 * let the platform-specific initializers override individual entries,
 * and finally build the IDCT coefficient permutation table.
 */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

#ifdef CONFIG_ENCODERS
    /* forward DCT selection */
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    /* inverse DCT selection; lowres decoding uses reduced-size IDCTs */
    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            c->idct_put= ff_h264_lowres_idct_put_c;
            c->idct_add= ff_h264_lowres_idct_add_c;
        }
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= simple_idct_put;
            c->idct_add= simple_idct_add;
            c->idct    = simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }

    c->h264_idct_add= ff_h264_idct_add_c;

    /* VP3 DSP support */
    c->vp3_dsp_init = vp3_dsp_init_c;
    c->vp3_idct = vp3_idct_c;

    /* pixel block helpers */
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->gmc1 = gmc1_c;
    c->gmc = gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

    /* half-pel put/avg tables: [0] none, [1] x half, [2] y half, [3] xy half */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    /* third-pel motion compensation (SVQ3); indices 3, 7 and >10 unused */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

    /* quarter-pel tables: all 16 sub-pixel positions */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    /* H.264 chroma MC: [0] 8x, [1] 4x, [2] 2x */
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;

    /* H.264 weighted prediction, indexed by block size */
    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;

    /* WMV2/mspel motion compensation */
    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

    /* comparison functions: [0] 16x16 variant, [1] 8x8 variant */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
    c->w53[0]= w53_16_c;
    c->w53[1]= w53_8_c;
    c->w97[0]= w97_16_c;
    c->w97[1]= w97_8_c;

    /* lossless / HuffYUV helpers */
    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->bswap_buf= bswap_buf;

    c->h263_h_loop_filter= h263_h_loop_filter_c;
    c->h263_v_loop_filter= h263_v_loop_filter_c;

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

    /* platform-specific overrides (each may replace individual entries) */
#ifdef HAVE_MMX
    dsputil_init_mmx(c, avctx);
#endif
#ifdef ARCH_ARMV4L
    dsputil_init_armv4l(c, avctx);
#endif
#ifdef HAVE_MLIB
    dsputil_init_mlib(c, avctx);
#endif
#ifdef ARCH_SPARC
    dsputil_init_vis(c,avctx);
#endif
#ifdef ARCH_ALPHA
    dsputil_init_alpha(c, avctx);
#endif
#ifdef ARCH_POWERPC
    dsputil_init_ppc(c, avctx);
#endif
#ifdef HAVE_MMI
    dsputil_init_mmi(c, avctx);
#endif
#ifdef ARCH_SH4
    dsputil_init_sh4(c,avctx);
#endif

    /* build the coefficient permutation matching the selected IDCT */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}
3799