Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 186447f8

History | View | Annotate | Download (114 KB)

1 de6d9b64 Fabrice Bellard
/*
2
 * DSP utils
3 ff4ec49e Fabrice Bellard
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 de6d9b64 Fabrice Bellard
 *
5 ff4ec49e Fabrice Bellard
 * This library is free software; you can redistribute it and/or
6
 * modify it under the terms of the GNU Lesser General Public
7
 * License as published by the Free Software Foundation; either
8
 * version 2 of the License, or (at your option) any later version.
9 de6d9b64 Fabrice Bellard
 *
10 ff4ec49e Fabrice Bellard
 * This library is distributed in the hope that it will be useful,
11 de6d9b64 Fabrice Bellard
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ff4ec49e Fabrice Bellard
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
 * Lesser General Public License for more details.
14 de6d9b64 Fabrice Bellard
 *
15 ff4ec49e Fabrice Bellard
 * You should have received a copy of the GNU Lesser General Public
16
 * License along with this library; if not, write to the Free Software
17
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18 7ff037e9 Michael Niedermayer
 *
19 59fe111e Michael Niedermayer
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
20 de6d9b64 Fabrice Bellard
 */
21 983e3246 Michael Niedermayer
 
22
/**
23
 * @file dsputil.c
24
 * DSP utils
25
 */
26
 
27 de6d9b64 Fabrice Bellard
#include "avcodec.h"
28
#include "dsputil.h"
29 1457ab52 Michael Niedermayer
#include "mpegvideo.h"
30 b0368839 Michael Niedermayer
#include "simple_idct.h"
31 45553457 Zdenek Kabelac
32 5596c60c Michael Niedermayer
33 0c1a9eda Zdenek Kabelac
/* Clipping lookup table: 256 valid entries padded by MAX_NEG_CROP on each
   side so indices that fall below 0 or above 255 still land inside the
   array.  Presumably filled by the DSP init code — not visible in this
   chunk; TODO confirm. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* Square lookup table; used biased as (squareTbl + 256) so it can be
   indexed by differences in -256..255.  Filled at init — not visible here. */
uint32_t squareTbl[512];
35 de6d9b64 Fabrice Bellard
36 0c1a9eda Zdenek Kabelac
/* Classic zigzag coefficient scan order (JPEG/MPEG style):
   entry i gives the raster-order index of the i-th scanned coefficient. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
46
47 2f349de2 Michael Niedermayer
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* Filled at init time elsewhere in the file — TODO confirm; declared
   8-byte aligned for the MMX code that loads it. */
uint16_t __align8 inv_zigzag_direct16[64];
49 2f349de2 Michael Niedermayer
50 0c1a9eda Zdenek Kabelac
/* Alternate horizontal coefficient scan order; entry i gives the
   raster-order index of the i-th scanned coefficient.
   NOTE(review): semantics inferred from the table name — verify against
   the codecs that select this scan. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17, 
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33, 
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49, 
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59, 
    52, 53, 54, 55, 60, 61, 62, 63,
};
60
61 0c1a9eda Zdenek Kabelac
/* Alternate vertical coefficient scan order; entry i gives the
   raster-order index of the i-th scanned coefficient.
   NOTE(review): semantics inferred from the table name — verify against
   the codecs that select this scan. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10, 
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12, 
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14, 
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31, 
    38, 46, 54, 62, 39, 47, 55, 63,
};
71
72 2f349de2 Michael Niedermayer
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* Fixed-point reciprocal table: inverse[b] is approximately 2^32/b,
   used to replace integer division by a multiply and shift.
   inverse[0] is unused filler; inverse[1] is 2^32-1. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757, 
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154, 
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709, 
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333, 
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367, 
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283, 
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315, 
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085, 
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498, 
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675, 
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441, 
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183, 
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712, 
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400, 
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163, 
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641, 
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573, 
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737, 
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493, 
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373, 
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368, 
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671, 
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767, 
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740, 
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751, 
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635, 
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593, 
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944, 
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933, 
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575, 
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532, 
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
107
108 b0368839 Michael Niedermayer
/* Input permutation for the simple_idct_mmx */
/* Entry i gives the coefficient index (0x00..0x3F) that must be stored at
   scan position i so the MMX IDCT reads its input in the order it expects. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, 
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, 
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, 
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F, 
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, 
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D, 
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, 
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
119
120 0c1a9eda Zdenek Kabelac
/**
 * Sum of all 256 pixel values of a 16x16 block.
 *
 * @param pix       top-left pixel of the block
 * @param line_size byte stride between successive rows
 * @return the pixel sum (at most 16*16*255, well within int range)
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;   /* advance to the next row */
    }
    return total;
}
141
142 0c1a9eda Zdenek Kabelac
static int pix_norm1_c(uint8_t * pix, int line_size)
143 3aa102be Michael Niedermayer
{
144
    int s, i, j;
145 0c1a9eda Zdenek Kabelac
    uint32_t *sq = squareTbl + 256;
146 3aa102be Michael Niedermayer
147
    s = 0;
148
    for (i = 0; i < 16; i++) {
149
        for (j = 0; j < 16; j += 8) {
150 2a006cd3 Felix von Leitner
#if 0
151 3aa102be Michael Niedermayer
            s += sq[pix[0]];
152
            s += sq[pix[1]];
153
            s += sq[pix[2]];
154
            s += sq[pix[3]];
155
            s += sq[pix[4]];
156
            s += sq[pix[5]];
157
            s += sq[pix[6]];
158
            s += sq[pix[7]];
159 2a006cd3 Felix von Leitner
#else
160
#if LONG_MAX > 2147483647
161
            register uint64_t x=*(uint64_t*)pix;
162
            s += sq[x&0xff];
163
            s += sq[(x>>8)&0xff];
164
            s += sq[(x>>16)&0xff];
165
            s += sq[(x>>24)&0xff];
166
            s += sq[(x>>32)&0xff];
167
            s += sq[(x>>40)&0xff];
168
            s += sq[(x>>48)&0xff];
169
            s += sq[(x>>56)&0xff];
170
#else
171
            register uint32_t x=*(uint32_t*)pix;
172
            s += sq[x&0xff];
173
            s += sq[(x>>8)&0xff];
174
            s += sq[(x>>16)&0xff];
175
            s += sq[(x>>24)&0xff];
176
            x=*(uint32_t*)(pix+4);
177
            s += sq[x&0xff];
178
            s += sq[(x>>8)&0xff];
179
            s += sq[(x>>16)&0xff];
180
            s += sq[(x>>24)&0xff];
181
#endif
182
#endif
183 3aa102be Michael Niedermayer
            pix += 8;
184
        }
185
        pix += line_size - 16;
186
    }
187
    return s;
188
}
189
190 3d2e8cce Michael Niedermayer
/**
 * Byte-swap a buffer of 32-bit words.
 * Handles eight words per iteration in the main loop, then finishes the
 * remaining 0..7 words one at a time.  dst and src may be the same buffer.
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int n = 0;

    while (n + 8 <= w) {
        dst[n    ] = bswap_32(src[n    ]);
        dst[n + 1] = bswap_32(src[n + 1]);
        dst[n + 2] = bswap_32(src[n + 2]);
        dst[n + 3] = bswap_32(src[n + 3]);
        dst[n + 4] = bswap_32(src[n + 4]);
        dst[n + 5] = bswap_32(src[n + 5]);
        dst[n + 6] = bswap_32(src[n + 6]);
        dst[n + 7] = bswap_32(src[n + 7]);
        n += 8;
    }
    while (n < w) {
        dst[n] = bswap_32(src[n]);
        n++;
    }
}
207 3aa102be Michael Niedermayer
208 0c1a9eda Zdenek Kabelac
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size)
209 1457ab52 Michael Niedermayer
{
210
    int s, i;
211 0c1a9eda Zdenek Kabelac
    uint32_t *sq = squareTbl + 256;
212 1457ab52 Michael Niedermayer
213
    s = 0;
214
    for (i = 0; i < 8; i++) {
215
        s += sq[pix1[0] - pix2[0]];
216
        s += sq[pix1[1] - pix2[1]];
217
        s += sq[pix1[2] - pix2[2]];
218
        s += sq[pix1[3] - pix2[3]];
219
        s += sq[pix1[4] - pix2[4]];
220
        s += sq[pix1[5] - pix2[5]];
221
        s += sq[pix1[6] - pix2[6]];
222
        s += sq[pix1[7] - pix2[7]];
223
        pix1 += line_size;
224
        pix2 += line_size;
225
    }
226
    return s;
227
}
228
229 6b026927 Falk Hüffner
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
230 9c76bd48 Brian Foley
{
231 6b026927 Falk Hüffner
    int s, i;
232
    uint32_t *sq = squareTbl + 256;
233 9c76bd48 Brian Foley
234
    s = 0;
235
    for (i = 0; i < 16; i++) {
236 6b026927 Falk Hüffner
        s += sq[pix1[ 0] - pix2[ 0]];
237
        s += sq[pix1[ 1] - pix2[ 1]];
238
        s += sq[pix1[ 2] - pix2[ 2]];
239
        s += sq[pix1[ 3] - pix2[ 3]];
240
        s += sq[pix1[ 4] - pix2[ 4]];
241
        s += sq[pix1[ 5] - pix2[ 5]];
242
        s += sq[pix1[ 6] - pix2[ 6]];
243
        s += sq[pix1[ 7] - pix2[ 7]];
244
        s += sq[pix1[ 8] - pix2[ 8]];
245
        s += sq[pix1[ 9] - pix2[ 9]];
246
        s += sq[pix1[10] - pix2[10]];
247
        s += sq[pix1[11] - pix2[11]];
248
        s += sq[pix1[12] - pix2[12]];
249
        s += sq[pix1[13] - pix2[13]];
250
        s += sq[pix1[14] - pix2[14]];
251
        s += sq[pix1[15] - pix2[15]];
252 2a006cd3 Felix von Leitner
253 6b026927 Falk Hüffner
        pix1 += line_size;
254
        pix2 += line_size;
255 9c76bd48 Brian Foley
    }
256
    return s;
257
}
258
259 0c1a9eda Zdenek Kabelac
/**
 * Copy an 8x8 block of pixels into a DCT coefficient block,
 * widening each byte to a DCTELEM.
 *
 * @param block     destination, 64 contiguous DCTELEMs (8 per row)
 * @param pixels    source top-left pixel
 * @param line_size byte stride of the source rows
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block  += 8;   /* destination rows are packed */
    }
}
277
278 0c1a9eda Zdenek Kabelac
/**
 * Store the element-wise difference of two 8x8 pixel blocks
 * into a DCT coefficient block.
 *
 * @param block  destination, 64 contiguous DCTELEMs (8 per row)
 * @param s1     minuend block
 * @param s2     subtrahend block
 * @param stride byte stride of both source blocks
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1    += stride;
        s2    += stride;
        block += 8;   /* destination rows are packed */
    }
}
297
298
299 0c1a9eda Zdenek Kabelac
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
300 eb4b3dd3 Zdenek Kabelac
                                 int line_size)
301 de6d9b64 Fabrice Bellard
{
302
    int i;
303 0c1a9eda Zdenek Kabelac
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
304 de6d9b64 Fabrice Bellard
    
305
    /* read the pixels */
306
    for(i=0;i<8;i++) {
307 c13e1abd Falk Hüffner
        pixels[0] = cm[block[0]];
308
        pixels[1] = cm[block[1]];
309
        pixels[2] = cm[block[2]];
310
        pixels[3] = cm[block[3]];
311
        pixels[4] = cm[block[4]];
312
        pixels[5] = cm[block[5]];
313
        pixels[6] = cm[block[6]];
314
        pixels[7] = cm[block[7]];
315
316
        pixels += line_size;
317
        block += 8;
318 de6d9b64 Fabrice Bellard
    }
319
}
320
321 0c1a9eda Zdenek Kabelac
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
322 c13e1abd Falk Hüffner
                          int line_size)
323 de6d9b64 Fabrice Bellard
{
324
    int i;
325 0c1a9eda Zdenek Kabelac
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
326 de6d9b64 Fabrice Bellard
    
327
    /* read the pixels */
328
    for(i=0;i<8;i++) {
329 c13e1abd Falk Hüffner
        pixels[0] = cm[pixels[0] + block[0]];
330
        pixels[1] = cm[pixels[1] + block[1]];
331
        pixels[2] = cm[pixels[2] + block[2]];
332
        pixels[3] = cm[pixels[3] + block[3]];
333
        pixels[4] = cm[pixels[4] + block[4]];
334
        pixels[5] = cm[pixels[5] + block[5]];
335
        pixels[6] = cm[pixels[6] + block[6]];
336
        pixels[7] = cm[pixels[7] + block[7]];
337
        pixels += line_size;
338
        block += 8;
339 de6d9b64 Fabrice Bellard
    }
340
}
341 59fe111e Michael Niedermayer
#if 0
342

343
#define PIXOP2(OPNAME, OP) \
344 b3184779 Michael Niedermayer
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
345 59fe111e Michael Niedermayer
{\
346
    int i;\
347
    for(i=0; i<h; i++){\
348
        OP(*((uint64_t*)block), LD64(pixels));\
349
        pixels+=line_size;\
350
        block +=line_size;\
351
    }\
352
}\
353
\
354 45553457 Zdenek Kabelac
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
355 59fe111e Michael Niedermayer
{\
356
    int i;\
357
    for(i=0; i<h; i++){\
358
        const uint64_t a= LD64(pixels  );\
359
        const uint64_t b= LD64(pixels+1);\
360
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
361
        pixels+=line_size;\
362
        block +=line_size;\
363
    }\
364
}\
365
\
366 45553457 Zdenek Kabelac
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
367 59fe111e Michael Niedermayer
{\
368
    int i;\
369
    for(i=0; i<h; i++){\
370
        const uint64_t a= LD64(pixels  );\
371
        const uint64_t b= LD64(pixels+1);\
372
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
373
        pixels+=line_size;\
374
        block +=line_size;\
375
    }\
376
}\
377
\
378 45553457 Zdenek Kabelac
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
379 59fe111e Michael Niedermayer
{\
380
    int i;\
381
    for(i=0; i<h; i++){\
382
        const uint64_t a= LD64(pixels          );\
383
        const uint64_t b= LD64(pixels+line_size);\
384
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
385
        pixels+=line_size;\
386
        block +=line_size;\
387
    }\
388
}\
389
\
390 45553457 Zdenek Kabelac
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
391 59fe111e Michael Niedermayer
{\
392
    int i;\
393
    for(i=0; i<h; i++){\
394
        const uint64_t a= LD64(pixels          );\
395
        const uint64_t b= LD64(pixels+line_size);\
396
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
397
        pixels+=line_size;\
398
        block +=line_size;\
399
    }\
400
}\
401
\
402 45553457 Zdenek Kabelac
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
403 59fe111e Michael Niedermayer
{\
404
        int i;\
405
        const uint64_t a= LD64(pixels  );\
406
        const uint64_t b= LD64(pixels+1);\
407
        uint64_t l0=  (a&0x0303030303030303ULL)\
408
                    + (b&0x0303030303030303ULL)\
409
                    + 0x0202020202020202ULL;\
410
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
411
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
412
        uint64_t l1,h1;\
413
\
414
        pixels+=line_size;\
415
        for(i=0; i<h; i+=2){\
416
            uint64_t a= LD64(pixels  );\
417
            uint64_t b= LD64(pixels+1);\
418
            l1=  (a&0x0303030303030303ULL)\
419
               + (b&0x0303030303030303ULL);\
420
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
421
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
422
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
423
            pixels+=line_size;\
424
            block +=line_size;\
425
            a= LD64(pixels  );\
426
            b= LD64(pixels+1);\
427
            l0=  (a&0x0303030303030303ULL)\
428
               + (b&0x0303030303030303ULL)\
429
               + 0x0202020202020202ULL;\
430
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
431
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
432
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
433
            pixels+=line_size;\
434
            block +=line_size;\
435
        }\
436
}\
437
\
438 45553457 Zdenek Kabelac
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
439 59fe111e Michael Niedermayer
{\
440
        int i;\
441
        const uint64_t a= LD64(pixels  );\
442
        const uint64_t b= LD64(pixels+1);\
443
        uint64_t l0=  (a&0x0303030303030303ULL)\
444
                    + (b&0x0303030303030303ULL)\
445
                    + 0x0101010101010101ULL;\
446
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
447
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
448
        uint64_t l1,h1;\
449
\
450
        pixels+=line_size;\
451
        for(i=0; i<h; i+=2){\
452
            uint64_t a= LD64(pixels  );\
453
            uint64_t b= LD64(pixels+1);\
454
            l1=  (a&0x0303030303030303ULL)\
455
               + (b&0x0303030303030303ULL);\
456
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
457
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
458
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
459
            pixels+=line_size;\
460
            block +=line_size;\
461
            a= LD64(pixels  );\
462
            b= LD64(pixels+1);\
463
            l0=  (a&0x0303030303030303ULL)\
464
               + (b&0x0303030303030303ULL)\
465
               + 0x0101010101010101ULL;\
466
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
467
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
468
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
469
            pixels+=line_size;\
470
            block +=line_size;\
471
        }\
472
}\
473
\
474 45553457 Zdenek Kabelac
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
475
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
476
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
477
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
478
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
479
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
480
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
481 59fe111e Michael Niedermayer

482
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
483
#else // 64 bit variant
484
485
#define PIXOP2(OPNAME, OP) \
486 669ac79c Michael Niedermayer
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
487
    int i;\
488
    for(i=0; i<h; i++){\
489
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
490
        pixels+=line_size;\
491
        block +=line_size;\
492
    }\
493
}\
494 0da71265 Michael Niedermayer
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
495
    int i;\
496
    for(i=0; i<h; i++){\
497
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
498
        pixels+=line_size;\
499
        block +=line_size;\
500
    }\
501
}\
502 45553457 Zdenek Kabelac
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
503 59fe111e Michael Niedermayer
    int i;\
504
    for(i=0; i<h; i++){\
505
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
506
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
507
        pixels+=line_size;\
508
        block +=line_size;\
509
    }\
510
}\
511 45553457 Zdenek Kabelac
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
512
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
513 b3184779 Michael Niedermayer
}\
514 59fe111e Michael Niedermayer
\
515 b3184779 Michael Niedermayer
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
516
                                                int src_stride1, int src_stride2, int h){\
517 59fe111e Michael Niedermayer
    int i;\
518
    for(i=0; i<h; i++){\
519 b3184779 Michael Niedermayer
        uint32_t a,b;\
520
        a= LD32(&src1[i*src_stride1  ]);\
521
        b= LD32(&src2[i*src_stride2  ]);\
522 d8085ea7 Michael Niedermayer
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
523 b3184779 Michael Niedermayer
        a= LD32(&src1[i*src_stride1+4]);\
524
        b= LD32(&src2[i*src_stride2+4]);\
525 d8085ea7 Michael Niedermayer
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
526 59fe111e Michael Niedermayer
    }\
527
}\
528
\
529 b3184779 Michael Niedermayer
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
530
                                                int src_stride1, int src_stride2, int h){\
531 59fe111e Michael Niedermayer
    int i;\
532
    for(i=0; i<h; i++){\
533 b3184779 Michael Niedermayer
        uint32_t a,b;\
534
        a= LD32(&src1[i*src_stride1  ]);\
535
        b= LD32(&src2[i*src_stride2  ]);\
536 d8085ea7 Michael Niedermayer
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
537 b3184779 Michael Niedermayer
        a= LD32(&src1[i*src_stride1+4]);\
538
        b= LD32(&src2[i*src_stride2+4]);\
539 d8085ea7 Michael Niedermayer
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
540 59fe111e Michael Niedermayer
    }\
541
}\
542
\
543 0da71265 Michael Niedermayer
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
544
                                                int src_stride1, int src_stride2, int h){\
545
    int i;\
546
    for(i=0; i<h; i++){\
547
        uint32_t a,b;\
548
        a= LD32(&src1[i*src_stride1  ]);\
549
        b= LD32(&src2[i*src_stride2  ]);\
550 d8085ea7 Michael Niedermayer
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
551 0da71265 Michael Niedermayer
    }\
552
}\
553
\
554 669ac79c Michael Niedermayer
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
555
                                                int src_stride1, int src_stride2, int h){\
556
    int i;\
557
    for(i=0; i<h; i++){\
558
        uint32_t a,b;\
559
        a= LD16(&src1[i*src_stride1  ]);\
560
        b= LD16(&src2[i*src_stride2  ]);\
561
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
562
    }\
563
}\
564
\
565 b3184779 Michael Niedermayer
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
566
                                                int src_stride1, int src_stride2, int h){\
567
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
568
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
569
}\
570
\
571
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
572
                                                int src_stride1, int src_stride2, int h){\
573
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
574
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
575
}\
576
\
577 45553457 Zdenek Kabelac
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
578 b3184779 Michael Niedermayer
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
579
}\
580
\
581 45553457 Zdenek Kabelac
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
582 b3184779 Michael Niedermayer
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
583
}\
584
\
585 45553457 Zdenek Kabelac
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
586 b3184779 Michael Niedermayer
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
587
}\
588
\
589 45553457 Zdenek Kabelac
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
590 b3184779 Michael Niedermayer
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
591
}\
592
\
593
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
594
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
595 59fe111e Michael Niedermayer
    int i;\
596
    for(i=0; i<h; i++){\
597 b3184779 Michael Niedermayer
        uint32_t a, b, c, d, l0, l1, h0, h1;\
598
        a= LD32(&src1[i*src_stride1]);\
599
        b= LD32(&src2[i*src_stride2]);\
600
        c= LD32(&src3[i*src_stride3]);\
601
        d= LD32(&src4[i*src_stride4]);\
602
        l0=  (a&0x03030303UL)\
603
           + (b&0x03030303UL)\
604
           + 0x02020202UL;\
605
        h0= ((a&0xFCFCFCFCUL)>>2)\
606
          + ((b&0xFCFCFCFCUL)>>2);\
607
        l1=  (c&0x03030303UL)\
608
           + (d&0x03030303UL);\
609
        h1= ((c&0xFCFCFCFCUL)>>2)\
610
          + ((d&0xFCFCFCFCUL)>>2);\
611
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
612
        a= LD32(&src1[i*src_stride1+4]);\
613
        b= LD32(&src2[i*src_stride2+4]);\
614
        c= LD32(&src3[i*src_stride3+4]);\
615
        d= LD32(&src4[i*src_stride4+4]);\
616
        l0=  (a&0x03030303UL)\
617
           + (b&0x03030303UL)\
618
           + 0x02020202UL;\
619
        h0= ((a&0xFCFCFCFCUL)>>2)\
620
          + ((b&0xFCFCFCFCUL)>>2);\
621
        l1=  (c&0x03030303UL)\
622
           + (d&0x03030303UL);\
623
        h1= ((c&0xFCFCFCFCUL)>>2)\
624
          + ((d&0xFCFCFCFCUL)>>2);\
625
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
626 59fe111e Michael Niedermayer
    }\
627
}\
628 669ac79c Michael Niedermayer
\
629
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
630
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
631
}\
632
\
633
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
634
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
635
}\
636
\
637
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
638
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
639
}\
640
\
641
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
642
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
643
}\
644
\
645 b3184779 Michael Niedermayer
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
646
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
647 59fe111e Michael Niedermayer
    int i;\
648
    for(i=0; i<h; i++){\
649 b3184779 Michael Niedermayer
        uint32_t a, b, c, d, l0, l1, h0, h1;\
650
        a= LD32(&src1[i*src_stride1]);\
651
        b= LD32(&src2[i*src_stride2]);\
652
        c= LD32(&src3[i*src_stride3]);\
653
        d= LD32(&src4[i*src_stride4]);\
654
        l0=  (a&0x03030303UL)\
655
           + (b&0x03030303UL)\
656
           + 0x01010101UL;\
657
        h0= ((a&0xFCFCFCFCUL)>>2)\
658
          + ((b&0xFCFCFCFCUL)>>2);\
659
        l1=  (c&0x03030303UL)\
660
           + (d&0x03030303UL);\
661
        h1= ((c&0xFCFCFCFCUL)>>2)\
662
          + ((d&0xFCFCFCFCUL)>>2);\
663
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
664
        a= LD32(&src1[i*src_stride1+4]);\
665
        b= LD32(&src2[i*src_stride2+4]);\
666
        c= LD32(&src3[i*src_stride3+4]);\
667
        d= LD32(&src4[i*src_stride4+4]);\
668
        l0=  (a&0x03030303UL)\
669
           + (b&0x03030303UL)\
670
           + 0x01010101UL;\
671
        h0= ((a&0xFCFCFCFCUL)>>2)\
672
          + ((b&0xFCFCFCFCUL)>>2);\
673
        l1=  (c&0x03030303UL)\
674
           + (d&0x03030303UL);\
675
        h1= ((c&0xFCFCFCFCUL)>>2)\
676
          + ((d&0xFCFCFCFCUL)>>2);\
677
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
678 59fe111e Michael Niedermayer
    }\
679
}\
680 b3184779 Michael Niedermayer
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
681
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
682
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
683
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
684
}\
685
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
686
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
687
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
688
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
689
}\
690 59fe111e Michael Niedermayer
\
691 669ac79c Michael Niedermayer
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
692
{\
693
        int i, a0, b0, a1, b1;\
694
        a0= pixels[0];\
695
        b0= pixels[1] + 2;\
696
        a0 += b0;\
697
        b0 += pixels[2];\
698
\
699
        pixels+=line_size;\
700
        for(i=0; i<h; i+=2){\
701
            a1= pixels[0];\
702
            b1= pixels[1];\
703
            a1 += b1;\
704
            b1 += pixels[2];\
705
\
706
            block[0]= (a1+a0)>>2; /* FIXME non put */\
707
            block[1]= (b1+b0)>>2;\
708
\
709
            pixels+=line_size;\
710
            block +=line_size;\
711
\
712
            a0= pixels[0];\
713
            b0= pixels[1] + 2;\
714
            a0 += b0;\
715
            b0 += pixels[2];\
716
\
717
            block[0]= (a1+a0)>>2;\
718
            block[1]= (b1+b0)>>2;\
719
            pixels+=line_size;\
720
            block +=line_size;\
721
        }\
722
}\
723
\
724
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
725
{\
726
        int i;\
727
        const uint32_t a= LD32(pixels  );\
728
        const uint32_t b= LD32(pixels+1);\
729
        uint32_t l0=  (a&0x03030303UL)\
730
                    + (b&0x03030303UL)\
731
                    + 0x02020202UL;\
732
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
733
                   + ((b&0xFCFCFCFCUL)>>2);\
734
        uint32_t l1,h1;\
735
\
736
        pixels+=line_size;\
737
        for(i=0; i<h; i+=2){\
738
            uint32_t a= LD32(pixels  );\
739
            uint32_t b= LD32(pixels+1);\
740
            l1=  (a&0x03030303UL)\
741
               + (b&0x03030303UL);\
742
            h1= ((a&0xFCFCFCFCUL)>>2)\
743
              + ((b&0xFCFCFCFCUL)>>2);\
744
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
745
            pixels+=line_size;\
746
            block +=line_size;\
747
            a= LD32(pixels  );\
748
            b= LD32(pixels+1);\
749
            l0=  (a&0x03030303UL)\
750
               + (b&0x03030303UL)\
751
               + 0x02020202UL;\
752
            h0= ((a&0xFCFCFCFCUL)>>2)\
753
              + ((b&0xFCFCFCFCUL)>>2);\
754
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
755
            pixels+=line_size;\
756
            block +=line_size;\
757
        }\
758
}\
759
\
760 45553457 Zdenek Kabelac
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
761 59fe111e Michael Niedermayer
{\
762
    int j;\
763
    for(j=0; j<2; j++){\
764
        int i;\
765
        const uint32_t a= LD32(pixels  );\
766
        const uint32_t b= LD32(pixels+1);\
767
        uint32_t l0=  (a&0x03030303UL)\
768
                    + (b&0x03030303UL)\
769
                    + 0x02020202UL;\
770
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
771
                   + ((b&0xFCFCFCFCUL)>>2);\
772
        uint32_t l1,h1;\
773
\
774
        pixels+=line_size;\
775
        for(i=0; i<h; i+=2){\
776
            uint32_t a= LD32(pixels  );\
777
            uint32_t b= LD32(pixels+1);\
778
            l1=  (a&0x03030303UL)\
779
               + (b&0x03030303UL);\
780
            h1= ((a&0xFCFCFCFCUL)>>2)\
781
              + ((b&0xFCFCFCFCUL)>>2);\
782
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
783
            pixels+=line_size;\
784
            block +=line_size;\
785
            a= LD32(pixels  );\
786
            b= LD32(pixels+1);\
787
            l0=  (a&0x03030303UL)\
788
               + (b&0x03030303UL)\
789
               + 0x02020202UL;\
790
            h0= ((a&0xFCFCFCFCUL)>>2)\
791
              + ((b&0xFCFCFCFCUL)>>2);\
792
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
793
            pixels+=line_size;\
794
            block +=line_size;\
795
        }\
796
        pixels+=4-line_size*(h+1);\
797
        block +=4-line_size*h;\
798
    }\
799
}\
800
\
801 45553457 Zdenek Kabelac
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
802 59fe111e Michael Niedermayer
{\
803
    int j;\
804
    for(j=0; j<2; j++){\
805
        int i;\
806
        const uint32_t a= LD32(pixels  );\
807
        const uint32_t b= LD32(pixels+1);\
808
        uint32_t l0=  (a&0x03030303UL)\
809
                    + (b&0x03030303UL)\
810
                    + 0x01010101UL;\
811
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
812
                   + ((b&0xFCFCFCFCUL)>>2);\
813
        uint32_t l1,h1;\
814
\
815
        pixels+=line_size;\
816
        for(i=0; i<h; i+=2){\
817
            uint32_t a= LD32(pixels  );\
818
            uint32_t b= LD32(pixels+1);\
819
            l1=  (a&0x03030303UL)\
820
               + (b&0x03030303UL);\
821
            h1= ((a&0xFCFCFCFCUL)>>2)\
822
              + ((b&0xFCFCFCFCUL)>>2);\
823
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
824
            pixels+=line_size;\
825
            block +=line_size;\
826
            a= LD32(pixels  );\
827
            b= LD32(pixels+1);\
828
            l0=  (a&0x03030303UL)\
829
               + (b&0x03030303UL)\
830
               + 0x01010101UL;\
831
            h0= ((a&0xFCFCFCFCUL)>>2)\
832
              + ((b&0xFCFCFCFCUL)>>2);\
833
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
834
            pixels+=line_size;\
835
            block +=line_size;\
836
        }\
837
        pixels+=4-line_size*(h+1);\
838
        block +=4-line_size*h;\
839
    }\
840
}\
841
\
842 45553457 Zdenek Kabelac
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
843
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
844
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
845
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
846
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
847
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
848
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
849
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
850 b3184779 Michael Niedermayer
851 d8085ea7 Michael Niedermayer
#define op_avg(a, b) a = rnd_avg32(a, b)
852 59fe111e Michael Niedermayer
#endif
853
#define op_put(a, b) a = b
854
855
PIXOP2(avg, op_avg)
856
PIXOP2(put, op_put)
857
#undef op_avg
858
#undef op_put
859
860 de6d9b64 Fabrice Bellard
#define avg2(a,b) ((a+b+1)>>1)
861
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
862
863 073b013d Michael Niedermayer
864 0c1a9eda Zdenek Kabelac
/*
 * One-vector global motion compensation of an 8-pixel-wide block.
 * (x16, y16) is the fractional position in 1/16-pel units (0..16);
 * the four bilinear weights sum to 256, so the result is normalised
 * by the >>8 after adding `rounder`.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[stride + j] + D * src[stride + j + 1] + rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
886
887 0c1a9eda Zdenek Kabelac
/*
 * Global motion compensation of an 8-pixel-wide block with a linearly
 * varying motion field: the vector advances by (dxx, dyx) per output
 * column and (dxy, dyy) per output row.  Coordinates carry `shift`
 * fractional bits after the >>16; `r` is the rounding constant for the
 * final >>(2*shift).  Source accesses are clamped to [0, width] x
 * [0, height] (exclusive upper bound before the decrement below).
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, 
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s = 1 << shift;
    int y;

    /* convert to the last valid source coordinate in each direction */
    width--;
    height--;

    for (y = 0; y < h; y++) {
        int x;
        int vx = ox;
        int vy = oy;

        for (x = 0; x < 8; x++) { // XXX FIXME optimize
            int src_x = vx >> 16;
            int src_y = vy >> 16;
            /* fractional part is taken before the coordinate shift */
            const int frac_x = src_x & (s - 1);
            const int frac_y = src_y & (s - 1);
            int index;

            src_x >>= shift;
            src_y >>= shift;

            if ((unsigned)src_x < width) {
                if ((unsigned)src_y < height) {
                    /* fully inside: bilinear interpolation of 4 samples */
                    index = src_x + src_y * stride;
                    dst[y * stride + x] = ((  src[index             ] * (s - frac_x)
                                            + src[index + 1         ] *      frac_x) * (s - frac_y)
                                          + (  src[index + stride    ] * (s - frac_x)
                                            + src[index + stride + 1] *      frac_x) *      frac_y
                                          + r) >> (shift * 2);
                } else {
                    /* y out of range: clamp it, interpolate horizontally only */
                    int cy = src_y;
                    if (cy < 0) cy = 0; else if (cy > height) cy = height;
                    index = src_x + cy * stride;
                    dst[y * stride + x] = ((  src[index    ] * (s - frac_x)
                                            + src[index + 1] *      frac_x) * s
                                          + r) >> (shift * 2);
                }
            } else {
                if ((unsigned)src_y < height) {
                    /* x out of range: clamp it, interpolate vertically only */
                    int cx = src_x;
                    if (cx < 0) cx = 0; else if (cx > width) cx = width;
                    index = cx + src_y * stride;
                    dst[y * stride + x] = ((  src[index         ] * (s - frac_y)
                                            + src[index + stride] *      frac_y) * s
                                          + r) >> (shift * 2);
                } else {
                    /* both out of range: nearest clamped sample, no filtering */
                    int cx = src_x;
                    int cy = src_y;
                    if (cx < 0) cx = 0; else if (cx > width)  cx = width;
                    if (cy < 0) cy = 0; else if (cy > height) cy = height;
                    index = cx + cy * stride;
                    dst[y * stride + x] = src[index];
                }
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
944 669ac79c Michael Niedermayer
945
/* Full-pel case (no fractional offset): dispatch to the plain block copy
 * for the supported widths; other widths are silently ignored, as before. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if      (width ==  2) put_pixels2_c (dst, src, stride, height);
    else if (width ==  4) put_pixels4_c (dst, src, stride, height);
    else if (width ==  8) put_pixels8_c (dst, src, stride, height);
    else if (width == 16) put_pixels16_c(dst, src, stride, height);
}
953
954
/*
 * Thirdpel interpolation, "put" variants.  The scale factors approximate
 * thirds in fixed point: 683 ~= 2048/3 (>>11) and 2731 ~= 32768/12 (>>15),
 * so each routine blends the neighbouring samples at a 1/3 or 2/3 position
 * (mcXY: X = horizontal third, Y = vertical third) over a width x height
 * block and stores the rounded result.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (2 * src[x] + src[x + 1] + 1)) >> 11;
        dst += stride;
        src += stride;
    }
}

static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (src[x] + 2 * src[x + 1] + 1)) >> 11;
        dst += stride;
        src += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (2 * src[x] + src[x + stride] + 1)) >> 11;
        dst += stride;
        src += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (4 * src[x] + 3 * src[x + 1] + 3 * src[x + stride] + 2 * src[x + stride + 1] + 6)) >> 15;
        dst += stride;
        src += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (3 * src[x] + 2 * src[x + 1] + 4 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15;
        dst += stride;
        src += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (src[x] + 2 * src[x + stride] + 1)) >> 11;
        dst += stride;
        src += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (3 * src[x] + 4 * src[x + 1] + 2 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15;
        dst += stride;
        src += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (2 * src[x] + 3 * src[x + 1] + 3 * src[x + stride] + 4 * src[x + stride + 1] + 6)) >> 15;
        dst += stride;
        src += stride;
    }
}
1041 da3b9756 Mike Melanson
1042
/* Full-pel case (no fractional offset): dispatch to the plain averaging
 * copy for the supported widths; other widths are silently ignored. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if      (width ==  2) avg_pixels2_c (dst, src, stride, height);
    else if (width ==  4) avg_pixels4_c (dst, src, stride, height);
    else if (width ==  8) avg_pixels8_c (dst, src, stride, height);
    else if (width == 16) avg_pixels16_c(dst, src, stride, height);
}
1050
1051
/*
 * Thirdpel interpolation, "avg" variants: same fixed-point blends as the
 * put_tpel_* routines (683 ~= 2048/3, 2731 ~= 32768/12), but the
 * interpolated value is averaged with the existing destination sample
 * with round-to-nearest ((dst + pred + 1) >> 1).
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (2 * src[x] + src[x + 1] + 1)) >> 11) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (src[x] + 2 * src[x + 1] + 1)) >> 11) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (2 * src[x] + src[x + stride] + 1)) >> 11) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (4 * src[x] + 3 * src[x + 1] + 3 * src[x + stride] + 2 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (3 * src[x] + 2 * src[x + 1] + 4 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (src[x] + 2 * src[x + stride] + 1)) >> 11) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (3 * src[x] + 4 * src[x + 1] + 2 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (2 * src[x] + 3 * src[x + 1] + 3 * src[x + stride] + 4 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
1138 669ac79c Michael Niedermayer
/* Disabled draft of fixed-width thirdpel wrappers around the generic
 * *_tpel_pixels_mcXX_c routines above.  NOTE(review): as written the
 * bodies are declarations ("void put_tpel_pixels_mc00_c(...);"), not
 * calls, so the `void` would have to be dropped before enabling this. */
#if 0
1139
#define TPEL_WIDTH(width)\
1140
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1141
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1142
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1143
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1144
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1145
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1146
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1147
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1148
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1149
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1150
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1151
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1152
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1153
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1154
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1155
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1156
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1157
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1158
#endif
1159
1160 0da71265 Michael Niedermayer
/*
 * H.264 chroma motion compensation for block widths 2, 4 and 8.
 * The four bilinear weights come from the fractional position (x, y) in
 * eighths (asserted to be 0..7) and sum to 64; OP performs the final
 * rounding and store (put) or rounding, store and destination average (avg).
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i, j;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++){\
        for(j=0; j<2; j++)\
            OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i, j;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++){\
        for(j=0; j<4; j++)\
            OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i, j;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++){\
        for(j=0; j<8; j++)\
            OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
        dst+= stride;\
        src+= stride;\
    }\
}

/* rounding by +32 then >>6 normalises the weight sum of 64 */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1230
1231
/* Copy h rows of 4 bytes each, via the project's 32-bit load/store macros. */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for (row = 0; row < h; row++) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
1241
1242
/* Copy h rows of 8 bytes each, via the project's 32-bit load/store macros. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for (row = 0; row < h; row++) {
        ST32(dst,     LD32(src));
        ST32(dst + 4, LD32(src + 4));
        dst += dstStride;
        src += srcStride;
    }
}
1253
1254
/* Copy h rows of 16 bytes each, via the project's 32-bit load/store macros. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for (row = 0; row < h; row++) {
        ST32(dst,      LD32(src));
        ST32(dst + 4,  LD32(src + 4));
        ST32(dst + 8,  LD32(src + 8));
        ST32(dst + 12, LD32(src + 12));
        dst += dstStride;
        src += srcStride;
    }
}
1267 073b013d Michael Niedermayer
1268 0c1a9eda Zdenek Kabelac
/* Copy h rows of 17 bytes each (16 via the 32-bit macros plus one tail
 * byte) — the extra column is needed by the qpel filters below. */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for (row = 0; row < h; row++) {
        ST32(dst,      LD32(src));
        ST32(dst + 4,  LD32(src + 4));
        ST32(dst + 8,  LD32(src + 8));
        ST32(dst + 12, LD32(src + 12));
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}
1282
1283 0c1a9eda Zdenek Kabelac
/* Copy h rows of 9 bytes each (8 via the 32-bit macros plus one tail
 * byte) — the extra column is needed by the qpel filters below. */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for (row = 0; row < h; row++) {
        ST32(dst,     LD32(src));
        ST32(dst + 4, LD32(src + 4));
        dst[8] = src[8];
        dst += dstStride;
        src += srcStride;
    }
}
1295
1296 826f429a Michael Niedermayer
1297 b3184779 Michael Niedermayer
#define QPEL_MC(r, OPNAME, RND, OP) \
1298 0c1a9eda Zdenek Kabelac
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1299
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1300 b3184779 Michael Niedermayer
    int i;\
1301
    for(i=0; i<h; i++)\
1302
    {\
1303
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1304
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1305
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1306
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1307
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1308
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1309
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1310
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1311
        dst+=dstStride;\
1312
        src+=srcStride;\
1313
    }\
1314 44eb4951 Michael Niedermayer
}\
1315
\
1316 0c1a9eda Zdenek Kabelac
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1317 db794953 Michael Niedermayer
    const int w=8;\
1318 0c1a9eda Zdenek Kabelac
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1319 b3184779 Michael Niedermayer
    int i;\
1320
    for(i=0; i<w; i++)\
1321
    {\
1322
        const int src0= src[0*srcStride];\
1323
        const int src1= src[1*srcStride];\
1324
        const int src2= src[2*srcStride];\
1325
        const int src3= src[3*srcStride];\
1326
        const int src4= src[4*srcStride];\
1327
        const int src5= src[5*srcStride];\
1328
        const int src6= src[6*srcStride];\
1329
        const int src7= src[7*srcStride];\
1330
        const int src8= src[8*srcStride];\
1331
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1332
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1333
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1334
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1335
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1336
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1337
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1338
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1339
        dst++;\
1340
        src++;\
1341
    }\
1342
}\
1343
\
1344 0c1a9eda Zdenek Kabelac
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1345
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1346 b3184779 Michael Niedermayer
    int i;\
1347 826f429a Michael Niedermayer
    \
1348 b3184779 Michael Niedermayer
    for(i=0; i<h; i++)\
1349
    {\
1350
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1351
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1352
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1353
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1354
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1355
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1356
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1357
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1358
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1359
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1360
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1361
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1362
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1363
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1364
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1365
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1366
        dst+=dstStride;\
1367
        src+=srcStride;\
1368
    }\
1369
}\
1370
\
1371 0c1a9eda Zdenek Kabelac
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1372
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1373 b3184779 Michael Niedermayer
    int i;\
1374 826f429a Michael Niedermayer
    const int w=16;\
1375 b3184779 Michael Niedermayer
    for(i=0; i<w; i++)\
1376
    {\
1377
        const int src0= src[0*srcStride];\
1378
        const int src1= src[1*srcStride];\
1379
        const int src2= src[2*srcStride];\
1380
        const int src3= src[3*srcStride];\
1381
        const int src4= src[4*srcStride];\
1382
        const int src5= src[5*srcStride];\
1383
        const int src6= src[6*srcStride];\
1384
        const int src7= src[7*srcStride];\
1385
        const int src8= src[8*srcStride];\
1386
        const int src9= src[9*srcStride];\
1387
        const int src10= src[10*srcStride];\
1388
        const int src11= src[11*srcStride];\
1389
        const int src12= src[12*srcStride];\
1390
        const int src13= src[13*srcStride];\
1391
        const int src14= src[14*srcStride];\
1392
        const int src15= src[15*srcStride];\
1393
        const int src16= src[16*srcStride];\
1394
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1395
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1396
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1397
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1398
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1399
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1400
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1401
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1402
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1403
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1404
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1405
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1406
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1407
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1408
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1409
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1410
        dst++;\
1411
        src++;\
1412
    }\
1413
}\
1414
\
1415 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1416 45553457 Zdenek Kabelac
    OPNAME ## pixels8_c(dst, src, stride, 8);\
1417 b3184779 Michael Niedermayer
}\
1418
\
1419 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1420
    uint8_t half[64];\
1421 b3184779 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1422
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1423 44eb4951 Michael Niedermayer
}\
1424
\
1425 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1426 b3184779 Michael Niedermayer
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1427 44eb4951 Michael Niedermayer
}\
1428
\
1429 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1430
    uint8_t half[64];\
1431 b3184779 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1432
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1433 44eb4951 Michael Niedermayer
}\
1434
\
1435 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1436
    uint8_t full[16*9];\
1437
    uint8_t half[64];\
1438 b3184779 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1439 db794953 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1440 b3184779 Michael Niedermayer
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1441 44eb4951 Michael Niedermayer
}\
1442
\
1443 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1444
    uint8_t full[16*9];\
1445 b3184779 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1446 db794953 Michael Niedermayer
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1447 44eb4951 Michael Niedermayer
}\
1448
\
1449 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1450
    uint8_t full[16*9];\
1451
    uint8_t half[64];\
1452 b3184779 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1453 db794953 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1454 b3184779 Michael Niedermayer
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1455 44eb4951 Michael Niedermayer
}\
1456 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1457
    uint8_t full[16*9];\
1458
    uint8_t halfH[72];\
1459
    uint8_t halfV[64];\
1460
    uint8_t halfHV[64];\
1461 b3184779 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1462
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1463 db794953 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1464
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1465 b3184779 Michael Niedermayer
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1466 44eb4951 Michael Niedermayer
}\
1467 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1468
    uint8_t full[16*9];\
1469
    uint8_t halfH[72];\
1470
    uint8_t halfHV[64];\
1471 db794953 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1472
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1473
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1474
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1475
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1476
}\
1477 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1478
    uint8_t full[16*9];\
1479
    uint8_t halfH[72];\
1480
    uint8_t halfV[64];\
1481
    uint8_t halfHV[64];\
1482 b3184779 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1483
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1484 db794953 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1485
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1486 b3184779 Michael Niedermayer
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1487 44eb4951 Michael Niedermayer
}\
1488 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1489
    uint8_t full[16*9];\
1490
    uint8_t halfH[72];\
1491
    uint8_t halfHV[64];\
1492 db794953 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1493
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1494
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1495
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1496
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1497
}\
1498 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1499
    uint8_t full[16*9];\
1500
    uint8_t halfH[72];\
1501
    uint8_t halfV[64];\
1502
    uint8_t halfHV[64];\
1503 b3184779 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1504
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1505 db794953 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1506
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1507 b3184779 Michael Niedermayer
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1508 44eb4951 Michael Niedermayer
}\
1509 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1510
    uint8_t full[16*9];\
1511
    uint8_t halfH[72];\
1512
    uint8_t halfHV[64];\
1513 db794953 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1514
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1515
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1516
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1517
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1518
}\
1519 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1520
    uint8_t full[16*9];\
1521
    uint8_t halfH[72];\
1522
    uint8_t halfV[64];\
1523
    uint8_t halfHV[64];\
1524 b3184779 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1525
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1526 db794953 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1527
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1528 b3184779 Michael Niedermayer
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1529 44eb4951 Michael Niedermayer
}\
1530 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1531
    uint8_t full[16*9];\
1532
    uint8_t halfH[72];\
1533
    uint8_t halfHV[64];\
1534 db794953 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1535
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1536
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1537
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1538
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1539
}\
1540 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1541
    uint8_t halfH[72];\
1542
    uint8_t halfHV[64];\
1543 b3184779 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1544 db794953 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1545 b3184779 Michael Niedermayer
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1546 44eb4951 Michael Niedermayer
}\
1547 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1548
    uint8_t halfH[72];\
1549
    uint8_t halfHV[64];\
1550 b3184779 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1551 db794953 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1552 b3184779 Michael Niedermayer
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1553 44eb4951 Michael Niedermayer
}\
1554 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1555
    uint8_t full[16*9];\
1556
    uint8_t halfH[72];\
1557
    uint8_t halfV[64];\
1558
    uint8_t halfHV[64];\
1559 b3184779 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1560
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1561 db794953 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1562
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1563 b3184779 Michael Niedermayer
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1564 44eb4951 Michael Niedermayer
}\
1565 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1566
    uint8_t full[16*9];\
1567
    uint8_t halfH[72];\
1568 db794953 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1569
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1570
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1571
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1572
}\
1573 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1574
    uint8_t full[16*9];\
1575
    uint8_t halfH[72];\
1576
    uint8_t halfV[64];\
1577
    uint8_t halfHV[64];\
1578 b3184779 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1579
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1580 db794953 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1581
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1582 b3184779 Michael Niedermayer
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1583 44eb4951 Michael Niedermayer
}\
1584 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1585
    uint8_t full[16*9];\
1586
    uint8_t halfH[72];\
1587 db794953 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1588
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1589
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1590
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1591
}\
1592 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1593
    uint8_t halfH[72];\
1594 b3184779 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1595 db794953 Michael Niedermayer
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1596 b3184779 Michael Niedermayer
}\
1597 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1598 45553457 Zdenek Kabelac
    OPNAME ## pixels16_c(dst, src, stride, 16);\
1599 b3184779 Michael Niedermayer
}\
1600
\
1601 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1602
    uint8_t half[256];\
1603 b3184779 Michael Niedermayer
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1604
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1605
}\
1606
\
1607 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1608 b3184779 Michael Niedermayer
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1609 44eb4951 Michael Niedermayer
}\
1610 b3184779 Michael Niedermayer
\
1611 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1612
    uint8_t half[256];\
1613 b3184779 Michael Niedermayer
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1614
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1615
}\
1616
\
1617 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1618
    uint8_t full[24*17];\
1619
    uint8_t half[256];\
1620 b3184779 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1621 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1622 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1623
}\
1624
\
1625 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1626
    uint8_t full[24*17];\
1627 b3184779 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1628 826f429a Michael Niedermayer
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1629 b3184779 Michael Niedermayer
}\
1630
\
1631 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1632
    uint8_t full[24*17];\
1633
    uint8_t half[256];\
1634 b3184779 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1635 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1636 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1637
}\
1638 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1639
    uint8_t full[24*17];\
1640
    uint8_t halfH[272];\
1641
    uint8_t halfV[256];\
1642
    uint8_t halfHV[256];\
1643 b3184779 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1644
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1645 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1646
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1647 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1648
}\
1649 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1650
    uint8_t full[24*17];\
1651
    uint8_t halfH[272];\
1652
    uint8_t halfHV[256];\
1653 db794953 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1654
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1655
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1656
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1657
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1658
}\
1659 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1660
    uint8_t full[24*17];\
1661
    uint8_t halfH[272];\
1662
    uint8_t halfV[256];\
1663
    uint8_t halfHV[256];\
1664 b3184779 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1665
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1666 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1667
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1668 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1669
}\
1670 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1671
    uint8_t full[24*17];\
1672
    uint8_t halfH[272];\
1673
    uint8_t halfHV[256];\
1674 db794953 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1675
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1676
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1677
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1678
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1679
}\
1680 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1681
    uint8_t full[24*17];\
1682
    uint8_t halfH[272];\
1683
    uint8_t halfV[256];\
1684
    uint8_t halfHV[256];\
1685 b3184779 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1686
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1687 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1688
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1689 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1690
}\
1691 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1692
    uint8_t full[24*17];\
1693
    uint8_t halfH[272];\
1694
    uint8_t halfHV[256];\
1695 db794953 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1696
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1697
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1698
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1699
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1700
}\
1701 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1702
    uint8_t full[24*17];\
1703
    uint8_t halfH[272];\
1704
    uint8_t halfV[256];\
1705
    uint8_t halfHV[256];\
1706 b3184779 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1707
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1708 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1709
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1710 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1711
}\
1712 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1713
    uint8_t full[24*17];\
1714
    uint8_t halfH[272];\
1715
    uint8_t halfHV[256];\
1716 db794953 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1717
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1718
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1719
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1720
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1721
}\
1722 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1723
    uint8_t halfH[272];\
1724
    uint8_t halfHV[256];\
1725 b3184779 Michael Niedermayer
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1726 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1727 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1728
}\
1729 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1730
    uint8_t halfH[272];\
1731
    uint8_t halfHV[256];\
1732 b3184779 Michael Niedermayer
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1733 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1734 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1735
}\
1736 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1737
    uint8_t full[24*17];\
1738
    uint8_t halfH[272];\
1739
    uint8_t halfV[256];\
1740
    uint8_t halfHV[256];\
1741 b3184779 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1742
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1743 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1744
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1745 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1746
}\
1747 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1748
    uint8_t full[24*17];\
1749
    uint8_t halfH[272];\
1750 db794953 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1751
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1752
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1753
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1754
}\
1755 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1756
    uint8_t full[24*17];\
1757
    uint8_t halfH[272];\
1758
    uint8_t halfV[256];\
1759
    uint8_t halfHV[256];\
1760 b3184779 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1761
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1762 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1763
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1764 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1765
}\
1766 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1767
    uint8_t full[24*17];\
1768
    uint8_t halfH[272];\
1769 db794953 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1770
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1771
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1772
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1773
}\
1774 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1775
    uint8_t halfH[272];\
1776 b3184779 Michael Niedermayer
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1777 826f429a Michael Niedermayer
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1778 45553457 Zdenek Kabelac
}
1779 44eb4951 Michael Niedermayer
1780 b3184779 Michael Niedermayer
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1781
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1782
#define op_put(a, b) a = cm[((b) + 16)>>5]
1783
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1784
1785
QPEL_MC(0, put_       , _       , op_put)
1786
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1787
QPEL_MC(0, avg_       , _       , op_avg)
1788
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
1789
#undef op_avg
1790
#undef op_avg_no_rnd
1791
#undef op_put
1792
#undef op_put_no_rnd
1793 44eb4951 Michael Niedermayer
1794 0da71265 Michael Niedermayer
#if 1
1795
#define H264_LOWPASS(OPNAME, OP, OP2) \
1796
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1797
    const int h=4;\
1798
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1799
    int i;\
1800
    for(i=0; i<h; i++)\
1801
    {\
1802
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
1803
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
1804
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
1805
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
1806
        dst+=dstStride;\
1807
        src+=srcStride;\
1808
    }\
1809
}\
1810
\
1811
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1812
    const int w=4;\
1813
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1814
    int i;\
1815
    for(i=0; i<w; i++)\
1816
    {\
1817
        const int srcB= src[-2*srcStride];\
1818
        const int srcA= src[-1*srcStride];\
1819
        const int src0= src[0 *srcStride];\
1820
        const int src1= src[1 *srcStride];\
1821
        const int src2= src[2 *srcStride];\
1822
        const int src3= src[3 *srcStride];\
1823
        const int src4= src[4 *srcStride];\
1824
        const int src5= src[5 *srcStride];\
1825
        const int src6= src[6 *srcStride];\
1826
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1827
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1828
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1829
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1830
        dst++;\
1831
        src++;\
1832
    }\
1833
}\
1834
\
1835
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1836
    const int h=4;\
1837
    const int w=4;\
1838
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1839
    int i;\
1840
    src -= 2*srcStride;\
1841
    for(i=0; i<h+5; i++)\
1842
    {\
1843
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
1844
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
1845
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
1846
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
1847
        tmp+=tmpStride;\
1848
        src+=srcStride;\
1849
    }\
1850
    tmp -= tmpStride*(h+5-2);\
1851
    for(i=0; i<w; i++)\
1852
    {\
1853
        const int tmpB= tmp[-2*tmpStride];\
1854
        const int tmpA= tmp[-1*tmpStride];\
1855
        const int tmp0= tmp[0 *tmpStride];\
1856
        const int tmp1= tmp[1 *tmpStride];\
1857
        const int tmp2= tmp[2 *tmpStride];\
1858
        const int tmp3= tmp[3 *tmpStride];\
1859
        const int tmp4= tmp[4 *tmpStride];\
1860
        const int tmp5= tmp[5 *tmpStride];\
1861
        const int tmp6= tmp[6 *tmpStride];\
1862
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1863
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1864
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1865
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
1866
        dst++;\
1867
        tmp++;\
1868
    }\
1869
}\
1870
\
1871
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1872
    const int h=8;\
1873
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1874
    int i;\
1875
    for(i=0; i<h; i++)\
1876
    {\
1877
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
1878
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
1879
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
1880
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
1881
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
1882
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
1883
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
1884
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
1885
        dst+=dstStride;\
1886
        src+=srcStride;\
1887
    }\
1888
}\
1889
\
1890
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1891
    const int w=8;\
1892
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1893
    int i;\
1894
    for(i=0; i<w; i++)\
1895
    {\
1896
        const int srcB= src[-2*srcStride];\
1897
        const int srcA= src[-1*srcStride];\
1898
        const int src0= src[0 *srcStride];\
1899
        const int src1= src[1 *srcStride];\
1900
        const int src2= src[2 *srcStride];\
1901
        const int src3= src[3 *srcStride];\
1902
        const int src4= src[4 *srcStride];\
1903
        const int src5= src[5 *srcStride];\
1904
        const int src6= src[6 *srcStride];\
1905
        const int src7= src[7 *srcStride];\
1906
        const int src8= src[8 *srcStride];\
1907
        const int src9= src[9 *srcStride];\
1908
        const int src10=src[10*srcStride];\
1909
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1910
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1911
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1912
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1913
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
1914
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
1915
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
1916
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
1917
        dst++;\
1918
        src++;\
1919
    }\
1920
}\
1921
\
1922
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1923
    const int h=8;\
1924
    const int w=8;\
1925
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1926
    int i;\
1927
    src -= 2*srcStride;\
1928
    for(i=0; i<h+5; i++)\
1929
    {\
1930
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
1931
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
1932
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
1933
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
1934
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
1935
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
1936
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
1937
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
1938
        tmp+=tmpStride;\
1939
        src+=srcStride;\
1940
    }\
1941
    tmp -= tmpStride*(h+5-2);\
1942
    for(i=0; i<w; i++)\
1943
    {\
1944
        const int tmpB= tmp[-2*tmpStride];\
1945
        const int tmpA= tmp[-1*tmpStride];\
1946
        const int tmp0= tmp[0 *tmpStride];\
1947
        const int tmp1= tmp[1 *tmpStride];\
1948
        const int tmp2= tmp[2 *tmpStride];\
1949
        const int tmp3= tmp[3 *tmpStride];\
1950
        const int tmp4= tmp[4 *tmpStride];\
1951
        const int tmp5= tmp[5 *tmpStride];\
1952
        const int tmp6= tmp[6 *tmpStride];\
1953
        const int tmp7= tmp[7 *tmpStride];\
1954
        const int tmp8= tmp[8 *tmpStride];\
1955
        const int tmp9= tmp[9 *tmpStride];\
1956
        const int tmp10=tmp[10*tmpStride];\
1957
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1958
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1959
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1960
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
1961
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
1962
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
1963
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
1964
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
1965
        dst++;\
1966
        tmp++;\
1967
    }\
1968
}\
1969
\
1970
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1971
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
1972
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
1973
    src += 8*srcStride;\
1974
    dst += 8*dstStride;\
1975
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
1976
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
1977
}\
1978
\
1979
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1980
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
1981
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
1982
    src += 8*srcStride;\
1983
    dst += 8*dstStride;\
1984
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
1985
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
1986
}\
1987
\
1988
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1989
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
1990
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
1991
    src += 8*srcStride;\
1992
    tmp += 8*tmpStride;\
1993
    dst += 8*dstStride;\
1994
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
1995
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
1996
}\
1997
1998
/**
 * Generates the 16 H.264 quarter-pel motion compensation functions
 * (_mcXY_c, where X/Y are the quarter-pel offsets 0..3) for one block
 * size. OPNAME selects the store semantics (put_ or avg_); the bodies
 * rely on the copy_block*, put_h264_qpel*_{h,v,hv}_lowpass and
 * OPNAME##pixels*_{c,l2} helpers generated/defined elsewhere in this file.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

2135
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2136
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2137
#define op_put(a, b)  a = cm[((b) + 16)>>5]
2138
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2139
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
2140
2141
H264_LOWPASS(put_       , op_put, op2_put)
2142
H264_LOWPASS(avg_       , op_avg, op2_avg)
2143
H264_MC(put_, 4)
2144
H264_MC(put_, 8)
2145
H264_MC(put_, 16)
2146
H264_MC(avg_, 4)
2147
H264_MC(avg_, 8)
2148
H264_MC(avg_, 16)
2149
2150
#undef op_avg
2151
#undef op_put
2152
#undef op2_avg
2153
#undef op2_put
2154
#endif
2155
2156 1457ab52 Michael Niedermayer
/**
 * WMV2 horizontal half-pel lowpass filter: 4-tap (-1, 9, 9, -1)/16 with
 * rounding, clipped to [0,255] via the crop table. Reads src[-1..8] per
 * row, writes 8 pixels per row for h rows.
 */
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = cropTbl + MAX_NEG_CROP; /* clip table, biased so negative indices are valid */
    int i;

    for(i=0; i<h; i++){
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }
}
/**
 * WMV2 vertical half-pel lowpass filter: same 4-tap (-1, 9, 9, -1)/16
 * kernel as the horizontal version, applied down each of w columns.
 * Reads rows -1..9 of src, writes 8 rows of dst.
 */
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = cropTbl + MAX_NEG_CROP; /* clip table, biased for negative indices */
    int i;

    for(i=0; i<w; i++){
        const int src_1= src[ -srcStride];
        const int src0 = src[0          ];
        const int src1 = src[  srcStride];
        const int src2 = src[2*srcStride];
        const int src3 = src[3*srcStride];
        const int src4 = src[4*srcStride];
        const int src5 = src[5*srcStride];
        const int src6 = src[6*srcStride];
        const int src7 = src[7*srcStride];
        const int src8 = src[8*srcStride];
        const int src9 = src[9*srcStride];
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
        src++;
        dst++;
    }
}
/* mspel (0,0): integer-pel position, plain 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
/* mspel (1,0): quarter-pel left -- average of source and the
 * horizontally lowpass-filtered half-pel plane. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
/* mspel (2,0): horizontal half-pel -- lowpass filter straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
/* mspel (3,0): quarter-pel right -- average of src+1 and the
 * horizontal half-pel plane. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
/* mspel (0,2): vertical half-pel -- lowpass filter straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
/* mspel (1,2): average of the vertical half-pel plane and the
 * combined H+V half-pel plane. halfH holds 11 filtered rows (8+3 for
 * the vertical filter's -1..9 row reach, offset by +8 = one row). */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* mspel (3,2): like mc12 but the vertical half-pel plane is taken one
 * pixel to the right (src+1). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* mspel (2,2): full half-pel in both directions -- horizontal filter
 * into a temp plane, then vertical filter into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
/**
 * Sum of absolute differences (SAD) of two 16x16 pixel blocks.
 * Rows are unrolled; both blocks use the same line_size.
 */
static inline int pix_abs16x16_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int s, i;

    s = 0;
    for(i=0;i<16;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * SAD of a 16x16 block against a reference interpolated half a pixel
 * to the right (avg2 is the 2-tap averaging macro defined earlier in
 * this file). Reads pix2[0..16] per row.
 */
static int pix_abs16x16_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int s, i;

    s = 0;
    for(i=0;i<16;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * SAD of a 16x16 block against a reference interpolated half a pixel
 * down (pix3 is the next row of pix2; avg2 defined earlier in file).
 */
static int pix_abs16x16_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<16;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * SAD of a 16x16 block against a reference interpolated half a pixel
 * both right and down (avg4 is the 4-tap averaging macro defined
 * earlier in this file). Reads columns 0..16 of rows pix2 and pix3.
 */
static int pix_abs16x16_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<16;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * Sum of absolute differences (SAD) of two 8x8 pixel blocks.
 */
static inline int pix_abs8x8_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int s, i;

    s = 0;
    for(i=0;i<8;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * SAD of an 8x8 block against a reference interpolated half a pixel
 * to the right (avg2 defined earlier in file). Reads pix2[0..8] per row.
 */
static int pix_abs8x8_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int s, i;

    s = 0;
    for(i=0;i<8;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * SAD of an 8x8 block against a reference interpolated half a pixel
 * down (pix3 is the next row of pix2; avg2 defined earlier in file).
 */
static int pix_abs8x8_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<8;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * SAD of an 8x8 block against a reference interpolated half a pixel
 * both right and down (avg4 defined earlier in file).
 */
static int pix_abs8x8_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<8;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/* DSPContext-style adapter: adds the (unused) context argument expected
 * by the me_cmp function-pointer interface. */
static int sad16x16_c(void *s, uint8_t *a, uint8_t *b, int stride){
    return pix_abs16x16_c(a,b,stride);
}
/* DSPContext-style adapter for the 8x8 SAD (context argument unused). */
static int sad8x8_c(void *s, uint8_t *a, uint8_t *b, int stride){
    return pix_abs8x8_c(a,b,stride);
}
/**
2461
 * permutes an 8x8 block.
2462 2a5700de Michael Niedermayer
 * @param block the block which will be permuted according to the given permutation vector
2463 a9badb51 Michael Niedermayer
 * @param permutation the permutation vector
2464
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2465 2a5700de Michael Niedermayer
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not 
2466
 *                  (inverse) permutated to scantable order!
2467 a9badb51 Michael Niedermayer
 */
2468 0c1a9eda Zdenek Kabelac
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2469 d962f6fd Arpi
{
2470 7801d21d Michael Niedermayer
    int i;
2471 477ab036 Michael Niedermayer
    DCTELEM temp[64];
2472 7801d21d Michael Niedermayer
    
2473
    if(last<=0) return;
2474 9a7b310d Zdenek Kabelac
    //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2475 d962f6fd Arpi
2476 7801d21d Michael Niedermayer
    for(i=0; i<=last; i++){
2477
        const int j= scantable[i];
2478
        temp[j]= block[j];
2479
        block[j]=0;
2480
    }
2481
    
2482
    for(i=0; i<=last; i++){
2483
        const int j= scantable[i];
2484
        const int perm_j= permutation[j];
2485
        block[perm_j]= temp[j];
2486
    }
2487 d962f6fd Arpi
}
2488 e0eac44e Fabrice Bellard
2489 2a5700de Michael Niedermayer
/**
2490
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
2491
 */
2492 eb4b3dd3 Zdenek Kabelac
static void clear_blocks_c(DCTELEM *blocks)
2493 649c00c9 Michael Niedermayer
{
2494
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
2495
}
2496
2497 11f18faf Michael Niedermayer
/**
 * dst[i] += src[i] for w bytes (modular uint8_t addition).
 * Main loop is unrolled by 8; the tail loop handles w % 8 leftovers.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;
    for(i=0; i+7<w; i+=8){
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
/**
 * dst[i] = src1[i] - src2[i] for w bytes (modular uint8_t subtraction).
 * Main loop is unrolled by 8; the tail loop handles w % 8 leftovers.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
    for(i=0; i+7<w; i+=8){
        dst[i+0] = src1[i+0]-src2[i+0];
        dst[i+1] = src1[i+1]-src2[i+1];
        dst[i+2] = src1[i+2]-src2[i+2];
        dst[i+3] = src1[i+3]-src2[i+3];
        dst[i+4] = src1[i+4]-src2[i+4];
        dst[i+5] = src1[i+5]-src2[i+5];
        dst[i+6] = src1[i+6]-src2[i+6];
        dst[i+7] = src1[i+7]-src2[i+7];
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
/* Hadamard transform helpers:
 * BUTTERFLY2 writes sum/difference of two inputs to two outputs,
 * BUTTERFLY1 does the same in place,
 * BUTTERFLYA is the final |x+y| + |x-y| accumulation term
 * (ABS is the project macro defined elsewhere). */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
/**
 * SATD: 8x8 Hadamard transform of the src-dst difference, returning the
 * sum of absolute transform coefficients. Rows are transformed first
 * (BUTTERFLY2/1 stages), then columns, with BUTTERFLYA folding the last
 * column stage into the accumulation. Dead #if 0 debug block removed.
 */
static int hadamard8_diff_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride){
    int i;
    int temp[64];
    int sum=0;

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum += 
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}
/**
 * SATD of a single 8x8 block against a constant mean value: Hadamard
 * transform of (src - mean), returning the sum of absolute coefficients.
 * Same row-then-column butterfly structure as hadamard8_diff_c.
 */
static int hadamard8_abs_c(uint8_t *src, int stride, int mean){
    int i;
    int temp[64];
    int sum=0;
//FIXME OOOPS ignore 0 term instead of mean mess
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-mean,src[stride*i+1]-mean);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-mean,src[stride*i+3]-mean);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-mean,src[stride*i+5]-mean);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-mean,src[stride*i+7]-mean);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum += 
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    return sum;
}
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2639
    MpegEncContext * const s= (MpegEncContext *)c;
2640 76fbb024 Michael Niedermayer
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2641
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
2642 1457ab52 Michael Niedermayer
    int sum=0, i;
2643
2644
    s->dsp.diff_pixels(temp, src1, src2, stride);
2645 b0368839 Michael Niedermayer
    s->dsp.fdct(temp);
2646 1457ab52 Michael Niedermayer
2647
    for(i=0; i<64; i++)
2648
        sum+= ABS(temp[i]);
2649
        
2650
    return sum;
2651
}
2652
2653 0e15384d Michael Niedermayer
void simple_idct(DCTELEM *block); //FIXME should come from simple_idct.h
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2656
    MpegEncContext * const s= (MpegEncContext *)c;
2657 76fbb024 Michael Niedermayer
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
2658
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
2659
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
2660 1457ab52 Michael Niedermayer
    int sum=0, i;
2661
2662
    s->mb_intra=0;
2663
    
2664
    s->dsp.diff_pixels(temp, src1, src2, stride);
2665
    
2666
    memcpy(bak, temp, 64*sizeof(DCTELEM));
2667
    
2668 67725183 Michael Niedermayer
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2669 1457ab52 Michael Niedermayer
    s->dct_unquantize(s, temp, 0, s->qscale);
2670
    simple_idct(temp); //FIXME 
2671
    
2672
    for(i=0; i<64; i++)
2673
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2674
        
2675
    return sum;
2676
}
2677
2678 3a87ac94 Michael Niedermayer
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2679
    MpegEncContext * const s= (MpegEncContext *)c;
2680 0c1a9eda Zdenek Kabelac
    const uint8_t *scantable= s->intra_scantable.permutated;
2681 76fbb024 Michael Niedermayer
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2682
    uint64_t __align8 aligned_bak[stride];
2683
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
2684
    uint8_t * const bak= (uint8_t*)aligned_bak;
2685 3a87ac94 Michael Niedermayer
    int i, last, run, bits, level, distoration, start_i;
2686
    const int esc_length= s->ac_esc_length;
2687
    uint8_t * length;
2688
    uint8_t * last_length;
2689 67725183 Michael Niedermayer
    
2690
    for(i=0; i<8; i++){
2691
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
2692
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
2693
    }
2694 3a87ac94 Michael Niedermayer
2695 67725183 Michael Niedermayer
    s->dsp.diff_pixels(temp, src1, src2, stride);
2696
2697
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2698
2699
    bits=0;
2700 3a87ac94 Michael Niedermayer
    
2701
    if (s->mb_intra) {
2702 67725183 Michael Niedermayer
        start_i = 1; 
2703 3a87ac94 Michael Niedermayer
        length     = s->intra_ac_vlc_length;
2704
        last_length= s->intra_ac_vlc_last_length;
2705 67725183 Michael Niedermayer
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2706 3a87ac94 Michael Niedermayer
    } else {
2707
        start_i = 0;
2708
        length     = s->inter_ac_vlc_length;
2709
        last_length= s->inter_ac_vlc_last_length;
2710
    }
2711
    
2712 67725183 Michael Niedermayer
    if(last>=start_i){
2713 3a87ac94 Michael Niedermayer
        run=0;
2714
        for(i=start_i; i<last; i++){
2715
            int j= scantable[i];
2716
            level= temp[j];
2717
        
2718
            if(level){
2719
                level+=64;
2720
                if((level&(~127)) == 0){
2721
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
2722
                }else
2723
                    bits+= esc_length;
2724
                run=0;
2725
            }else
2726
                run++;
2727
        }
2728
        i= scantable[last];
2729 1d0eab1d Michael Niedermayer
       
2730 3a87ac94 Michael Niedermayer
        level= temp[i] + 64;
2731 1d0eab1d Michael Niedermayer
2732
        assert(level - 64);
2733
        
2734 3a87ac94 Michael Niedermayer
        if<