/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 */
#include "avcodec.h"
#include "dsputil.h"
#include "mpegvideo.h"

int ff_bit_exact=0;

uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
uint32_t squareTbl[512];

const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
uint16_t __align8 inv_zigzag_direct16[64];

const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
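/* Added usage note (an illustrative sketch, not part of the original file):
 * the table above lets an integer division be replaced by a multiply and
 * shift, following the invariant stated in the comment above, e.g.
 *     q = ((uint64_t)a * inverse[b]) >> 32;   // == a/b for 0<=a<=65536, 2<=b<=255
 */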
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

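/* Added note (explanatory, not in the original source): pix_norm1_c above and
 * the sum-of-squared-error functions below index squareTbl through
 * sq = squareTbl + 256, so a pixel difference in the range -255..255 can be
 * used directly as an index into a table of squares that is filled elsewhere
 * at init time (not shown in this excerpt). */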
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size)
{
    int s, i;
    uint32_t *sq = squareTbl + 256;

    s = 0;
    for (i = 0; i < 8; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int s, i;
    uint32_t *sq = squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

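/* Added note (explanatory, not in the original source): the helpers below
 * move 8x8 blocks between 8-bit pixels and DCTELEM coefficients.
 * get_pixels_c and diff_pixels_c feed the DCT (a plain copy, respectively the
 * difference of two sources), while put_pixels_clamped_c/add_pixels_clamped_c
 * write IDCT output back, using cm = cropTbl + MAX_NEG_CROP to clamp values
 * into the 0..255 pixel range. */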
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}


static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}
#if 0
307

308
#define PIXOP2(OPNAME, OP) \
309 b3184779 Michael Niedermayer
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
310 59fe111e Michael Niedermayer
{\
311
    int i;\
312
    for(i=0; i<h; i++){\
313
        OP(*((uint64_t*)block), LD64(pixels));\
314
        pixels+=line_size;\
315
        block +=line_size;\
316
    }\
317
}\
318
\
319 45553457 Zdenek Kabelac
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
320 59fe111e Michael Niedermayer
{\
321
    int i;\
322
    for(i=0; i<h; i++){\
323
        const uint64_t a= LD64(pixels  );\
324
        const uint64_t b= LD64(pixels+1);\
325
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
326
        pixels+=line_size;\
327
        block +=line_size;\
328
    }\
329
}\
330
\
331 45553457 Zdenek Kabelac
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
332 59fe111e Michael Niedermayer
{\
333
    int i;\
334
    for(i=0; i<h; i++){\
335
        const uint64_t a= LD64(pixels  );\
336
        const uint64_t b= LD64(pixels+1);\
337
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
338
        pixels+=line_size;\
339
        block +=line_size;\
340
    }\
341
}\
342
\
343 45553457 Zdenek Kabelac
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
344 59fe111e Michael Niedermayer
{\
345
    int i;\
346
    for(i=0; i<h; i++){\
347
        const uint64_t a= LD64(pixels          );\
348
        const uint64_t b= LD64(pixels+line_size);\
349
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
350
        pixels+=line_size;\
351
        block +=line_size;\
352
    }\
353
}\
354
\
355 45553457 Zdenek Kabelac
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
356 59fe111e Michael Niedermayer
{\
357
    int i;\
358
    for(i=0; i<h; i++){\
359
        const uint64_t a= LD64(pixels          );\
360
        const uint64_t b= LD64(pixels+line_size);\
361
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
362
        pixels+=line_size;\
363
        block +=line_size;\
364
    }\
365
}\
366
\
367 45553457 Zdenek Kabelac
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
368 59fe111e Michael Niedermayer
{\
369
        int i;\
370
        const uint64_t a= LD64(pixels  );\
371
        const uint64_t b= LD64(pixels+1);\
372
        uint64_t l0=  (a&0x0303030303030303ULL)\
373
                    + (b&0x0303030303030303ULL)\
374
                    + 0x0202020202020202ULL;\
375
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
376
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
377
        uint64_t l1,h1;\
378
\
379
        pixels+=line_size;\
380
        for(i=0; i<h; i+=2){\
381
            uint64_t a= LD64(pixels  );\
382
            uint64_t b= LD64(pixels+1);\
383
            l1=  (a&0x0303030303030303ULL)\
384
               + (b&0x0303030303030303ULL);\
385
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
386
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
387
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
388
            pixels+=line_size;\
389
            block +=line_size;\
390
            a= LD64(pixels  );\
391
            b= LD64(pixels+1);\
392
            l0=  (a&0x0303030303030303ULL)\
393
               + (b&0x0303030303030303ULL)\
394
               + 0x0202020202020202ULL;\
395
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
396
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
397
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
398
            pixels+=line_size;\
399
            block +=line_size;\
400
        }\
401
}\
402
\
403 45553457 Zdenek Kabelac
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
404 59fe111e Michael Niedermayer
{\
405
        int i;\
406
        const uint64_t a= LD64(pixels  );\
407
        const uint64_t b= LD64(pixels+1);\
408
        uint64_t l0=  (a&0x0303030303030303ULL)\
409
                    + (b&0x0303030303030303ULL)\
410
                    + 0x0101010101010101ULL;\
411
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
412
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
413
        uint64_t l1,h1;\
414
\
415
        pixels+=line_size;\
416
        for(i=0; i<h; i+=2){\
417
            uint64_t a= LD64(pixels  );\
418
            uint64_t b= LD64(pixels+1);\
419
            l1=  (a&0x0303030303030303ULL)\
420
               + (b&0x0303030303030303ULL);\
421
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
422
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
423
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
424
            pixels+=line_size;\
425
            block +=line_size;\
426
            a= LD64(pixels  );\
427
            b= LD64(pixels+1);\
428
            l0=  (a&0x0303030303030303ULL)\
429
               + (b&0x0303030303030303ULL)\
430
               + 0x0101010101010101ULL;\
431
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
432
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
433
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
434
            pixels+=line_size;\
435
            block +=line_size;\
436
        }\
437
}\
438
\
439 45553457 Zdenek Kabelac
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
440
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
441
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
442
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
443
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
444
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
445
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
446 59fe111e Michael Niedermayer

447
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
448
#else // 64 bit variant
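/* Added note (explanatory, not in the original source): the x2/y2 kernels
 * average two byte-packed words per step using
 *     (a|b) - (((a^b)&0xFEFEFEFE)>>1) == (a+b+1)>>1   (rounding)
 *     (a&b) + (((a^b)&0xFEFEFEFE)>>1) == (a+b)>>1     (no_rnd)
 * applied to every byte lane at once; the 0xFE mask keeps shifted bits from
 * leaking into the neighbouring byte.  The xy2 kernels split each byte into
 * its two low bits (l0/l1) and six high bits (h0/h1) so that four pixels can
 * be summed per lane without overflow before the final >>2. */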

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
#endif
#define op_put(a, b) a = b
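/* Added note (explanatory, not in the original source): op_avg stores the
 * per-byte rounded average of the existing destination and the new value,
 * op_put simply overwrites; the PIXOP2 instantiations below expand the whole
 * family of avg/put pixel functions from these two operators. */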

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)


static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}

static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}

static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst[16]= src[16];
        dst+=dstStride;
        src+=srcStride;
    }
}

static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst[8]= src[8];
        dst+=dstStride;
        src+=srcStride;
    }
}

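/* Added note (explanatory, not in the original source): the lowpass kernels
 * generated by QPEL_MC apply the 8-tap MPEG-4 quarter-pel filter
 * (-1, 3, -6, 20, 20, -6, 3, -1), mirroring samples at the block edges;
 * OPNAME/RND/OP select the put or avg store and the rounding behaviour. */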
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1173
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1174
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1175
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1176
}\
1177 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1178
    uint8_t full[24*17];\
1179
    uint8_t halfH[272];\
1180
    uint8_t halfV[256];\
1181
    uint8_t halfHV[256];\
1182 b3184779 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1183
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1184 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1185
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1186 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1187
}\
1188 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1189
    uint8_t full[24*17];\
1190
    uint8_t halfH[272];\
1191
    uint8_t halfHV[256];\
1192 db794953 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1193
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1194
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1195
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1196
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1197
}\
1198 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1199
    uint8_t full[24*17];\
1200
    uint8_t halfH[272];\
1201
    uint8_t halfV[256];\
1202
    uint8_t halfHV[256];\
1203 b3184779 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1204
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1205 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1206
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1207 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1208
}\
1209 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1210
    uint8_t full[24*17];\
1211
    uint8_t halfH[272];\
1212
    uint8_t halfHV[256];\
1213 db794953 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1214
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1215
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1216
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1217
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1218
}\
1219 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1220
    uint8_t full[24*17];\
1221
    uint8_t halfH[272];\
1222
    uint8_t halfV[256];\
1223
    uint8_t halfHV[256];\
1224 b3184779 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1225
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1226 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1227
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1228 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1229
}\
1230 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1231
    uint8_t full[24*17];\
1232
    uint8_t halfH[272];\
1233
    uint8_t halfHV[256];\
1234 db794953 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1235
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1236
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1237
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1238
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1239
}\
1240 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1241
    uint8_t halfH[272];\
1242
    uint8_t halfHV[256];\
1243 b3184779 Michael Niedermayer
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1244 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1245 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1246
}\
1247 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1248
    uint8_t halfH[272];\
1249
    uint8_t halfHV[256];\
1250 b3184779 Michael Niedermayer
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1251 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1252 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1253
}\
1254 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1255
    uint8_t full[24*17];\
1256
    uint8_t halfH[272];\
1257
    uint8_t halfV[256];\
1258
    uint8_t halfHV[256];\
1259 b3184779 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1260
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1261 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1262
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1263 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1264
}\
1265 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1266
    uint8_t full[24*17];\
1267
    uint8_t halfH[272];\
1268 db794953 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1269
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1270
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1271
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1272
}\
1273 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1274
    uint8_t full[24*17];\
1275
    uint8_t halfH[272];\
1276
    uint8_t halfV[256];\
1277
    uint8_t halfHV[256];\
1278 b3184779 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1279
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1280 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1281
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1282 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1283
}\
1284 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1285
    uint8_t full[24*17];\
1286
    uint8_t halfH[272];\
1287 db794953 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1288
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1289
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1290
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1291
}\
1292 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1293
    uint8_t halfH[272];\
1294 b3184779 Michael Niedermayer
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1295 826f429a Michael Niedermayer
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1296 45553457 Zdenek Kabelac
}
1297 44eb4951 Michael Niedermayer
1298 b3184779 Michael Niedermayer
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1299
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1300
#define op_put(a, b) a = cm[((b) + 16)>>5]
1301
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1302
1303
QPEL_MC(0, put_       , _       , op_put)
1304
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1305
QPEL_MC(0, avg_       , _       , op_avg)
1306
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
1307
#undef op_avg
1308
#undef op_avg_no_rnd
1309
#undef op_put
1310
#undef op_put_no_rnd
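/*
 * Editorial note on the rounding macros above: the mpeg4_qpel*_lowpass helpers
 * used by QPEL_MC (defined earlier in this file) leave their output scaled by
 * 32 (the filter taps sum to 32), so op_put/op_avg add 16 before the >>5 to
 * round to nearest, while the *_no_rnd variants add only 15 and round down
 * ("no rounding" mode).  QPEL_MC expands all 16 sub-pel positions
 * (mc00..mc33) for both the 8x8 and 16x16 block sizes.
 * Illustrative only: with b == 32*x, op_put(a, b) reduces to a = clip(x).
 */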
1311 44eb4951 Michael Niedermayer
1312 1457ab52 Michael Niedermayer
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1313
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
1314
    int i;
1315
1316
    for(i=0; i<h; i++){
1317
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1318
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1319
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1320
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1321
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1322
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1323
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1324
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1325
        dst+=dstStride;
1326
        src+=srcStride;        
1327
    }
1328
}
1329
1330
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1331
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
1332
    int i;
1333
1334
    for(i=0; i<w; i++){
1335
        const int src_1= src[ -srcStride];
1336
        const int src0 = src[0          ];
1337
        const int src1 = src[  srcStride];
1338
        const int src2 = src[2*srcStride];
1339
        const int src3 = src[3*srcStride];
1340
        const int src4 = src[4*srcStride];
1341
        const int src5 = src[5*srcStride];
1342
        const int src6 = src[6*srcStride];
1343
        const int src7 = src[7*srcStride];
1344
        const int src8 = src[8*srcStride];
1345
        const int src9 = src[9*srcStride];
1346
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1347
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1348
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1349
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1350
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1351
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1352
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1353
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1354
        src++;
1355
        dst++;
1356
    }
1357
}
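/*
 * Note: the two wmv2_mspel8_*_lowpass routines above implement the WMV2
 * half-pel filter (-1, 9, 9, -1)/16 with +8 rounding, applied horizontally
 * and vertically.  The put_mspel8_mc<dx><dy>_c functions below combine them
 * for the eight sub-pel positions WMV2 uses; they are wired into
 * put_mspel_pixels_tab[] in dsputil_init() further down.
 */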
1358
1359
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
1360
    put_pixels8_c(dst, src, stride, 8);
1361
}
1362
1363
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
1364
    uint8_t half[64];
1365
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1366
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
1367
}
1368
1369
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
1370
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
1371
}
1372
1373
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
1374
    uint8_t half[64];
1375
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1376
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
1377
}
1378
1379
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
1380
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
1381
}
1382
1383
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
1384
    uint8_t halfH[88];
1385
    uint8_t halfV[64];
1386
    uint8_t halfHV[64];
1387
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1388
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1389
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1390
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
1391
}
1392
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
1393
    uint8_t halfH[88];
1394
    uint8_t halfV[64];
1395
    uint8_t halfHV[64];
1396
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1397
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1398
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1399
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
1400
}
1401
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
1402
    uint8_t halfH[88];
1403
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1404
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
1405
}
1406
1407
1408 0c1a9eda Zdenek Kabelac
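/*
 * Sum-of-absolute-differences (SAD) comparators used by motion estimation:
 * pix_abs16x16_c/pix_abs8x8_c compare a block against an integer-pel
 * reference, while the _x2/_y2/_xy2 variants compare against a half-pel
 * interpolated reference built on the fly with the avg2()/avg4() helpers
 * defined earlier in this file.
 */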
static inline int pix_abs16x16_c(uint8_t *pix1, uint8_t *pix2, int line_size)
1409 de6d9b64 Fabrice Bellard
{
1410
    int s, i;
1411
1412
    s = 0;
1413 ba6802de Michael Niedermayer
    for(i=0;i<16;i++) {
1414 de6d9b64 Fabrice Bellard
        s += abs(pix1[0] - pix2[0]);
1415
        s += abs(pix1[1] - pix2[1]);
1416
        s += abs(pix1[2] - pix2[2]);
1417
        s += abs(pix1[3] - pix2[3]);
1418
        s += abs(pix1[4] - pix2[4]);
1419
        s += abs(pix1[5] - pix2[5]);
1420
        s += abs(pix1[6] - pix2[6]);
1421
        s += abs(pix1[7] - pix2[7]);
1422
        s += abs(pix1[8] - pix2[8]);
1423
        s += abs(pix1[9] - pix2[9]);
1424
        s += abs(pix1[10] - pix2[10]);
1425
        s += abs(pix1[11] - pix2[11]);
1426
        s += abs(pix1[12] - pix2[12]);
1427
        s += abs(pix1[13] - pix2[13]);
1428
        s += abs(pix1[14] - pix2[14]);
1429
        s += abs(pix1[15] - pix2[15]);
1430
        pix1 += line_size;
1431
        pix2 += line_size;
1432
    }
1433
    return s;
1434
}
1435
1436 0c1a9eda Zdenek Kabelac
static int pix_abs16x16_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
1437 de6d9b64 Fabrice Bellard
{
1438
    int s, i;
1439
1440
    s = 0;
1441 ba6802de Michael Niedermayer
    for(i=0;i<16;i++) {
1442 de6d9b64 Fabrice Bellard
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1443
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1444
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1445
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1446
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1447
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1448
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1449
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1450
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1451
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1452
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1453
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1454
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1455
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1456
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1457
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1458
        pix1 += line_size;
1459
        pix2 += line_size;
1460
    }
1461
    return s;
1462
}
1463
1464 0c1a9eda Zdenek Kabelac
static int pix_abs16x16_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
1465 de6d9b64 Fabrice Bellard
{
1466
    int s, i;
1467 0c1a9eda Zdenek Kabelac
    uint8_t *pix3 = pix2 + line_size;
1468 de6d9b64 Fabrice Bellard
1469
    s = 0;
1470 ba6802de Michael Niedermayer
    for(i=0;i<16;i++) {
1471 de6d9b64 Fabrice Bellard
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1472
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1473
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1474
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1475
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1476
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1477
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1478
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1479
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1480
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1481
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1482
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1483
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1484
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1485
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1486
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1487
        pix1 += line_size;
1488
        pix2 += line_size;
1489
        pix3 += line_size;
1490
    }
1491
    return s;
1492
}
1493
1494 0c1a9eda Zdenek Kabelac
static int pix_abs16x16_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
1495 de6d9b64 Fabrice Bellard
{
1496
    int s, i;
1497 0c1a9eda Zdenek Kabelac
    uint8_t *pix3 = pix2 + line_size;
1498 de6d9b64 Fabrice Bellard
1499
    s = 0;
1500 ba6802de Michael Niedermayer
    for(i=0;i<16;i++) {
1501 de6d9b64 Fabrice Bellard
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1502
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1503
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1504
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1505
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1506
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1507
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1508
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1509
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1510
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1511
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1512
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1513
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1514
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1515
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1516
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1517
        pix1 += line_size;
1518
        pix2 += line_size;
1519
        pix3 += line_size;
1520
    }
1521
    return s;
1522
}
1523
1524 0c1a9eda Zdenek Kabelac
static inline int pix_abs8x8_c(uint8_t *pix1, uint8_t *pix2, int line_size)
1525 ba6802de Michael Niedermayer
{
1526
    int s, i;
1527
1528
    s = 0;
1529
    for(i=0;i<8;i++) {
1530
        s += abs(pix1[0] - pix2[0]);
1531
        s += abs(pix1[1] - pix2[1]);
1532
        s += abs(pix1[2] - pix2[2]);
1533
        s += abs(pix1[3] - pix2[3]);
1534
        s += abs(pix1[4] - pix2[4]);
1535
        s += abs(pix1[5] - pix2[5]);
1536
        s += abs(pix1[6] - pix2[6]);
1537
        s += abs(pix1[7] - pix2[7]);
1538
        pix1 += line_size;
1539
        pix2 += line_size;
1540
    }
1541
    return s;
1542
}
1543
1544 0c1a9eda Zdenek Kabelac
static int pix_abs8x8_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
1545 ba6802de Michael Niedermayer
{
1546
    int s, i;
1547
1548
    s = 0;
1549
    for(i=0;i<8;i++) {
1550
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1551
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1552
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1553
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1554
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1555
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1556
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1557
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1558
        pix1 += line_size;
1559
        pix2 += line_size;
1560
    }
1561
    return s;
1562
}
1563
1564 0c1a9eda Zdenek Kabelac
static int pix_abs8x8_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
1565 ba6802de Michael Niedermayer
{
1566
    int s, i;
1567 0c1a9eda Zdenek Kabelac
    uint8_t *pix3 = pix2 + line_size;
1568 ba6802de Michael Niedermayer
1569
    s = 0;
1570
    for(i=0;i<8;i++) {
1571
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1572
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1573
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1574
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1575
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1576
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1577
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1578
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1579
        pix1 += line_size;
1580
        pix2 += line_size;
1581
        pix3 += line_size;
1582
    }
1583
    return s;
1584
}
1585
1586 0c1a9eda Zdenek Kabelac
static int pix_abs8x8_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
1587 ba6802de Michael Niedermayer
{
1588
    int s, i;
1589 0c1a9eda Zdenek Kabelac
    uint8_t *pix3 = pix2 + line_size;
1590 ba6802de Michael Niedermayer
1591
    s = 0;
1592
    for(i=0;i<8;i++) {
1593
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1594
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1595
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1596
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1597
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1598
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1599
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1600
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1601
        pix1 += line_size;
1602
        pix2 += line_size;
1603
        pix3 += line_size;
1604
    }
1605
    return s;
1606
}
1607
1608 1457ab52 Michael Niedermayer
static int sad16x16_c(void *s, uint8_t *a, uint8_t *b, int stride){
1609
    return pix_abs16x16_c(a,b,stride);
1610
}
1611
1612
static int sad8x8_c(void *s, uint8_t *a, uint8_t *b, int stride){
1613
    return pix_abs8x8_c(a,b,stride);
1614
}
1615
1616 0c1a9eda Zdenek Kabelac
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1617 d962f6fd Arpi
{
1618 7801d21d Michael Niedermayer
    int i;
1619 477ab036 Michael Niedermayer
    DCTELEM temp[64];
1620 7801d21d Michael Niedermayer
    
1621
    if(last<=0) return;
1622 9a7b310d Zdenek Kabelac
    //if(permutation[1]==1) return; //FIXME it's OK but not clean and might fail for some permutations
1623 d962f6fd Arpi
1624 7801d21d Michael Niedermayer
    for(i=0; i<=last; i++){
1625
        const int j= scantable[i];
1626
        temp[j]= block[j];
1627
        block[j]=0;
1628
    }
1629
    
1630
    for(i=0; i<=last; i++){
1631
        const int j= scantable[i];
1632
        const int perm_j= permutation[j];
1633
        block[perm_j]= temp[j];
1634
    }
1635 d962f6fd Arpi
}
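/*
 * ff_block_permute() reorders the first last+1 coefficients (taken in
 * scantable order) of an 8x8 block according to 'permutation', zeroing the
 * positions they came from.  This adapts the coefficient layout to IDCT
 * implementations that expect a permuted ordering; with an identity
 * permutation it is a no-op.
 */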
1636 e0eac44e Fabrice Bellard
1637 eb4b3dd3 Zdenek Kabelac
static void clear_blocks_c(DCTELEM *blocks)
1638 649c00c9 Michael Niedermayer
{
1639
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
1640
}
1641
1642 11f18faf Michael Niedermayer
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1643
    int i;
1644 d32ac509 Felix von Leitner
    for(i=0; i+7<w; i+=8){
1645 11f18faf Michael Niedermayer
        dst[i+0] += src[i+0];
1646
        dst[i+1] += src[i+1];
1647
        dst[i+2] += src[i+2];
1648
        dst[i+3] += src[i+3];
1649
        dst[i+4] += src[i+4];
1650
        dst[i+5] += src[i+5];
1651
        dst[i+6] += src[i+6];
1652
        dst[i+7] += src[i+7];
1653
    }
1654
    for(; i<w; i++)
1655
        dst[i+0] += src[i+0];
1656
}
1657
1658
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1659
    int i;
1660 d32ac509 Felix von Leitner
    for(i=0; i+7<w; i+=8){
1661 11f18faf Michael Niedermayer
        dst[i+0] = src1[i+0]-src2[i+0];
1662
        dst[i+1] = src1[i+1]-src2[i+1];
1663
        dst[i+2] = src1[i+2]-src2[i+2];
1664
        dst[i+3] = src1[i+3]-src2[i+3];
1665
        dst[i+4] = src1[i+4]-src2[i+4];
1666
        dst[i+5] = src1[i+5]-src2[i+5];
1667
        dst[i+6] = src1[i+6]-src2[i+6];
1668
        dst[i+7] = src1[i+7]-src2[i+7];
1669
    }
1670
    for(; i<w; i++)
1671
        dst[i+0] = src1[i+0]-src2[i+0];
1672
}
1673
1674 1457ab52 Michael Niedermayer
#define BUTTERFLY2(o1,o2,i1,i2) \
1675
o1= (i1)+(i2);\
1676
o2= (i1)-(i2);
1677
1678
#define BUTTERFLY1(x,y) \
1679
{\
1680
    int a,b;\
1681
    a= x;\
1682
    b= y;\
1683
    x= a+b;\
1684
    y= a-b;\
1685
}
1686
1687
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
1688
1689
static int hadamard8_diff_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride){
1690
    int i;
1691
    int temp[64];
1692
    int sum=0;
1693
1694
    for(i=0; i<8; i++){
1695
        //FIXME try pointer walks
1696
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
1697
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
1698
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
1699
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
1700
        
1701
        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
1702
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
1703
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
1704
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
1705
        
1706
        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
1707
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
1708
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
1709
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
1710
    }
1711
1712
    for(i=0; i<8; i++){
1713
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
1714
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
1715
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
1716
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
1717
        
1718
        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
1719
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
1720
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
1721
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
1722
1723
        sum += 
1724
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
1725
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
1726
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
1727
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
1728
    }
1729
#if 0
1730
static int maxi=0;
1731
if(sum>maxi){
1732
    maxi=sum;
1733
    printf("MAX:%d\n", maxi);
1734
}
1735
#endif
1736
    return sum;
1737
}
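/*
 * hadamard8_diff_c() is the SATD comparator: it applies an 8x8 Hadamard
 * transform to the src-dst difference (rows first via BUTTERFLY2/BUTTERFLY1,
 * then columns) and sums the absolute values of the transformed coefficients;
 * the last vertical butterfly stage is folded into BUTTERFLYA.
 * hadamard8_abs_c() below does the same on a single block, subtracting a
 * constant 'mean' instead of a reference block.
 */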
1738
1739
static int hadamard8_abs_c(uint8_t *src, int stride, int mean){
1740
    int i;
1741
    int temp[64];
1742
    int sum=0;
1743
//FIXME OOPS: ignore the DC (term 0) instead of this mean-subtraction mess
1744
    for(i=0; i<8; i++){
1745
        //FIXME try pointer walks
1746
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-mean,src[stride*i+1]-mean);
1747
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-mean,src[stride*i+3]-mean);
1748
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-mean,src[stride*i+5]-mean);
1749
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-mean,src[stride*i+7]-mean);
1750
        
1751
        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
1752
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
1753
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
1754
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
1755
        
1756
        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
1757
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
1758
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
1759
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
1760
    }
1761
1762
    for(i=0; i<8; i++){
1763
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
1764
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
1765
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
1766
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
1767
        
1768
        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
1769
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
1770
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
1771
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
1772
    
1773
        sum += 
1774
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
1775
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
1776
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
1777
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
1778
    }
1779
    
1780
    return sum;
1781
}
1782
1783
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
1784
    MpegEncContext * const s= (MpegEncContext *)c;
1785 76fbb024 Michael Niedermayer
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
1786
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
1787 1457ab52 Michael Niedermayer
    int sum=0, i;
1788
1789
    s->dsp.diff_pixels(temp, src1, src2, stride);
1790
    s->fdct(temp);
1791
1792
    for(i=0; i<64; i++)
1793
        sum+= ABS(temp[i]);
1794
        
1795
    return sum;
1796
}
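/*
 * dct_sad8x8_c(): transform-domain SAD.  The pixel difference is run through
 * the encoder's forward DCT (s->fdct) and the absolute values of the 64
 * coefficients are summed.
 */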
1797
1798 0e15384d Michael Niedermayer
void simple_idct(DCTELEM *block); //FIXME
1799 1457ab52 Michael Niedermayer
1800
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
1801
    MpegEncContext * const s= (MpegEncContext *)c;
1802 76fbb024 Michael Niedermayer
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
1803
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
1804
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
1805 1457ab52 Michael Niedermayer
    int sum=0, i;
1806
1807
    s->mb_intra=0;
1808
    
1809
    s->dsp.diff_pixels(temp, src1, src2, stride);
1810
    
1811
    memcpy(bak, temp, 64*sizeof(DCTELEM));
1812
    
1813 67725183 Michael Niedermayer
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
1814 1457ab52 Michael Niedermayer
    s->dct_unquantize(s, temp, 0, s->qscale);
1815
    simple_idct(temp); //FIXME 
1816
    
1817
    for(i=0; i<64; i++)
1818
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
1819
        
1820
    return sum;
1821
}
1822
1823 3a87ac94 Michael Niedermayer
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
1824
    MpegEncContext * const s= (MpegEncContext *)c;
1825 0c1a9eda Zdenek Kabelac
    const uint8_t *scantable= s->intra_scantable.permutated;
1826 76fbb024 Michael Niedermayer
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
1827
    uint64_t __align8 aligned_bak[stride];
1828
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
1829
    uint8_t * const bak= (uint8_t*)aligned_bak;
1830 3a87ac94 Michael Niedermayer
    int i, last, run, bits, level, distortion, start_i;\
1831
    const int esc_length= s->ac_esc_length;
1832
    uint8_t * length;
1833
    uint8_t * last_length;
1834 67725183 Michael Niedermayer
    
1835
    for(i=0; i<8; i++){
1836
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
1837
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
1838
    }
1839 3a87ac94 Michael Niedermayer
1840 67725183 Michael Niedermayer
    s->dsp.diff_pixels(temp, src1, src2, stride);
1841
1842
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
1843
1844
    bits=0;
1845 3a87ac94 Michael Niedermayer
    
1846
    if (s->mb_intra) {
1847 67725183 Michael Niedermayer
        start_i = 1; 
1848 3a87ac94 Michael Niedermayer
        length     = s->intra_ac_vlc_length;
1849
        last_length= s->intra_ac_vlc_last_length;
1850 67725183 Michael Niedermayer
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
1851 3a87ac94 Michael Niedermayer
    } else {
1852
        start_i = 0;
1853
        length     = s->inter_ac_vlc_length;
1854
        last_length= s->inter_ac_vlc_last_length;
1855
    }
1856
    
1857 67725183 Michael Niedermayer
    if(last>=start_i){
1858 3a87ac94 Michael Niedermayer
        run=0;
1859
        for(i=start_i; i<last; i++){
1860
            int j= scantable[i];
1861
            level= temp[j];
1862
        
1863
            if(level){
1864
                level+=64;
1865
                if((level&(~127)) == 0){
1866
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
1867
                }else
1868
                    bits+= esc_length;
1869
                run=0;
1870
            }else
1871
                run++;
1872
        }
1873
        i= scantable[last];
1874 1d0eab1d Michael Niedermayer
       
1875 3a87ac94 Michael Niedermayer
        level= temp[i] + 64;
1876 1d0eab1d Michael Niedermayer
1877
        assert(level - 64);
1878
        
1879 3a87ac94 Michael Niedermayer
        if((level&(~127)) == 0){
1880
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
1881
        }else
1882
            bits+= esc_length;
1883
    
1884 67725183 Michael Niedermayer
    }
1885
1886
    if(last>=0){
1887 3a87ac94 Michael Niedermayer
        s->dct_unquantize(s, temp, 0, s->qscale);
1888
    }
1889
    
1890
    s->idct_add(bak, stride, temp);
1891
    
1892
    distortion= s->dsp.sse[1](NULL, bak, src1, stride);
1893
1894 67725183 Michael Niedermayer
    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
1895 3a87ac94 Michael Niedermayer
}
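/*
 * rd8x8_c() is a rate/distortion comparator: the difference block is DCT'd
 * and quantized, the bits needed to code the AC coefficients with the current
 * VLC length tables are counted, then the block is dequantized, inverse
 * transformed onto a copy of src2, and the resulting SSE against src1 is
 * measured.  The return value is that distortion plus a lambda-weighted bit
 * cost, (bits*qscale^2*109 + 64) >> 7.  bit8x8_c() below counts only the bits.
 */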
1896
1897
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
1898
    MpegEncContext * const s= (MpegEncContext *)c;
1899 0c1a9eda Zdenek Kabelac
    const uint8_t *scantable= s->intra_scantable.permutated;
1900 76fbb024 Michael Niedermayer
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
1901
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
1902 3a87ac94 Michael Niedermayer
    int i, last, run, bits, level, start_i;
1903
    const int esc_length= s->ac_esc_length;
1904
    uint8_t * length;
1905
    uint8_t * last_length;
1906 67725183 Michael Niedermayer
    
1907
    s->dsp.diff_pixels(temp, src1, src2, stride);
1908 3a87ac94 Michael Niedermayer
1909 67725183 Michael Niedermayer
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
1910
1911
    bits=0;
1912 3a87ac94 Michael Niedermayer
    
1913
    if (s->mb_intra) {
1914 67725183 Michael Niedermayer
        start_i = 1; 
1915 3a87ac94 Michael Niedermayer
        length     = s->intra_ac_vlc_length;
1916
        last_length= s->intra_ac_vlc_last_length;
1917 67725183 Michael Niedermayer
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
1918 3a87ac94 Michael Niedermayer
    } else {
1919
        start_i = 0;
1920
        length     = s->inter_ac_vlc_length;
1921
        last_length= s->inter_ac_vlc_last_length;
1922
    }
1923
    
1924 67725183 Michael Niedermayer
    if(last>=start_i){
1925 3a87ac94 Michael Niedermayer
        run=0;
1926
        for(i=start_i; i<last; i++){
1927
            int j= scantable[i];
1928
            level= temp[j];
1929
        
1930
            if(level){
1931
                level+=64;
1932
                if((level&(~127)) == 0){
1933
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
1934
                }else
1935
                    bits+= esc_length;
1936
                run=0;
1937
            }else
1938
                run++;
1939
        }
1940
        i= scantable[last];
1941 67725183 Michael Niedermayer
                
1942
        level= temp[i] + 64;
1943 3a87ac94 Michael Niedermayer
        
1944 67725183 Michael Niedermayer
        assert(level - 64);
1945 3a87ac94 Michael Niedermayer
        
1946
        if((level&(~127)) == 0){
1947
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
1948
        }else
1949
            bits+= esc_length;
1950
    }
1951
1952
    return bits;
1953
}
1954
1955
1956 1457ab52 Michael Niedermayer
WARPER88_1616(hadamard8_diff_c, hadamard8_diff16_c)
1957
WARPER88_1616(dct_sad8x8_c, dct_sad16x16_c)
1958
WARPER88_1616(quant_psnr8x8_c, quant_psnr16x16_c)
1959 3a87ac94 Michael Niedermayer
WARPER88_1616(rd8x8_c, rd16x16_c)
1960
WARPER88_1616(bit8x8_c, bit16x16_c)
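/*
 * WARPER88_1616 (defined earlier in this file) builds the 16x16 comparator
 * from the 8x8 one by evaluating the 8x8 function on the four 8x8 quadrants
 * of the macroblock and summing the results.
 */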
1961 1457ab52 Michael Niedermayer
1962 eb4b3dd3 Zdenek Kabelac
void dsputil_init(DSPContext* c, unsigned mask)
1963 e0eac44e Fabrice Bellard
{
1964 5abd509a Zdenek Kabelac
    static int init_done = 0;
1965 d2975f8d Michael Niedermayer
    int i;
1966 e0eac44e Fabrice Bellard
1967 5abd509a Zdenek Kabelac
    if (!init_done) {
1968
        for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1969
        for(i=0;i<MAX_NEG_CROP;i++) {
1970
            cropTbl[i] = 0;
1971
            cropTbl[i + MAX_NEG_CROP + 256] = 255;
1972
        }
1973 de6d9b64 Fabrice Bellard
1974 5abd509a Zdenek Kabelac
        for(i=0;i<512;i++) {
1975
            squareTbl[i] = (i - 256) * (i - 256);
1976
        }
1977 92ddb692 Zdenek Kabelac
1978
        for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
1979
1980
        init_done = 1;
1981 de6d9b64 Fabrice Bellard
    }
1982
1983 eb4b3dd3 Zdenek Kabelac
    c->get_pixels = get_pixels_c;
1984
    c->diff_pixels = diff_pixels_c;
1985
    c->put_pixels_clamped = put_pixels_clamped_c;
1986
    c->add_pixels_clamped = add_pixels_clamped_c;
1987
    c->gmc1 = gmc1_c;
1988
    c->gmc = gmc_c;
1989
    c->clear_blocks = clear_blocks_c;
1990
    c->pix_sum = pix_sum_c;
1991
    c->pix_norm1 = pix_norm1_c;
1992 1457ab52 Michael Niedermayer
    c->sse[0]= sse16_c;
1993
    c->sse[1]= sse8_c;
1994 eb4b3dd3 Zdenek Kabelac
1995 45553457 Zdenek Kabelac
    /* TODO: [0] = 16x16, [1] = 8x8 */
1996 eb4b3dd3 Zdenek Kabelac
    c->pix_abs16x16     = pix_abs16x16_c;
1997
    c->pix_abs16x16_x2  = pix_abs16x16_x2_c;
1998
    c->pix_abs16x16_y2  = pix_abs16x16_y2_c;
1999
    c->pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
2000
    c->pix_abs8x8     = pix_abs8x8_c;
2001
    c->pix_abs8x8_x2  = pix_abs8x8_x2_c;
2002
    c->pix_abs8x8_y2  = pix_abs8x8_y2_c;
2003
    c->pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
2004
2005 45553457 Zdenek Kabelac
#define dspfunc(PFX, IDX, NUM) \
2006
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
2007
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
2008
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
2009
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
2010
2011
    dspfunc(put, 0, 16);
2012
    dspfunc(put_no_rnd, 0, 16);
2013
    dspfunc(put, 1, 8);
2014
    dspfunc(put_no_rnd, 1, 8);
2015
2016
    dspfunc(avg, 0, 16);
2017
    dspfunc(avg_no_rnd, 0, 16);
2018
    dspfunc(avg, 1, 8);
2019
    dspfunc(avg_no_rnd, 1, 8);
2020
#undef dspfunc
2021
2022
#define dspfunc(PFX, IDX, NUM) \
2023
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2024
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2025
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2026
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2027
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2028
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2029
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2030
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2031
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2032
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2033
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2034
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2035
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2036
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2037
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2038
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2039
2040
    dspfunc(put_qpel, 0, 16);
2041
    dspfunc(put_no_rnd_qpel, 0, 16);
2042
2043
    dspfunc(avg_qpel, 0, 16);
2044
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2045
2046
    dspfunc(put_qpel, 1, 8);
2047
    dspfunc(put_no_rnd_qpel, 1, 8);
2048
2049
    dspfunc(avg_qpel, 1, 8);
2050
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2051
#undef dspfunc
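/*
 * Table layout (illustrative note): in the half-pel tables filled in by the
 * first dspfunc block, index [idx][0..3] selects full-pel, x-half, y-half and
 * xy-half interpolation, with idx 0 = 16x16 and idx 1 = 8x8 blocks.  In the
 * qpel tables, index [idx][(dy<<2)|dx] selects the quarter-pel offset spelled
 * out by the mc<dx><dy> suffix, e.g.
 *     c->put_qpel_pixels_tab[0][(1<<2)|3](dst, src, stride);  // dx=3, dy=1
 */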
2052 c9a2ebc4 Michael Niedermayer
2053 1457ab52 Michael Niedermayer
    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
2054
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2055
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2056
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2057
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2058
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2059
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2060
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2061
    
2062
    c->hadamard8_diff[0]= hadamard8_diff16_c;
2063
    c->hadamard8_diff[1]= hadamard8_diff_c;
2064
    c->hadamard8_abs = hadamard8_abs_c;
2065
    
2066
    c->dct_sad[0]= dct_sad16x16_c;
2067
    c->dct_sad[1]= dct_sad8x8_c;
2068
    
2069
    c->sad[0]= sad16x16_c;
2070
    c->sad[1]= sad8x8_c;
2071
    
2072
    c->quant_psnr[0]= quant_psnr16x16_c;
2073
    c->quant_psnr[1]= quant_psnr8x8_c;
2074 3a87ac94 Michael Niedermayer
2075
    c->rd[0]= rd16x16_c;
2076
    c->rd[1]= rd8x8_c;
2077
2078
    c->bit[0]= bit16x16_c;
2079
    c->bit[1]= bit8x8_c;
2080
        
2081 11f18faf Michael Niedermayer
    c->add_bytes= add_bytes_c;
2082
    c->diff_bytes= diff_bytes_c;
2083
2084 980fc7b8 Fabrice Bellard
#ifdef HAVE_MMX
2085 eb4b3dd3 Zdenek Kabelac
    dsputil_init_mmx(c, mask);
2086 34dfe896 Zdenek Kabelac
    if (ff_bit_exact)
2087
    {
2088
        /* FIXME - AVCodec context should have flag for bitexact match */
2089
        /* fprintf(stderr, "\n\n\nff_bit_exact %d\n\n\n\n", ff_bit_exact); */
2090
        dsputil_set_bit_exact_mmx(c, mask);
2091
    }
2092 de6d9b64 Fabrice Bellard
#endif
2093 3d03c0a2 Fabrice Bellard
#ifdef ARCH_ARMV4L
2094 eb4b3dd3 Zdenek Kabelac
    dsputil_init_armv4l(c, mask);
2095 3d03c0a2 Fabrice Bellard
#endif
2096 c34270f5 Fabrice Bellard
#ifdef HAVE_MLIB
2097 eb4b3dd3 Zdenek Kabelac
    dsputil_init_mlib(c, mask);
2098 c34270f5 Fabrice Bellard
#endif
2099 1e98dffb Nick Kurshev
#ifdef ARCH_ALPHA
2100 eb4b3dd3 Zdenek Kabelac
    dsputil_init_alpha(c, mask);
2101 1e98dffb Nick Kurshev
#endif
2102 59925ef2 Brian Foley
#ifdef ARCH_POWERPC
2103 eb4b3dd3 Zdenek Kabelac
    dsputil_init_ppc(c, mask);
2104 a43bd1d7 Heliodoro Tammaro
#endif
2105 d46aba26 Leon van Stuivenberg
#ifdef HAVE_MMI
2106 eb4b3dd3 Zdenek Kabelac
    dsputil_init_mmi(c, mask);
2107 d46aba26 Leon van Stuivenberg
#endif
2108 de6d9b64 Fabrice Bellard
}
2109 43f1708f Juanjo
2110 57060b1e Fabrice Bellard
/* remove any non-bit-exact operations (for testing purposes) */
2111
void avcodec_set_bit_exact(void)
2112
{
2113 5596c60c Michael Niedermayer
    ff_bit_exact=1;
2114 57060b1e Fabrice Bellard
#ifdef HAVE_MMX
2115 34dfe896 Zdenek Kabelac
// FIXME - better set_bit_exact
2116 eb4b3dd3 Zdenek Kabelac
//    dsputil_set_bit_exact_mmx();
2117 57060b1e Fabrice Bellard
#endif
2118
}