Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 68ca24e6

History | View | Annotate | Download (115 KB)

1 de6d9b64 Fabrice Bellard
/*
2
 * DSP utils
3 ff4ec49e Fabrice Bellard
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 de6d9b64 Fabrice Bellard
 *
5 ff4ec49e Fabrice Bellard
 * This library is free software; you can redistribute it and/or
6
 * modify it under the terms of the GNU Lesser General Public
7
 * License as published by the Free Software Foundation; either
8
 * version 2 of the License, or (at your option) any later version.
9 de6d9b64 Fabrice Bellard
 *
10 ff4ec49e Fabrice Bellard
 * This library is distributed in the hope that it will be useful,
11 de6d9b64 Fabrice Bellard
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ff4ec49e Fabrice Bellard
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
 * Lesser General Public License for more details.
14 de6d9b64 Fabrice Bellard
 *
15 ff4ec49e Fabrice Bellard
 * You should have received a copy of the GNU Lesser General Public
16
 * License along with this library; if not, write to the Free Software
17
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18 7ff037e9 Michael Niedermayer
 *
19 59fe111e Michael Niedermayer
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
20 de6d9b64 Fabrice Bellard
 */
21 983e3246 Michael Niedermayer
 
22
/**
23
 * @file dsputil.c
24
 * DSP utils
25
 */
26
 
27 de6d9b64 Fabrice Bellard
#include "avcodec.h"
28
#include "dsputil.h"
29 1457ab52 Michael Niedermayer
#include "mpegvideo.h"
30 b0368839 Michael Niedermayer
#include "simple_idct.h"
31 65e4c8c9 Michael Niedermayer
#include "faandct.h"
32 5596c60c Michael Niedermayer
33 0c1a9eda Zdenek Kabelac
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
34
uint32_t squareTbl[512];
35 de6d9b64 Fabrice Bellard
36 0c1a9eda Zdenek Kabelac
/* Standard JPEG/MPEG zigzag scan order: maps scan position -> raster
   index within an 8x8 block (a permutation of 0..63). */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
46
47 10acc479 Roman Shaposhnik
/* Specific zigzag scan for the 2-4-8 IDCT. NOTE that unlike the
   specification, we interleave the fields. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
59
60 2f349de2 Michael Niedermayer
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
61 0c1a9eda Zdenek Kabelac
/* Inverse zigzag table used by the MMX quantizer; entries are the
   scan position + 1 for each raster index.
   NOTE(review): defined here but filled in elsewhere — presumably
   during dsputil/MMX initialization; not visible in this chunk. */
uint16_t __align8 inv_zigzag_direct16[64];
62 2f349de2 Michael Niedermayer
63 0c1a9eda Zdenek Kabelac
/* Alternate horizontal scan order (permutation of 0..63), used instead
   of the plain zigzag for some coding modes. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
73
74 0c1a9eda Zdenek Kabelac
/* Alternate vertical scan order (permutation of 0..63), the transpose
   counterpart of the alternate horizontal scan. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
84
85 2f349de2 Michael Niedermayer
/* Reciprocal table for division by multiplication:
   (a * inverse[b]) >> 32 == a / b  for all 0 <= a <= 65536 and 2 <= b <= 255.
   inverse[0] is unused (0) and inverse[1] is saturated to UINT32_MAX. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757, 
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154, 
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709, 
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333, 
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367, 
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283, 
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315, 
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085, 
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498, 
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675, 
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441, 
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183, 
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712, 
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400, 
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163, 
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641, 
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573, 
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737, 
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493, 
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373, 
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368, 
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671, 
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767, 
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740, 
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751, 
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635, 
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593, 
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944, 
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933, 
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575, 
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532, 
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
120
121 b0368839 Michael Niedermayer
/* Input coefficient permutation required by simple_idct_mmx
   (a permutation of 0x00..0x3F). */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
132
133 0c1a9eda Zdenek Kabelac
/**
 * Sum of all pixel values in a 16x16 block.
 *
 * @param pix       top-left pixel of the block
 * @param line_size byte stride between successive rows
 * @return sum of the 256 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
154
155 0c1a9eda Zdenek Kabelac
static int pix_norm1_c(uint8_t * pix, int line_size)
156 3aa102be Michael Niedermayer
{
157
    int s, i, j;
158 0c1a9eda Zdenek Kabelac
    uint32_t *sq = squareTbl + 256;
159 3aa102be Michael Niedermayer
160
    s = 0;
161
    for (i = 0; i < 16; i++) {
162
        for (j = 0; j < 16; j += 8) {
163 2a006cd3 Felix von Leitner
#if 0
164 3aa102be Michael Niedermayer
            s += sq[pix[0]];
165
            s += sq[pix[1]];
166
            s += sq[pix[2]];
167
            s += sq[pix[3]];
168
            s += sq[pix[4]];
169
            s += sq[pix[5]];
170
            s += sq[pix[6]];
171
            s += sq[pix[7]];
172 2a006cd3 Felix von Leitner
#else
173
#if LONG_MAX > 2147483647
174
            register uint64_t x=*(uint64_t*)pix;
175
            s += sq[x&0xff];
176
            s += sq[(x>>8)&0xff];
177
            s += sq[(x>>16)&0xff];
178
            s += sq[(x>>24)&0xff];
179
            s += sq[(x>>32)&0xff];
180
            s += sq[(x>>40)&0xff];
181
            s += sq[(x>>48)&0xff];
182
            s += sq[(x>>56)&0xff];
183
#else
184
            register uint32_t x=*(uint32_t*)pix;
185
            s += sq[x&0xff];
186
            s += sq[(x>>8)&0xff];
187
            s += sq[(x>>16)&0xff];
188
            s += sq[(x>>24)&0xff];
189
            x=*(uint32_t*)(pix+4);
190
            s += sq[x&0xff];
191
            s += sq[(x>>8)&0xff];
192
            s += sq[(x>>16)&0xff];
193
            s += sq[(x>>24)&0xff];
194
#endif
195
#endif
196 3aa102be Michael Niedermayer
            pix += 8;
197
        }
198
        pix += line_size - 16;
199
    }
200
    return s;
201
}
202
203 3d2e8cce Michael Niedermayer
/**
 * Byte-swap an array of w 32-bit words from src into dst
 * (main loop unrolled by 8, remainder handled one word at a time).
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int n;

    for(n=0; n+8<=w; n+=8){
        dst[n    ]= bswap_32(src[n    ]);
        dst[n + 1]= bswap_32(src[n + 1]);
        dst[n + 2]= bswap_32(src[n + 2]);
        dst[n + 3]= bswap_32(src[n + 3]);
        dst[n + 4]= bswap_32(src[n + 4]);
        dst[n + 5]= bswap_32(src[n + 5]);
        dst[n + 6]= bswap_32(src[n + 6]);
        dst[n + 7]= bswap_32(src[n + 7]);
    }
    while(n < w){
        dst[n]= bswap_32(src[n]);
        n++;
    }
}
221 0c1a9eda Zdenek Kabelac
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size)
222 1457ab52 Michael Niedermayer
{
223
    int s, i;
224 0c1a9eda Zdenek Kabelac
    uint32_t *sq = squareTbl + 256;
225 1457ab52 Michael Niedermayer
226
    s = 0;
227
    for (i = 0; i < 8; i++) {
228
        s += sq[pix1[0] - pix2[0]];
229
        s += sq[pix1[1] - pix2[1]];
230
        s += sq[pix1[2] - pix2[2]];
231
        s += sq[pix1[3] - pix2[3]];
232
        s += sq[pix1[4] - pix2[4]];
233
        s += sq[pix1[5] - pix2[5]];
234
        s += sq[pix1[6] - pix2[6]];
235
        s += sq[pix1[7] - pix2[7]];
236
        pix1 += line_size;
237
        pix2 += line_size;
238
    }
239
    return s;
240
}
241
242 6b026927 Falk Hüffner
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
243 9c76bd48 Brian Foley
{
244 6b026927 Falk Hüffner
    int s, i;
245
    uint32_t *sq = squareTbl + 256;
246 9c76bd48 Brian Foley
247
    s = 0;
248
    for (i = 0; i < 16; i++) {
249 6b026927 Falk Hüffner
        s += sq[pix1[ 0] - pix2[ 0]];
250
        s += sq[pix1[ 1] - pix2[ 1]];
251
        s += sq[pix1[ 2] - pix2[ 2]];
252
        s += sq[pix1[ 3] - pix2[ 3]];
253
        s += sq[pix1[ 4] - pix2[ 4]];
254
        s += sq[pix1[ 5] - pix2[ 5]];
255
        s += sq[pix1[ 6] - pix2[ 6]];
256
        s += sq[pix1[ 7] - pix2[ 7]];
257
        s += sq[pix1[ 8] - pix2[ 8]];
258
        s += sq[pix1[ 9] - pix2[ 9]];
259
        s += sq[pix1[10] - pix2[10]];
260
        s += sq[pix1[11] - pix2[11]];
261
        s += sq[pix1[12] - pix2[12]];
262
        s += sq[pix1[13] - pix2[13]];
263
        s += sq[pix1[14] - pix2[14]];
264
        s += sq[pix1[15] - pix2[15]];
265 2a006cd3 Felix von Leitner
266 6b026927 Falk Hüffner
        pix1 += line_size;
267
        pix2 += line_size;
268 9c76bd48 Brian Foley
    }
269
    return s;
270
}
271
272 0c1a9eda Zdenek Kabelac
/**
 * Copy an 8x8 block of pixels into a DCT coefficient block
 * (DCTELEM is the coefficient type from dsputil.h).
 *
 * @param block     destination, 8x8 row-major (stride 8)
 * @param pixels    source pixels, rows separated by line_size bytes
 * @param line_size byte stride between source rows
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for(row=0; row<8; row++) {
        for(col=0; col<8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block += 8;
    }
}
290
291 0c1a9eda Zdenek Kabelac
/**
 * Store the per-pixel difference of two 8x8 blocks (s1 - s2) into a
 * DCT coefficient block.
 *
 * @param block  destination, 8x8 row-major (stride 8)
 * @param s1     first source block
 * @param s2     second source block
 * @param stride byte stride between rows of both sources
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int row, col;

    for(row=0; row<8; row++) {
        for(col=0; col<8; col++)
            block[col] = s1[col] - s2[col];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
310
311
312 0c1a9eda Zdenek Kabelac
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
313 eb4b3dd3 Zdenek Kabelac
                                 int line_size)
314 de6d9b64 Fabrice Bellard
{
315
    int i;
316 0c1a9eda Zdenek Kabelac
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
317 de6d9b64 Fabrice Bellard
    
318
    /* read the pixels */
319
    for(i=0;i<8;i++) {
320 c13e1abd Falk Hüffner
        pixels[0] = cm[block[0]];
321
        pixels[1] = cm[block[1]];
322
        pixels[2] = cm[block[2]];
323
        pixels[3] = cm[block[3]];
324
        pixels[4] = cm[block[4]];
325
        pixels[5] = cm[block[5]];
326
        pixels[6] = cm[block[6]];
327
        pixels[7] = cm[block[7]];
328
329
        pixels += line_size;
330
        block += 8;
331 de6d9b64 Fabrice Bellard
    }
332
}
333
334 0c1a9eda Zdenek Kabelac
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
335 c13e1abd Falk Hüffner
                          int line_size)
336 de6d9b64 Fabrice Bellard
{
337
    int i;
338 0c1a9eda Zdenek Kabelac
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
339 de6d9b64 Fabrice Bellard
    
340
    /* read the pixels */
341
    for(i=0;i<8;i++) {
342 c13e1abd Falk Hüffner
        pixels[0] = cm[pixels[0] + block[0]];
343
        pixels[1] = cm[pixels[1] + block[1]];
344
        pixels[2] = cm[pixels[2] + block[2]];
345
        pixels[3] = cm[pixels[3] + block[3]];
346
        pixels[4] = cm[pixels[4] + block[4]];
347
        pixels[5] = cm[pixels[5] + block[5]];
348
        pixels[6] = cm[pixels[6] + block[6]];
349
        pixels[7] = cm[pixels[7] + block[7]];
350
        pixels += line_size;
351
        block += 8;
352 de6d9b64 Fabrice Bellard
    }
353
}
354 59fe111e Michael Niedermayer
#if 0
355

356
#define PIXOP2(OPNAME, OP) \
357 b3184779 Michael Niedermayer
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
358 59fe111e Michael Niedermayer
{\
359
    int i;\
360
    for(i=0; i<h; i++){\
361
        OP(*((uint64_t*)block), LD64(pixels));\
362
        pixels+=line_size;\
363
        block +=line_size;\
364
    }\
365
}\
366
\
367 45553457 Zdenek Kabelac
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
368 59fe111e Michael Niedermayer
{\
369
    int i;\
370
    for(i=0; i<h; i++){\
371
        const uint64_t a= LD64(pixels  );\
372
        const uint64_t b= LD64(pixels+1);\
373
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
374
        pixels+=line_size;\
375
        block +=line_size;\
376
    }\
377
}\
378
\
379 45553457 Zdenek Kabelac
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
380 59fe111e Michael Niedermayer
{\
381
    int i;\
382
    for(i=0; i<h; i++){\
383
        const uint64_t a= LD64(pixels  );\
384
        const uint64_t b= LD64(pixels+1);\
385
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
386
        pixels+=line_size;\
387
        block +=line_size;\
388
    }\
389
}\
390
\
391 45553457 Zdenek Kabelac
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
392 59fe111e Michael Niedermayer
{\
393
    int i;\
394
    for(i=0; i<h; i++){\
395
        const uint64_t a= LD64(pixels          );\
396
        const uint64_t b= LD64(pixels+line_size);\
397
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
398
        pixels+=line_size;\
399
        block +=line_size;\
400
    }\
401
}\
402
\
403 45553457 Zdenek Kabelac
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
404 59fe111e Michael Niedermayer
{\
405
    int i;\
406
    for(i=0; i<h; i++){\
407
        const uint64_t a= LD64(pixels          );\
408
        const uint64_t b= LD64(pixels+line_size);\
409
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
410
        pixels+=line_size;\
411
        block +=line_size;\
412
    }\
413
}\
414
\
415 45553457 Zdenek Kabelac
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
416 59fe111e Michael Niedermayer
{\
417
        int i;\
418
        const uint64_t a= LD64(pixels  );\
419
        const uint64_t b= LD64(pixels+1);\
420
        uint64_t l0=  (a&0x0303030303030303ULL)\
421
                    + (b&0x0303030303030303ULL)\
422
                    + 0x0202020202020202ULL;\
423
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
424
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
425
        uint64_t l1,h1;\
426
\
427
        pixels+=line_size;\
428
        for(i=0; i<h; i+=2){\
429
            uint64_t a= LD64(pixels  );\
430
            uint64_t b= LD64(pixels+1);\
431
            l1=  (a&0x0303030303030303ULL)\
432
               + (b&0x0303030303030303ULL);\
433
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
434
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
435
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
436
            pixels+=line_size;\
437
            block +=line_size;\
438
            a= LD64(pixels  );\
439
            b= LD64(pixels+1);\
440
            l0=  (a&0x0303030303030303ULL)\
441
               + (b&0x0303030303030303ULL)\
442
               + 0x0202020202020202ULL;\
443
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
444
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
445
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
446
            pixels+=line_size;\
447
            block +=line_size;\
448
        }\
449
}\
450
\
451 45553457 Zdenek Kabelac
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
452 59fe111e Michael Niedermayer
{\
453
        int i;\
454
        const uint64_t a= LD64(pixels  );\
455
        const uint64_t b= LD64(pixels+1);\
456
        uint64_t l0=  (a&0x0303030303030303ULL)\
457
                    + (b&0x0303030303030303ULL)\
458
                    + 0x0101010101010101ULL;\
459
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
460
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
461
        uint64_t l1,h1;\
462
\
463
        pixels+=line_size;\
464
        for(i=0; i<h; i+=2){\
465
            uint64_t a= LD64(pixels  );\
466
            uint64_t b= LD64(pixels+1);\
467
            l1=  (a&0x0303030303030303ULL)\
468
               + (b&0x0303030303030303ULL);\
469
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
470
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
471
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
472
            pixels+=line_size;\
473
            block +=line_size;\
474
            a= LD64(pixels  );\
475
            b= LD64(pixels+1);\
476
            l0=  (a&0x0303030303030303ULL)\
477
               + (b&0x0303030303030303ULL)\
478
               + 0x0101010101010101ULL;\
479
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
480
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
481
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
482
            pixels+=line_size;\
483
            block +=line_size;\
484
        }\
485
}\
486
\
487 45553457 Zdenek Kabelac
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
488
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
489
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
490
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
491
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
492
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
493
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
494 59fe111e Michael Niedermayer

495
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
496
#else // 64 bit variant
497
498
#define PIXOP2(OPNAME, OP) \
499 669ac79c Michael Niedermayer
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
500
    int i;\
501
    for(i=0; i<h; i++){\
502
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
503
        pixels+=line_size;\
504
        block +=line_size;\
505
    }\
506
}\
507 0da71265 Michael Niedermayer
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
508
    int i;\
509
    for(i=0; i<h; i++){\
510
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
511
        pixels+=line_size;\
512
        block +=line_size;\
513
    }\
514
}\
515 45553457 Zdenek Kabelac
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
516 59fe111e Michael Niedermayer
    int i;\
517
    for(i=0; i<h; i++){\
518
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
519
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
520
        pixels+=line_size;\
521
        block +=line_size;\
522
    }\
523
}\
524 45553457 Zdenek Kabelac
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
525
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
526 b3184779 Michael Niedermayer
}\
527 59fe111e Michael Niedermayer
\
528 b3184779 Michael Niedermayer
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
529
                                                int src_stride1, int src_stride2, int h){\
530 59fe111e Michael Niedermayer
    int i;\
531
    for(i=0; i<h; i++){\
532 b3184779 Michael Niedermayer
        uint32_t a,b;\
533
        a= LD32(&src1[i*src_stride1  ]);\
534
        b= LD32(&src2[i*src_stride2  ]);\
535 d8085ea7 Michael Niedermayer
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
536 b3184779 Michael Niedermayer
        a= LD32(&src1[i*src_stride1+4]);\
537
        b= LD32(&src2[i*src_stride2+4]);\
538 d8085ea7 Michael Niedermayer
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
539 59fe111e Michael Niedermayer
    }\
540
}\
541
\
542 b3184779 Michael Niedermayer
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
543
                                                int src_stride1, int src_stride2, int h){\
544 59fe111e Michael Niedermayer
    int i;\
545
    for(i=0; i<h; i++){\
546 b3184779 Michael Niedermayer
        uint32_t a,b;\
547
        a= LD32(&src1[i*src_stride1  ]);\
548
        b= LD32(&src2[i*src_stride2  ]);\
549 d8085ea7 Michael Niedermayer
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
550 b3184779 Michael Niedermayer
        a= LD32(&src1[i*src_stride1+4]);\
551
        b= LD32(&src2[i*src_stride2+4]);\
552 d8085ea7 Michael Niedermayer
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
553 59fe111e Michael Niedermayer
    }\
554
}\
555
\
556 0da71265 Michael Niedermayer
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
557
                                                int src_stride1, int src_stride2, int h){\
558
    int i;\
559
    for(i=0; i<h; i++){\
560
        uint32_t a,b;\
561
        a= LD32(&src1[i*src_stride1  ]);\
562
        b= LD32(&src2[i*src_stride2  ]);\
563 d8085ea7 Michael Niedermayer
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
564 0da71265 Michael Niedermayer
    }\
565
}\
566
\
567 669ac79c Michael Niedermayer
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
568
                                                int src_stride1, int src_stride2, int h){\
569
    int i;\
570
    for(i=0; i<h; i++){\
571
        uint32_t a,b;\
572
        a= LD16(&src1[i*src_stride1  ]);\
573
        b= LD16(&src2[i*src_stride2  ]);\
574
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
575
    }\
576
}\
577
\
578 b3184779 Michael Niedermayer
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
579
                                                int src_stride1, int src_stride2, int h){\
580
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
581
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
582
}\
583
\
584
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
585
                                                int src_stride1, int src_stride2, int h){\
586
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
587
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
588
}\
589
\
590 45553457 Zdenek Kabelac
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
591 b3184779 Michael Niedermayer
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
592
}\
593
\
594 45553457 Zdenek Kabelac
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
595 b3184779 Michael Niedermayer
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
596
}\
597
\
598 45553457 Zdenek Kabelac
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
599 b3184779 Michael Niedermayer
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
600
}\
601
\
602 45553457 Zdenek Kabelac
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
603 b3184779 Michael Niedermayer
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
604
}\
605
\
606
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
607
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
608 59fe111e Michael Niedermayer
    int i;\
609
    for(i=0; i<h; i++){\
610 b3184779 Michael Niedermayer
        uint32_t a, b, c, d, l0, l1, h0, h1;\
611
        a= LD32(&src1[i*src_stride1]);\
612
        b= LD32(&src2[i*src_stride2]);\
613
        c= LD32(&src3[i*src_stride3]);\
614
        d= LD32(&src4[i*src_stride4]);\
615
        l0=  (a&0x03030303UL)\
616
           + (b&0x03030303UL)\
617
           + 0x02020202UL;\
618
        h0= ((a&0xFCFCFCFCUL)>>2)\
619
          + ((b&0xFCFCFCFCUL)>>2);\
620
        l1=  (c&0x03030303UL)\
621
           + (d&0x03030303UL);\
622
        h1= ((c&0xFCFCFCFCUL)>>2)\
623
          + ((d&0xFCFCFCFCUL)>>2);\
624
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
625
        a= LD32(&src1[i*src_stride1+4]);\
626
        b= LD32(&src2[i*src_stride2+4]);\
627
        c= LD32(&src3[i*src_stride3+4]);\
628
        d= LD32(&src4[i*src_stride4+4]);\
629
        l0=  (a&0x03030303UL)\
630
           + (b&0x03030303UL)\
631
           + 0x02020202UL;\
632
        h0= ((a&0xFCFCFCFCUL)>>2)\
633
          + ((b&0xFCFCFCFCUL)>>2);\
634
        l1=  (c&0x03030303UL)\
635
           + (d&0x03030303UL);\
636
        h1= ((c&0xFCFCFCFCUL)>>2)\
637
          + ((d&0xFCFCFCFCUL)>>2);\
638
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
639 59fe111e Michael Niedermayer
    }\
640
}\
641 669ac79c Michael Niedermayer
\
642
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
643
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
644
}\
645
\
646
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
647
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
648
}\
649
\
650
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
651
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
652
}\
653
\
654
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
655
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
656
}\
657
\
658 b3184779 Michael Niedermayer
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
659
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
660 59fe111e Michael Niedermayer
    int i;\
661
    for(i=0; i<h; i++){\
662 b3184779 Michael Niedermayer
        uint32_t a, b, c, d, l0, l1, h0, h1;\
663
        a= LD32(&src1[i*src_stride1]);\
664
        b= LD32(&src2[i*src_stride2]);\
665
        c= LD32(&src3[i*src_stride3]);\
666
        d= LD32(&src4[i*src_stride4]);\
667
        l0=  (a&0x03030303UL)\
668
           + (b&0x03030303UL)\
669
           + 0x01010101UL;\
670
        h0= ((a&0xFCFCFCFCUL)>>2)\
671
          + ((b&0xFCFCFCFCUL)>>2);\
672
        l1=  (c&0x03030303UL)\
673
           + (d&0x03030303UL);\
674
        h1= ((c&0xFCFCFCFCUL)>>2)\
675
          + ((d&0xFCFCFCFCUL)>>2);\
676
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
677
        a= LD32(&src1[i*src_stride1+4]);\
678
        b= LD32(&src2[i*src_stride2+4]);\
679
        c= LD32(&src3[i*src_stride3+4]);\
680
        d= LD32(&src4[i*src_stride4+4]);\
681
        l0=  (a&0x03030303UL)\
682
           + (b&0x03030303UL)\
683
           + 0x01010101UL;\
684
        h0= ((a&0xFCFCFCFCUL)>>2)\
685
          + ((b&0xFCFCFCFCUL)>>2);\
686
        l1=  (c&0x03030303UL)\
687
           + (d&0x03030303UL);\
688
        h1= ((c&0xFCFCFCFCUL)>>2)\
689
          + ((d&0xFCFCFCFCUL)>>2);\
690
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
691 59fe111e Michael Niedermayer
    }\
692
}\
693 b3184779 Michael Niedermayer
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
694
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
695
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
696
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
697
}\
698
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
699
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
700
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
701
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
702
}\
703 59fe111e Michael Niedermayer
\
704 669ac79c Michael Niedermayer
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
705
{\
706
        int i, a0, b0, a1, b1;\
707
        a0= pixels[0];\
708
        b0= pixels[1] + 2;\
709
        a0 += b0;\
710
        b0 += pixels[2];\
711
\
712
        pixels+=line_size;\
713
        for(i=0; i<h; i+=2){\
714
            a1= pixels[0];\
715
            b1= pixels[1];\
716
            a1 += b1;\
717
            b1 += pixels[2];\
718
\
719
            block[0]= (a1+a0)>>2; /* FIXME non put */\
720
            block[1]= (b1+b0)>>2;\
721
\
722
            pixels+=line_size;\
723
            block +=line_size;\
724
\
725
            a0= pixels[0];\
726
            b0= pixels[1] + 2;\
727
            a0 += b0;\
728
            b0 += pixels[2];\
729
\
730
            block[0]= (a1+a0)>>2;\
731
            block[1]= (b1+b0)>>2;\
732
            pixels+=line_size;\
733
            block +=line_size;\
734
        }\
735
}\
736
\
737
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
738
{\
739
        int i;\
740
        const uint32_t a= LD32(pixels  );\
741
        const uint32_t b= LD32(pixels+1);\
742
        uint32_t l0=  (a&0x03030303UL)\
743
                    + (b&0x03030303UL)\
744
                    + 0x02020202UL;\
745
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
746
                   + ((b&0xFCFCFCFCUL)>>2);\
747
        uint32_t l1,h1;\
748
\
749
        pixels+=line_size;\
750
        for(i=0; i<h; i+=2){\
751
            uint32_t a= LD32(pixels  );\
752
            uint32_t b= LD32(pixels+1);\
753
            l1=  (a&0x03030303UL)\
754
               + (b&0x03030303UL);\
755
            h1= ((a&0xFCFCFCFCUL)>>2)\
756
              + ((b&0xFCFCFCFCUL)>>2);\
757
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
758
            pixels+=line_size;\
759
            block +=line_size;\
760
            a= LD32(pixels  );\
761
            b= LD32(pixels+1);\
762
            l0=  (a&0x03030303UL)\
763
               + (b&0x03030303UL)\
764
               + 0x02020202UL;\
765
            h0= ((a&0xFCFCFCFCUL)>>2)\
766
              + ((b&0xFCFCFCFCUL)>>2);\
767
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
768
            pixels+=line_size;\
769
            block +=line_size;\
770
        }\
771
}\
772
\
773 45553457 Zdenek Kabelac
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
774 59fe111e Michael Niedermayer
{\
775
    int j;\
776
    for(j=0; j<2; j++){\
777
        int i;\
778
        const uint32_t a= LD32(pixels  );\
779
        const uint32_t b= LD32(pixels+1);\
780
        uint32_t l0=  (a&0x03030303UL)\
781
                    + (b&0x03030303UL)\
782
                    + 0x02020202UL;\
783
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
784
                   + ((b&0xFCFCFCFCUL)>>2);\
785
        uint32_t l1,h1;\
786
\
787
        pixels+=line_size;\
788
        for(i=0; i<h; i+=2){\
789
            uint32_t a= LD32(pixels  );\
790
            uint32_t b= LD32(pixels+1);\
791
            l1=  (a&0x03030303UL)\
792
               + (b&0x03030303UL);\
793
            h1= ((a&0xFCFCFCFCUL)>>2)\
794
              + ((b&0xFCFCFCFCUL)>>2);\
795
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
796
            pixels+=line_size;\
797
            block +=line_size;\
798
            a= LD32(pixels  );\
799
            b= LD32(pixels+1);\
800
            l0=  (a&0x03030303UL)\
801
               + (b&0x03030303UL)\
802
               + 0x02020202UL;\
803
            h0= ((a&0xFCFCFCFCUL)>>2)\
804
              + ((b&0xFCFCFCFCUL)>>2);\
805
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
806
            pixels+=line_size;\
807
            block +=line_size;\
808
        }\
809
        pixels+=4-line_size*(h+1);\
810
        block +=4-line_size*h;\
811
    }\
812
}\
813
\
814 45553457 Zdenek Kabelac
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
815 59fe111e Michael Niedermayer
{\
816
    int j;\
817
    for(j=0; j<2; j++){\
818
        int i;\
819
        const uint32_t a= LD32(pixels  );\
820
        const uint32_t b= LD32(pixels+1);\
821
        uint32_t l0=  (a&0x03030303UL)\
822
                    + (b&0x03030303UL)\
823
                    + 0x01010101UL;\
824
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
825
                   + ((b&0xFCFCFCFCUL)>>2);\
826
        uint32_t l1,h1;\
827
\
828
        pixels+=line_size;\
829
        for(i=0; i<h; i+=2){\
830
            uint32_t a= LD32(pixels  );\
831
            uint32_t b= LD32(pixels+1);\
832
            l1=  (a&0x03030303UL)\
833
               + (b&0x03030303UL);\
834
            h1= ((a&0xFCFCFCFCUL)>>2)\
835
              + ((b&0xFCFCFCFCUL)>>2);\
836
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
837
            pixels+=line_size;\
838
            block +=line_size;\
839
            a= LD32(pixels  );\
840
            b= LD32(pixels+1);\
841
            l0=  (a&0x03030303UL)\
842
               + (b&0x03030303UL)\
843
               + 0x01010101UL;\
844
            h0= ((a&0xFCFCFCFCUL)>>2)\
845
              + ((b&0xFCFCFCFCUL)>>2);\
846
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
847
            pixels+=line_size;\
848
            block +=line_size;\
849
        }\
850
        pixels+=4-line_size*(h+1);\
851
        block +=4-line_size*h;\
852
    }\
853
}\
854
\
855 45553457 Zdenek Kabelac
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
856
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
857
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
858
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
859
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
860
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
861
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
862
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
863 b3184779 Michael Niedermayer
864 d8085ea7 Michael Niedermayer
/* Packed rounding average of two 32-bit words of pixels (C fallback). */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif /* NOTE(review): presumably closes a word-size/arch #if opened above this chunk -- confirm against full file */
/* Plain store. */
#define op_put(a, b) a = b

/* Instantiate the C pixel copy/average primitives: avg_* with rounding
 * averages, put_* with plain stores. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

/* Scalar rounded averages of 2 and 4 samples. */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
875
876 073b013d Michael Niedermayer
877 0c1a9eda Zdenek Kabelac
/**
 * 1/16-pel bilinear interpolation of one 8-pixel-wide, h-tall block
 * ("GMC1", single motion vector case of MPEG-4 global MC).
 * (x16, y16) are the fractional positions in 1/16 units (0..15);
 * rounder is added before the >>8 normalization (weights sum to 256).
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w00 = (16 - x16) * (16 - y16); /* top-left weight */
    const int w01 = (     x16) * (16 - y16); /* top-right weight */
    const int w10 = (16 - x16) * (     y16); /* bottom-left weight */
    const int w11 = (     x16) * (     y16); /* bottom-right weight */
    int y, x;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++) {
            dst[x] = (w00 * src[x]          + w01 * src[x + 1]
                    + w10 * src[stride + x] + w11 * src[stride + x + 1]
                    + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
899
900 0c1a9eda Zdenek Kabelac
/**
 * Affine global motion compensation of one 8-pixel-wide, h-tall block.
 * The per-pixel source position is tracked in 16.16 fixed point:
 * (ox,oy) is the position of the first pixel, (dxx,dyx) is added per
 * column and (dxy,dyy) per row.  The low `shift' bits of the 16.16
 * integer part select the sub-pel phase (s = 1<<shift steps); r is the
 * rounding constant for the >>(shift*2) normalization.  width/height
 * give the valid source extent, used for edge clamping via clip().
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, 
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;
    
    /* after this, width/height are the largest safely addressable
     * coordinates (one interpolation neighbour may still be read) */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            /* sub-pel phase = low `shift' bits of the integer part */
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;
  
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: bilinear interpolation of 4 samples */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* vertically outside: clamp y, interpolate in x only */
                    index= src_x + clip(src_y, 0, height)*stride;                    
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x) 
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: clamp x, interpolate in y only */
                    index= clip(src_x, 0, width) + src_y*stride;                    
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y) 
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* outside in both axes: nearest clamped sample */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;                    
                    dst[y*stride + x]=    src[index         ];
                }
            }
            
            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
957 669ac79c Michael Niedermayer
958
/* Full-pel copy: dispatch to the fixed-width put_pixels helper. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if      (width ==  2) put_pixels2_c (dst, src, stride, height);
    else if (width ==  4) put_pixels4_c (dst, src, stride, height);
    else if (width ==  8) put_pixels8_c (dst, src, stride, height);
    else if (width == 16) put_pixels16_c(dst, src, stride, height);
}
966
967
/* Third-pel MC, horizontal phase 1/3: dst ~ (2*cur + right)/3, in
 * fixed point as (683*(2a+b+1))>>11 (683/2048 ~= 1/3). */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683*(2*src[x] + src[x+1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
977
978
/* Third-pel MC, horizontal phase 2/3: dst ~ (cur + 2*right)/3, in
 * fixed point as (683*(a+2b+1))>>11. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683*(src[x] + 2*src[x+1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
988
    
989
/* Third-pel MC, vertical phase 1/3: dst ~ (2*cur + below)/3. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683*(2*src[x] + src[x+stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
999
    
1000
/* Third-pel MC, phase (1/3, 1/3): bilinear blend of the 2x2
 * neighbourhood with weights 4:3:3:2, scaled by 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731*(4*src[x] + 3*src[x+1] + 3*src[x+stride] + 2*src[x+stride+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1010
1011
/* Third-pel MC, phase (1/3, 2/3): 2x2 blend with weights 3:2:4:3,
 * scaled by 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731*(3*src[x] + 2*src[x+1] + 4*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1021
1022
/* Third-pel MC, vertical phase 2/3: dst ~ (cur + 2*below)/3. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683*(src[x] + 2*src[x+stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
1032
1033
/* Third-pel MC, phase (2/3, 1/3): 2x2 blend with weights 3:4:2:3,
 * scaled by 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731*(3*src[x] + 4*src[x+1] + 2*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1043
1044
/* Third-pel MC, phase (2/3, 2/3): 2x2 blend with weights 2:3:3:4,
 * scaled by 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731*(2*src[x] + 3*src[x+1] + 3*src[x+stride] + 4*src[x+stride+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1054 da3b9756 Mike Melanson
1055
/* Full-pel average: dispatch to the fixed-width avg_pixels helper. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if      (width ==  2) avg_pixels2_c (dst, src, stride, height);
    else if (width ==  4) avg_pixels4_c (dst, src, stride, height);
    else if (width ==  8) avg_pixels8_c (dst, src, stride, height);
    else if (width == 16) avg_pixels16_c(dst, src, stride, height);
}
1063
1064
/* Averaging third-pel MC, horizontal phase 1/3: rounding average of
 * the existing dst with the put-variant interpolation result. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p = (683*(2*src[x] + src[x+1] + 1)) >> 11;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1074
1075
/* Averaging third-pel MC, horizontal phase 2/3. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p = (683*(src[x] + 2*src[x+1] + 1)) >> 11;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1085
    
1086
/* Averaging third-pel MC, vertical phase 1/3. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p = (683*(2*src[x] + src[x+stride] + 1)) >> 11;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1096
    
1097
/* Averaging third-pel MC, phase (1/3, 1/3): 2x2 weights 4:3:3:2. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p = (2731*(4*src[x] + 3*src[x+1] + 3*src[x+stride] + 2*src[x+stride+1] + 6)) >> 15;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1107
1108
/* Averaging third-pel MC, phase (1/3, 2/3): 2x2 weights 3:2:4:3. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p = (2731*(3*src[x] + 2*src[x+1] + 4*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1118
1119
/* Averaging third-pel MC, vertical phase 2/3. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p = (683*(src[x] + 2*src[x+stride] + 1)) >> 11;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1129
1130
/* Averaging third-pel MC, phase (2/3, 1/3): 2x2 weights 3:4:2:3. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p = (2731*(3*src[x] + 4*src[x+1] + 2*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1140
1141
/* Averaging third-pel MC, phase (2/3, 2/3): 2x2 weights 2:3:3:4. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p = (2731*(2*src[x] + 3*src[x+1] + 3*src[x+stride] + 4*src[x+stride+1] + 6)) >> 15;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1151 669ac79c Michael Niedermayer
#if 0
1152
#define TPEL_WIDTH(width)\
1153
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1154
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1155
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1156
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1157
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1158
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1159
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1160
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1161
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1162
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1163
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1164
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1165
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1166
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1167
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1168
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1169
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1170
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1171
#endif
1172
1173 0da71265 Michael Niedermayer
/**
 * Generates H.264 chroma motion compensation functions for 2-, 4- and
 * 8-pixel-wide blocks.  Bilinear weights from the 1/8-pel fractional
 * position (x,y): A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy (sum 64).
 * The caller-supplied OP macro performs the >>6 normalization and the
 * put/avg store.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1235
1236
/* The chroma weights A+B+C+D always sum to 64, so results are
 * normalized with (+32)>>6; op_avg additionally takes a rounding
 * average with the existing destination pixel. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1243
1244
/* Copy a 4-pixel-wide, h-tall block using one 32-bit load/store per row. */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for (row = 0; row < h; row++) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
1254
1255
/* Copy an 8-pixel-wide, h-tall block using two 32-bit load/stores per row. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for (row = 0; row < h; row++) {
        ST32(dst    , LD32(src    ));
        ST32(dst + 4, LD32(src + 4));
        dst += dstStride;
        src += srcStride;
    }
}
1266
1267
/* Copy a 16-pixel-wide, h-tall block using four 32-bit load/stores per row. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for (row = 0; row < h; row++) {
        ST32(dst     , LD32(src     ));
        ST32(dst +  4, LD32(src +  4));
        ST32(dst +  8, LD32(src +  8));
        ST32(dst + 12, LD32(src + 12));
        dst += dstStride;
        src += srcStride;
    }
}
1280 073b013d Michael Niedermayer
1281 0c1a9eda Zdenek Kabelac
/* Copy a 17-pixel-wide, h-tall block (16+1 edge column, as needed by
 * the 16-wide qpel filters): four 32-bit copies plus one byte per row. */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for (row = 0; row < h; row++) {
        ST32(dst     , LD32(src     ));
        ST32(dst +  4, LD32(src +  4));
        ST32(dst +  8, LD32(src +  8));
        ST32(dst + 12, LD32(src + 12));
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}
1295
1296 0c1a9eda Zdenek Kabelac
/* Copy a 9-pixel-wide, h-tall block (8+1 edge column, as needed by the
 * 8-wide qpel filters): two 32-bit copies plus one byte per row. */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for (row = 0; row < h; row++) {
        ST32(dst    , LD32(src    ));
        ST32(dst + 4, LD32(src + 4));
        dst[8] = src[8];
        dst += dstStride;
        src += srcStride;
    }
}
1308
1309 826f429a Michael Niedermayer
1310 b3184779 Michael Niedermayer
#define QPEL_MC(r, OPNAME, RND, OP) \
1311 0c1a9eda Zdenek Kabelac
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1312
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1313 b3184779 Michael Niedermayer
    int i;\
1314
    for(i=0; i<h; i++)\
1315
    {\
1316
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1317
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1318
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1319
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1320
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1321
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1322
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1323
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1324
        dst+=dstStride;\
1325
        src+=srcStride;\
1326
    }\
1327 44eb4951 Michael Niedermayer
}\
1328
\
1329 0c1a9eda Zdenek Kabelac
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1330 db794953 Michael Niedermayer
    const int w=8;\
1331 0c1a9eda Zdenek Kabelac
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1332 b3184779 Michael Niedermayer
    int i;\
1333
    for(i=0; i<w; i++)\
1334
    {\
1335
        const int src0= src[0*srcStride];\
1336
        const int src1= src[1*srcStride];\
1337
        const int src2= src[2*srcStride];\
1338
        const int src3= src[3*srcStride];\
1339
        const int src4= src[4*srcStride];\
1340
        const int src5= src[5*srcStride];\
1341
        const int src6= src[6*srcStride];\
1342
        const int src7= src[7*srcStride];\
1343
        const int src8= src[8*srcStride];\
1344
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1345
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1346
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1347
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1348
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1349
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1350
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1351
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1352
        dst++;\
1353
        src++;\
1354
    }\
1355
}\
1356
\
1357 0c1a9eda Zdenek Kabelac
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1358
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1359 b3184779 Michael Niedermayer
    int i;\
1360 826f429a Michael Niedermayer
    \
1361 b3184779 Michael Niedermayer
    for(i=0; i<h; i++)\
1362
    {\
1363
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1364
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1365
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1366
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1367
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1368
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1369
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1370
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1371
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1372
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1373
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1374
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1375
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1376
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1377
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1378
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1379
        dst+=dstStride;\
1380
        src+=srcStride;\
1381
    }\
1382
}\
1383
\
1384 0c1a9eda Zdenek Kabelac
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1385
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1386 b3184779 Michael Niedermayer
    int i;\
1387 826f429a Michael Niedermayer
    const int w=16;\
1388 b3184779 Michael Niedermayer
    for(i=0; i<w; i++)\
1389
    {\
1390
        const int src0= src[0*srcStride];\
1391
        const int src1= src[1*srcStride];\
1392
        const int src2= src[2*srcStride];\
1393
        const int src3= src[3*srcStride];\
1394
        const int src4= src[4*srcStride];\
1395
        const int src5= src[5*srcStride];\
1396
        const int src6= src[6*srcStride];\
1397
        const int src7= src[7*srcStride];\
1398
        const int src8= src[8*srcStride];\
1399
        const int src9= src[9*srcStride];\
1400
        const int src10= src[10*srcStride];\
1401
        const int src11= src[11*srcStride];\
1402
        const int src12= src[12*srcStride];\
1403
        const int src13= src[13*srcStride];\
1404
        const int src14= src[14*srcStride];\
1405
        const int src15= src[15*srcStride];\
1406
        const int src16= src[16*srcStride];\
1407
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1408
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1409
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1410
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1411
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1412
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1413
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1414
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1415
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1416
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1417
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1418
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1419
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1420
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1421
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1422
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1423
        dst++;\
1424
        src++;\
1425
    }\
1426
}\
1427
\
1428 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1429 45553457 Zdenek Kabelac
    OPNAME ## pixels8_c(dst, src, stride, 8);\
1430 b3184779 Michael Niedermayer
}\
1431
\
1432 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1433
    uint8_t half[64];\
1434 b3184779 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1435
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1436 44eb4951 Michael Niedermayer
}\
1437
\
1438 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1439 b3184779 Michael Niedermayer
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1440 44eb4951 Michael Niedermayer
}\
1441
\
1442 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1443
    uint8_t half[64];\
1444 b3184779 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1445
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1446 44eb4951 Michael Niedermayer
}\
1447
\
1448 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1449
    uint8_t full[16*9];\
1450
    uint8_t half[64];\
1451 b3184779 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1452 db794953 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1453 b3184779 Michael Niedermayer
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1454 44eb4951 Michael Niedermayer
}\
1455
\
1456 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1457
    uint8_t full[16*9];\
1458 b3184779 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1459 db794953 Michael Niedermayer
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1460 44eb4951 Michael Niedermayer
}\
1461
\
1462 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1463
    uint8_t full[16*9];\
1464
    uint8_t half[64];\
1465 b3184779 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1466 db794953 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1467 b3184779 Michael Niedermayer
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1468 44eb4951 Michael Niedermayer
}\
1469 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1470
    uint8_t full[16*9];\
1471
    uint8_t halfH[72];\
1472
    uint8_t halfV[64];\
1473
    uint8_t halfHV[64];\
1474 b3184779 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1475
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1476 db794953 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1477
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1478 b3184779 Michael Niedermayer
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1479 44eb4951 Michael Niedermayer
}\
1480 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1481
    uint8_t full[16*9];\
1482
    uint8_t halfH[72];\
1483
    uint8_t halfHV[64];\
1484 db794953 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1485
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1486
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1487
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1488
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1489
}\
1490 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1491
    uint8_t full[16*9];\
1492
    uint8_t halfH[72];\
1493
    uint8_t halfV[64];\
1494
    uint8_t halfHV[64];\
1495 b3184779 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1496
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1497 db794953 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1498
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1499 b3184779 Michael Niedermayer
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1500 44eb4951 Michael Niedermayer
}\
1501 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1502
    uint8_t full[16*9];\
1503
    uint8_t halfH[72];\
1504
    uint8_t halfHV[64];\
1505 db794953 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1506
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1507
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1508
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1509
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1510
}\
1511 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1512
    uint8_t full[16*9];\
1513
    uint8_t halfH[72];\
1514
    uint8_t halfV[64];\
1515
    uint8_t halfHV[64];\
1516 b3184779 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1517
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1518 db794953 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1519
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1520 b3184779 Michael Niedermayer
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1521 44eb4951 Michael Niedermayer
}\
1522 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1523
    uint8_t full[16*9];\
1524
    uint8_t halfH[72];\
1525
    uint8_t halfHV[64];\
1526 db794953 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1527
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1528
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1529
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1530
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1531
}\
1532 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1533
    uint8_t full[16*9];\
1534
    uint8_t halfH[72];\
1535
    uint8_t halfV[64];\
1536
    uint8_t halfHV[64];\
1537 b3184779 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1538
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1539 db794953 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1540
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1541 b3184779 Michael Niedermayer
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1542 44eb4951 Michael Niedermayer
}\
1543 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1544
    uint8_t full[16*9];\
1545
    uint8_t halfH[72];\
1546
    uint8_t halfHV[64];\
1547 db794953 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1548
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1549
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1550
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1551
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1552
}\
1553 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1554
    uint8_t halfH[72];\
1555
    uint8_t halfHV[64];\
1556 b3184779 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1557 db794953 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1558 b3184779 Michael Niedermayer
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1559 44eb4951 Michael Niedermayer
}\
1560 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1561
    uint8_t halfH[72];\
1562
    uint8_t halfHV[64];\
1563 b3184779 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1564 db794953 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1565 b3184779 Michael Niedermayer
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1566 44eb4951 Michael Niedermayer
}\
1567 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1568
    uint8_t full[16*9];\
1569
    uint8_t halfH[72];\
1570
    uint8_t halfV[64];\
1571
    uint8_t halfHV[64];\
1572 b3184779 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1573
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1574 db794953 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1575
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1576 b3184779 Michael Niedermayer
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1577 44eb4951 Michael Niedermayer
}\
1578 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1579
    uint8_t full[16*9];\
1580
    uint8_t halfH[72];\
1581 db794953 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1582
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1583
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1584
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1585
}\
1586 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1587
    uint8_t full[16*9];\
1588
    uint8_t halfH[72];\
1589
    uint8_t halfV[64];\
1590
    uint8_t halfHV[64];\
1591 b3184779 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1592
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1593 db794953 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1594
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1595 b3184779 Michael Niedermayer
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1596 44eb4951 Michael Niedermayer
}\
1597 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1598
    uint8_t full[16*9];\
1599
    uint8_t halfH[72];\
1600 db794953 Michael Niedermayer
    copy_block9(full, src, 16, stride, 9);\
1601
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1602
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1603
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1604
}\
1605 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1606
    uint8_t halfH[72];\
1607 b3184779 Michael Niedermayer
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1608 db794953 Michael Niedermayer
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1609 b3184779 Michael Niedermayer
}\
1610 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1611 45553457 Zdenek Kabelac
    OPNAME ## pixels16_c(dst, src, stride, 16);\
1612 b3184779 Michael Niedermayer
}\
1613
\
1614 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1615
    uint8_t half[256];\
1616 b3184779 Michael Niedermayer
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1617
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1618
}\
1619
\
1620 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1621 b3184779 Michael Niedermayer
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1622 44eb4951 Michael Niedermayer
}\
1623 b3184779 Michael Niedermayer
\
1624 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1625
    uint8_t half[256];\
1626 b3184779 Michael Niedermayer
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1627
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1628
}\
1629
\
1630 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1631
    uint8_t full[24*17];\
1632
    uint8_t half[256];\
1633 b3184779 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1634 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1635 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1636
}\
1637
\
1638 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1639
    uint8_t full[24*17];\
1640 b3184779 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1641 826f429a Michael Niedermayer
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1642 b3184779 Michael Niedermayer
}\
1643
\
1644 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1645
    uint8_t full[24*17];\
1646
    uint8_t half[256];\
1647 b3184779 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1648 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1649 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1650
}\
1651 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1652
    uint8_t full[24*17];\
1653
    uint8_t halfH[272];\
1654
    uint8_t halfV[256];\
1655
    uint8_t halfHV[256];\
1656 b3184779 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1657
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1658 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1659
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1660 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1661
}\
1662 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1663
    uint8_t full[24*17];\
1664
    uint8_t halfH[272];\
1665
    uint8_t halfHV[256];\
1666 db794953 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1667
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1668
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1669
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1670
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1671
}\
1672 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1673
    uint8_t full[24*17];\
1674
    uint8_t halfH[272];\
1675
    uint8_t halfV[256];\
1676
    uint8_t halfHV[256];\
1677 b3184779 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1678
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1679 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1680
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1681 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1682
}\
1683 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1684
    uint8_t full[24*17];\
1685
    uint8_t halfH[272];\
1686
    uint8_t halfHV[256];\
1687 db794953 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1688
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1689
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1690
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1691
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1692
}\
1693 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1694
    uint8_t full[24*17];\
1695
    uint8_t halfH[272];\
1696
    uint8_t halfV[256];\
1697
    uint8_t halfHV[256];\
1698 b3184779 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1699
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1700 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1701
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1702 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1703
}\
1704 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1705
    uint8_t full[24*17];\
1706
    uint8_t halfH[272];\
1707
    uint8_t halfHV[256];\
1708 db794953 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1709
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1710
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1711
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1712
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1713
}\
1714 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1715
    uint8_t full[24*17];\
1716
    uint8_t halfH[272];\
1717
    uint8_t halfV[256];\
1718
    uint8_t halfHV[256];\
1719 b3184779 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1720
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1721 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1722
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1723 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1724
}\
1725 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1726
    uint8_t full[24*17];\
1727
    uint8_t halfH[272];\
1728
    uint8_t halfHV[256];\
1729 db794953 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1730
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1731
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1732
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1733
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1734
}\
1735 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1736
    uint8_t halfH[272];\
1737
    uint8_t halfHV[256];\
1738 b3184779 Michael Niedermayer
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1739 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1740 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1741
}\
1742 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1743
    uint8_t halfH[272];\
1744
    uint8_t halfHV[256];\
1745 b3184779 Michael Niedermayer
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1746 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1747 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1748
}\
1749 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1750
    uint8_t full[24*17];\
1751
    uint8_t halfH[272];\
1752
    uint8_t halfV[256];\
1753
    uint8_t halfHV[256];\
1754 b3184779 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1755
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1756 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1757
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1758 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1759
}\
1760 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1761
    uint8_t full[24*17];\
1762
    uint8_t halfH[272];\
1763 db794953 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1764
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1765
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1766
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1767
}\
1768 0c1a9eda Zdenek Kabelac
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1769
    uint8_t full[24*17];\
1770
    uint8_t halfH[272];\
1771
    uint8_t halfV[256];\
1772
    uint8_t halfHV[256];\
1773 b3184779 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1774
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1775 826f429a Michael Niedermayer
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1776
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1777 b3184779 Michael Niedermayer
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1778
}\
1779 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1780
    uint8_t full[24*17];\
1781
    uint8_t halfH[272];\
1782 db794953 Michael Niedermayer
    copy_block17(full, src, 24, stride, 17);\
1783
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1784
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1785
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1786
}\
1787 0c1a9eda Zdenek Kabelac
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1788
    uint8_t halfH[272];\
1789 b3184779 Michael Niedermayer
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1790 826f429a Michael Niedermayer
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1791 45553457 Zdenek Kabelac
}
1792 44eb4951 Michael Niedermayer
1793 b3184779 Michael Niedermayer
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1794
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1795
#define op_put(a, b) a = cm[((b) + 16)>>5]
1796
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1797
1798
QPEL_MC(0, put_       , _       , op_put)
1799
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1800
QPEL_MC(0, avg_       , _       , op_avg)
1801
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
1802
#undef op_avg
1803
#undef op_avg_no_rnd
1804
#undef op_put
1805
#undef op_put_no_rnd
1806 44eb4951 Michael Niedermayer
1807 0da71265 Michael Niedermayer
#if 1
1808
/* H.264 six-tap luma interpolation filter (taps 1, -5, 20, 20, -5, 1).
 * H264_LOWPASS(OPNAME, OP, OP2) expands to horizontal (_h_), vertical
 * (_v_) and combined horizontal+vertical (_hv_) lowpass filters for
 * 4x4, 8x8 and 16x16 blocks.
 *
 * OP(dst, val) stores the result of a single 1-D pass; OP2 stores the
 * result of the second (vertical) pass in the _hv_ case, where the
 * int16_t tmp[] buffer holds unclipped, unnormalized intermediates from
 * the horizontal pass.  cm points into the clipping table.  The 16-wide
 * variants are built from two side-by-side / stacked 8-wide calls. */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    tmp += 8*tmpStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

/**
 * Generates the 16 quarter-pel motion compensation functions
 * OPNAME##h264_qpel##SIZE##_mcXY_c for one block size; X/Y is the
 * fractional position in quarter-pel units.  Half-pel planes are produced
 * with the 6-tap lowpass filters defined above, and quarter-pel positions
 * are obtained as the rounded average (the *_l2 helpers) of two
 * neighbouring integer/half-pel planes.  Vertical filtering needs 2 rows
 * above and 3 below the block, hence the SIZE*(SIZE+5) scratch buffers and
 * the copy_block of SIZE+5 rows starting at src - stride*2.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

2148
/* Output operators for the H.264 filter templates above.
 * op_*  clip a 16-bit one-dimensional intermediate:  (x + 16) >> 5
 * op2_* clip a 20-bit two-dimensional intermediate:  (x + 512) >> 10
 * The *_avg variants additionally average with the existing dst pixel
 * (rounded up), as used for bi-directional prediction. */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Instantiate the lowpass filters and all 16 qpel MC functions for each
 * combination of operation (put/avg) and block size (4/8/16). */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif
2168
2169 1457ab52 Michael Niedermayer
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2170
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
2171
    int i;
2172
2173
    for(i=0; i<h; i++){
2174
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2175
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2176
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2177
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2178
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2179
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2180
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2181
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2182
        dst+=dstStride;
2183
        src+=srcStride;        
2184
    }
2185
}
2186
2187
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2188
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
2189
    int i;
2190
2191
    for(i=0; i<w; i++){
2192
        const int src_1= src[ -srcStride];
2193
        const int src0 = src[0          ];
2194
        const int src1 = src[  srcStride];
2195
        const int src2 = src[2*srcStride];
2196
        const int src3 = src[3*srcStride];
2197
        const int src4 = src[4*srcStride];
2198
        const int src5 = src[5*srcStride];
2199
        const int src6 = src[6*srcStride];
2200
        const int src7 = src[7*srcStride];
2201
        const int src8 = src[8*srcStride];
2202
        const int src9 = src[9*srcStride];
2203
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2204
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2205
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2206
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2207
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2208
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2209
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2210
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2211
        src++;
2212
        dst++;
2213
    }
2214
}
2215
2216
/* mspel (0,0) position: plain 8x8 copy, no filtering. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2219
2220
/* mspel (1,0): rounded average of the source and the horizontally
 * half-pel filtered plane. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
2225
2226
/* mspel (2,0): horizontal half-pel filter straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2229
2230
/* mspel (3,0): rounded average of the right neighbour (src+1) and the
 * horizontally half-pel filtered plane. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
2235
2236
/* mspel (0,2): vertical half-pel filter straight into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2239
2240
/* mspel (1,2): average of the vertically filtered plane and the 2-D
 * (H then V) filtered plane.  halfH holds 11 rows (8 + 3 extra) because
 * the vertical filter needs one row above and two below its 8 outputs;
 * halfH+8 skips the first of those rows. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
2249
/* mspel (3,2): like mc12 but the vertical-only plane is taken one pixel
 * to the right (src+1). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
2258
/* mspel (2,2): full 2-D half-pel filter — horizontal pass over 11 rows,
 * then vertical pass over the middle 8. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2263
2264
2265 0c1a9eda Zdenek Kabelac
/* Sum of absolute differences (SAD) over a 16x16 block; both blocks use
 * the same line stride. */
static inline int pix_abs16x16_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2292
2293 0c1a9eda Zdenek Kabelac
/* 16x16 SAD against the horizontally half-pel interpolated reference
 * (reads 17 reference pixels per row). */
static int pix_abs16x16_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2320
2321 0c1a9eda Zdenek Kabelac
/* 16x16 SAD against the vertically half-pel interpolated reference
 * (averages each pixel with the one a line below). */
static int pix_abs16x16_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *pix3 = pix2 + line_size;
    int sum = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix3[x]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
2350
2351 0c1a9eda Zdenek Kabelac
/* 16x16 SAD against the 2-D half-pel interpolated reference (4-pixel
 * average; reads a 17x17 reference area). */
static int pix_abs16x16_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *pix3 = pix2 + line_size;
    int sum = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x+1], pix3[x], pix3[x+1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
2380
2381 0c1a9eda Zdenek Kabelac
/* Sum of absolute differences (SAD) over an 8x8 block. */
static inline int pix_abs8x8_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2400
2401 0c1a9eda Zdenek Kabelac
/* 8x8 SAD against the horizontally half-pel interpolated reference
 * (reads 9 reference pixels per row). */
static int pix_abs8x8_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2420
2421 0c1a9eda Zdenek Kabelac
/* 8x8 SAD against the vertically half-pel interpolated reference. */
static int pix_abs8x8_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *pix3 = pix2 + line_size;
    int sum = 0;
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix3[x]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
2442
2443 0c1a9eda Zdenek Kabelac
/* 8x8 SAD against the 2-D half-pel interpolated reference (4-pixel
 * average; reads a 9x9 reference area). */
static int pix_abs8x8_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *pix3 = pix2 + line_size;
    int sum = 0;
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x+1], pix3[x], pix3[x+1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
2464
2465 1457ab52 Michael Niedermayer
/* Adapter giving pix_abs16x16_c the comparison-function signature used by
 * the DSP/motion-estimation tables; the context pointer is unused. */
static int sad16x16_c(void *s, uint8_t *a, uint8_t *b, int stride){
    return pix_abs16x16_c(a,b,stride);
}
2468
2469
/* Adapter giving pix_abs8x8_c the comparison-function signature used by
 * the DSP/motion-estimation tables; the context pointer is unused. */
static int sad8x8_c(void *s, uint8_t *a, uint8_t *b, int stride){
    return pix_abs8x8_c(a,b,stride);
}
2472
2473 a9badb51 Michael Niedermayer
/**
2474
 * permutes an 8x8 block.
2475 2a5700de Michael Niedermayer
 * @param block the block which will be permuted according to the given permutation vector
2476 a9badb51 Michael Niedermayer
 * @param permutation the permutation vector
2477
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2478 2a5700de Michael Niedermayer
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not 
2479
 *                  (inverse) permutated to scantable order!
2480 a9badb51 Michael Niedermayer
 */
2481 0c1a9eda Zdenek Kabelac
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2482 d962f6fd Arpi
{
2483 7801d21d Michael Niedermayer
    int i;
2484 477ab036 Michael Niedermayer
    DCTELEM temp[64];
2485 7801d21d Michael Niedermayer
    
2486
    if(last<=0) return;
2487 9a7b310d Zdenek Kabelac
    //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2488 d962f6fd Arpi
2489 7801d21d Michael Niedermayer
    for(i=0; i<=last; i++){
2490
        const int j= scantable[i];
2491
        temp[j]= block[j];
2492
        block[j]=0;
2493
    }
2494
    
2495
    for(i=0; i<=last; i++){
2496
        const int j= scantable[i];
2497
        const int perm_j= permutation[j];
2498
        block[perm_j]= temp[j];
2499
    }
2500 d962f6fd Arpi
}
2501 e0eac44e Fabrice Bellard
2502 2a5700de Michael Niedermayer
/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    /* zero all six 64-coefficient blocks of a macroblock in one call */
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
2509
2510 11f18faf Michael Niedermayer
/* dst[i] += src[i] for i in [0, w); byte arithmetic wraps modulo 256.
 * The main loop processes 8 bytes per iteration, a tail loop handles the
 * remainder. */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i = 0;

    while (i + 7 < w) {
        int j;
        for (j = 0; j < 8; j++)
            dst[i + j] += src[i + j];
        i += 8;
    }
    while (i < w) {
        dst[i] += src[i];
        i++;
    }
}
2525
2526
/* dst[i] = src1[i] - src2[i] for i in [0, w); byte arithmetic wraps
 * modulo 256.  Main loop handles 8 bytes per iteration, tail loop the
 * rest. */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i = 0;

    while (i + 7 < w) {
        int j;
        for (j = 0; j < 8; j++)
            dst[i + j] = src1[i + j] - src2[i + j];
        i += 8;
    }
    while (i < w) {
        dst[i] = src1[i] - src2[i];
        i++;
    }
}
2541
2542 84705403 Michael Niedermayer
/* HuffYUV median-predictor residual: for each position, predict from the
 * median of (left, above, left+above-topleft) and store src2 - prediction.
 * src1 is the line above, src2 the current line; *left and *left_top carry
 * the running state across calls and are updated on return. */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    int prev = *left;      /* pixel to the left on the current line */
    int diag = *left_top;  /* pixel above-left (top-left of current) */

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(prev, src1[i], (prev + src1[i] - diag) & 0xFF);
        diag = src1[i];
        prev = src2[i];
        dst[i] = prev - pred;
    }

    *left     = prev;
    *left_top = diag;
}
2559
2560 1457ab52 Michael Niedermayer
/* Butterfly helpers for the 8x8 Hadamard transforms below.
 * BUTTERFLY2 writes sum/difference of two inputs to two outputs,
 * BUTTERFLY1 does the same in place, and
 * BUTTERFLYA performs the final stage fused with absolute-value
 * accumulation: |x+y| + |x-y|. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2574
2575
/* SATD: sum of absolute values of the 8x8 Hadamard transform of the
 * difference between src and dst.  First loop: per-row load of the pixel
 * differences plus the three horizontal butterfly stages.  Second loop:
 * per-column vertical stages, with the last stage folded into the
 * BUTTERFLYA absolute-value accumulation. */
static int hadamard8_diff_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride){
    int i;
    int temp[64];
    int sum=0;

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        /* stage 1: pairwise sums/differences of the row's pixel diffs */
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
        
        /* stage 2 (distance 2) */
        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
        
        /* stage 3 (distance 4) */
        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        /* vertical stages 1 and 2 on column i */
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
        
        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        /* vertical stage 3 fused with |.| accumulation */
        sum += 
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
2624
2625
/* Like hadamard8_diff_c but transforms (src - mean) instead of a
 * difference of two blocks: sum of absolute Hadamard coefficients of the
 * mean-removed 8x8 block. */
static int hadamard8_abs_c(uint8_t *src, int stride, int mean){
    int i;
    int temp[64];
    int sum=0;
//FIXME OOOPS ignore 0 term instead of mean mess
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        /* horizontal butterfly stages on the mean-removed row */
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-mean,src[stride*i+1]-mean);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-mean,src[stride*i+3]-mean);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-mean,src[stride*i+5]-mean);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-mean,src[stride*i+7]-mean);
        
        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
        
        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        /* vertical stages; last one fused with |.| accumulation */
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
        
        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
    
        sum += 
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    
    return sum;
}
2668
2669
/* DCT-domain SAD: forward-transforms the 8x8 pixel difference and sums
 * the absolute values of all 64 coefficients. */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* uint64_t backing guarantees 8-byte alignment for the DCT buffer */
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum+= ABS(temp[i]);
        
    return sum;
}
2683
2684 0e15384d Michael Niedermayer
void simple_idct(DCTELEM *block); //FIXME

/* Quantization-distortion metric: takes the 8x8 pixel difference, runs it
 * through quantize -> dequantize -> IDCT at the current qscale, and
 * returns the squared error against the original difference block. */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* one aligned backing buffer split into working (temp) and reference
     * (bak) halves of 64 coefficients each */
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    s->mb_intra=0;  /* NOTE(review): forces inter quantization for the measurement — confirm intended */
    
    s->dsp.diff_pixels(temp, src1, src2, stride);
    
    /* keep an untouched copy of the difference block for the comparison */
    memcpy(bak, temp, 64*sizeof(DCTELEM));
    
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize(s, temp, 0, s->qscale);
    simple_idct(temp); //FIXME 
    
    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
        
    return sum;
}
2708
2709 3a87ac94 Michael Niedermayer
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2710
    MpegEncContext * const s= (MpegEncContext *)c;
2711 0c1a9eda Zdenek Kabelac
    const uint8_t *scantable= s->intra_scantable.permutated;
2712 76fbb024 Michael Niedermayer
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2713
    uint64_t __align8 aligned_bak[stride];
2714
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
2715
    uint8_t * const bak= (uint8_t*)aligned_bak;
2716 3a87ac94 Michael Niedermayer
    int i, last, run, bits, level, distoration, start_i;
2717
    const int esc_length= s->ac_esc_length;
2718
    uint8_t * length;
2719
    uint8_t * last_length;
2720 67725183 Michael Niedermayer
    
2721
    for(i=0; i<8; i++){
2722
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
2723
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
2724
    }
2725 3a87ac94 Michael Niedermayer
2726 67725183 Michael Niedermayer
    s->dsp.diff_pixels(temp, src1, src2, stride);
2727
2728
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2729
2730
    bits=0;
2731 3a87ac94 Michael Niedermayer
    
2732
    if (s->mb_intra) {
2733 67725183 Michael Niedermayer
        start_i = 1;