Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 58c2182d

History | View | Annotate | Download (121 KB)

1 de6d9b64 Fabrice Bellard
/*
2
 * DSP utils
3 ff4ec49e Fabrice Bellard
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 8f2ab833 Michael Niedermayer
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 de6d9b64 Fabrice Bellard
 *
6 ff4ec49e Fabrice Bellard
 * This library is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2 of the License, or (at your option) any later version.
10 de6d9b64 Fabrice Bellard
 *
11 ff4ec49e Fabrice Bellard
 * This library is distributed in the hope that it will be useful,
12 de6d9b64 Fabrice Bellard
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ff4ec49e Fabrice Bellard
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15 de6d9b64 Fabrice Bellard
 *
16 ff4ec49e Fabrice Bellard
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with this library; if not, write to the Free Software
18
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19 7ff037e9 Michael Niedermayer
 *
20 59fe111e Michael Niedermayer
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
21 de6d9b64 Fabrice Bellard
 */
22 983e3246 Michael Niedermayer
 
23
/**
24
 * @file dsputil.c
25
 * DSP utils
26
 */
27
 
28 de6d9b64 Fabrice Bellard
#include "avcodec.h"
29
#include "dsputil.h"
30 1457ab52 Michael Niedermayer
#include "mpegvideo.h"
31 b0368839 Michael Niedermayer
#include "simple_idct.h"
32 65e4c8c9 Michael Niedermayer
#include "faandct.h"
33 5596c60c Michael Niedermayer
34 0c1a9eda Zdenek Kabelac
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
35
uint32_t squareTbl[512];
36 de6d9b64 Fabrice Bellard
37 0c1a9eda Zdenek Kabelac
/* Standard (progressive) zigzag scan order: maps scan position to the
   raster index of the coefficient inside an 8x8 block. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63,
};
47
48 10acc479 Roman Shaposhnik
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
60
61 2f349de2 Michael Niedermayer
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
62 0c1a9eda Zdenek Kabelac
uint16_t __align8 inv_zigzag_direct16[64];
63 2f349de2 Michael Niedermayer
64 0c1a9eda Zdenek Kabelac
/* Alternate horizontal scan order for an 8x8 coefficient block. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
74
75 0c1a9eda Zdenek Kabelac
/* Alternate vertical scan order for an 8x8 coefficient block. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
85
86 2f349de2 Michael Niedermayer
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
   (entry b is ceil(2^32/b); entries 0 and 1 are special-cased) */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
121
122 b0368839 Michael Niedermayer
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
133
134 0c1a9eda Zdenek Kabelac
static int pix_sum_c(uint8_t * pix, int line_size)
{
    /* Sum of all pixel values of a 16x16 block.
     *
     * pix       : top-left pixel of the block
     * line_size : byte stride between successive rows
     */
    int sum = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sum += pix[col];
        pix += line_size;
    }
    return sum;
}
155
156 0c1a9eda Zdenek Kabelac
static int pix_norm1_c(uint8_t * pix, int line_size)
157 3aa102be Michael Niedermayer
{
158
    int s, i, j;
159 0c1a9eda Zdenek Kabelac
    uint32_t *sq = squareTbl + 256;
160 3aa102be Michael Niedermayer
161
    s = 0;
162
    for (i = 0; i < 16; i++) {
163
        for (j = 0; j < 16; j += 8) {
164 2a006cd3 Felix von Leitner
#if 0
165 3aa102be Michael Niedermayer
            s += sq[pix[0]];
166
            s += sq[pix[1]];
167
            s += sq[pix[2]];
168
            s += sq[pix[3]];
169
            s += sq[pix[4]];
170
            s += sq[pix[5]];
171
            s += sq[pix[6]];
172
            s += sq[pix[7]];
173 2a006cd3 Felix von Leitner
#else
174
#if LONG_MAX > 2147483647
175
            register uint64_t x=*(uint64_t*)pix;
176
            s += sq[x&0xff];
177
            s += sq[(x>>8)&0xff];
178
            s += sq[(x>>16)&0xff];
179
            s += sq[(x>>24)&0xff];
180
            s += sq[(x>>32)&0xff];
181
            s += sq[(x>>40)&0xff];
182
            s += sq[(x>>48)&0xff];
183
            s += sq[(x>>56)&0xff];
184
#else
185
            register uint32_t x=*(uint32_t*)pix;
186
            s += sq[x&0xff];
187
            s += sq[(x>>8)&0xff];
188
            s += sq[(x>>16)&0xff];
189
            s += sq[(x>>24)&0xff];
190
            x=*(uint32_t*)(pix+4);
191
            s += sq[x&0xff];
192
            s += sq[(x>>8)&0xff];
193
            s += sq[(x>>16)&0xff];
194
            s += sq[(x>>24)&0xff];
195
#endif
196
#endif
197 3aa102be Michael Niedermayer
            pix += 8;
198
        }
199
        pix += line_size - 16;
200
    }
201
    return s;
202
}
203
204 3d2e8cce Michael Niedermayer
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    /* Byte-swap w 32-bit words from src into dst.
       Main loop processes 8 words per iteration; the tail loop
       handles the remaining w%8 words. */
    int n = 0;

    while (n + 8 <= w) {
        dst[n    ] = bswap_32(src[n    ]);
        dst[n + 1] = bswap_32(src[n + 1]);
        dst[n + 2] = bswap_32(src[n + 2]);
        dst[n + 3] = bswap_32(src[n + 3]);
        dst[n + 4] = bswap_32(src[n + 4]);
        dst[n + 5] = bswap_32(src[n + 5]);
        dst[n + 6] = bswap_32(src[n + 6]);
        dst[n + 7] = bswap_32(src[n + 7]);
        n += 8;
    }
    while (n < w) {
        dst[n] = bswap_32(src[n]);
        n++;
    }
}
221 3aa102be Michael Niedermayer
222 bb198e19 Michael Niedermayer
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
223 1457ab52 Michael Niedermayer
{
224
    int s, i;
225 0c1a9eda Zdenek Kabelac
    uint32_t *sq = squareTbl + 256;
226 1457ab52 Michael Niedermayer
227
    s = 0;
228 bb198e19 Michael Niedermayer
    for (i = 0; i < h; i++) {
229 1457ab52 Michael Niedermayer
        s += sq[pix1[0] - pix2[0]];
230
        s += sq[pix1[1] - pix2[1]];
231
        s += sq[pix1[2] - pix2[2]];
232
        s += sq[pix1[3] - pix2[3]];
233
        s += sq[pix1[4] - pix2[4]];
234
        s += sq[pix1[5] - pix2[5]];
235
        s += sq[pix1[6] - pix2[6]];
236
        s += sq[pix1[7] - pix2[7]];
237
        pix1 += line_size;
238
        pix2 += line_size;
239
    }
240
    return s;
241
}
242
243 bb198e19 Michael Niedermayer
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
244 9c76bd48 Brian Foley
{
245 6b026927 Falk Hüffner
    int s, i;
246
    uint32_t *sq = squareTbl + 256;
247 9c76bd48 Brian Foley
248
    s = 0;
249 bb198e19 Michael Niedermayer
    for (i = 0; i < h; i++) {
250 6b026927 Falk Hüffner
        s += sq[pix1[ 0] - pix2[ 0]];
251
        s += sq[pix1[ 1] - pix2[ 1]];
252
        s += sq[pix1[ 2] - pix2[ 2]];
253
        s += sq[pix1[ 3] - pix2[ 3]];
254
        s += sq[pix1[ 4] - pix2[ 4]];
255
        s += sq[pix1[ 5] - pix2[ 5]];
256
        s += sq[pix1[ 6] - pix2[ 6]];
257
        s += sq[pix1[ 7] - pix2[ 7]];
258
        s += sq[pix1[ 8] - pix2[ 8]];
259
        s += sq[pix1[ 9] - pix2[ 9]];
260
        s += sq[pix1[10] - pix2[10]];
261
        s += sq[pix1[11] - pix2[11]];
262
        s += sq[pix1[12] - pix2[12]];
263
        s += sq[pix1[13] - pix2[13]];
264
        s += sq[pix1[14] - pix2[14]];
265
        s += sq[pix1[15] - pix2[15]];
266 2a006cd3 Felix von Leitner
267 6b026927 Falk Hüffner
        pix1 += line_size;
268
        pix2 += line_size;
269 9c76bd48 Brian Foley
    }
270
    return s;
271
}
272
273 0c1a9eda Zdenek Kabelac
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    /* Copy an 8x8 block of pixels into a DCT coefficient block
     * (widening each byte to DCTELEM).
     *
     * block     : destination, 64 contiguous DCTELEMs (row-major)
     * pixels    : source top-left pixel
     * line_size : byte stride between source rows
     */
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block  += 8;
    }
}
291
292 0c1a9eda Zdenek Kabelac
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    /* Store the 8x8 per-pixel difference s1 - s2 into a DCT
     * coefficient block.
     *
     * block  : destination, 64 contiguous DCTELEMs (row-major)
     * s1, s2 : source blocks, both read with the same stride
     * stride : byte stride between source rows
     */
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
311
312
313 0c1a9eda Zdenek Kabelac
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
314 eb4b3dd3 Zdenek Kabelac
                                 int line_size)
315 de6d9b64 Fabrice Bellard
{
316
    int i;
317 0c1a9eda Zdenek Kabelac
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
318 de6d9b64 Fabrice Bellard
    
319
    /* read the pixels */
320
    for(i=0;i<8;i++) {
321 c13e1abd Falk Hüffner
        pixels[0] = cm[block[0]];
322
        pixels[1] = cm[block[1]];
323
        pixels[2] = cm[block[2]];
324
        pixels[3] = cm[block[3]];
325
        pixels[4] = cm[block[4]];
326
        pixels[5] = cm[block[5]];
327
        pixels[6] = cm[block[6]];
328
        pixels[7] = cm[block[7]];
329
330
        pixels += line_size;
331
        block += 8;
332 de6d9b64 Fabrice Bellard
    }
333
}
334
335 0c1a9eda Zdenek Kabelac
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
336 c13e1abd Falk Hüffner
                          int line_size)
337 de6d9b64 Fabrice Bellard
{
338
    int i;
339 0c1a9eda Zdenek Kabelac
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
340 de6d9b64 Fabrice Bellard
    
341
    /* read the pixels */
342
    for(i=0;i<8;i++) {
343 c13e1abd Falk Hüffner
        pixels[0] = cm[pixels[0] + block[0]];
344
        pixels[1] = cm[pixels[1] + block[1]];
345
        pixels[2] = cm[pixels[2] + block[2]];
346
        pixels[3] = cm[pixels[3] + block[3]];
347
        pixels[4] = cm[pixels[4] + block[4]];
348
        pixels[5] = cm[pixels[5] + block[5]];
349
        pixels[6] = cm[pixels[6] + block[6]];
350
        pixels[7] = cm[pixels[7] + block[7]];
351
        pixels += line_size;
352
        block += 8;
353 de6d9b64 Fabrice Bellard
    }
354
}
355 59fe111e Michael Niedermayer
#if 0
356

357
#define PIXOP2(OPNAME, OP) \
358 b3184779 Michael Niedermayer
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
359 59fe111e Michael Niedermayer
{\
360
    int i;\
361
    for(i=0; i<h; i++){\
362
        OP(*((uint64_t*)block), LD64(pixels));\
363
        pixels+=line_size;\
364
        block +=line_size;\
365
    }\
366
}\
367
\
368 45553457 Zdenek Kabelac
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
369 59fe111e Michael Niedermayer
{\
370
    int i;\
371
    for(i=0; i<h; i++){\
372
        const uint64_t a= LD64(pixels  );\
373
        const uint64_t b= LD64(pixels+1);\
374
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
375
        pixels+=line_size;\
376
        block +=line_size;\
377
    }\
378
}\
379
\
380 45553457 Zdenek Kabelac
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
381 59fe111e Michael Niedermayer
{\
382
    int i;\
383
    for(i=0; i<h; i++){\
384
        const uint64_t a= LD64(pixels  );\
385
        const uint64_t b= LD64(pixels+1);\
386
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
387
        pixels+=line_size;\
388
        block +=line_size;\
389
    }\
390
}\
391
\
392 45553457 Zdenek Kabelac
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
393 59fe111e Michael Niedermayer
{\
394
    int i;\
395
    for(i=0; i<h; i++){\
396
        const uint64_t a= LD64(pixels          );\
397
        const uint64_t b= LD64(pixels+line_size);\
398
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
399
        pixels+=line_size;\
400
        block +=line_size;\
401
    }\
402
}\
403
\
404 45553457 Zdenek Kabelac
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
405 59fe111e Michael Niedermayer
{\
406
    int i;\
407
    for(i=0; i<h; i++){\
408
        const uint64_t a= LD64(pixels          );\
409
        const uint64_t b= LD64(pixels+line_size);\
410
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
411
        pixels+=line_size;\
412
        block +=line_size;\
413
    }\
414
}\
415
\
416 45553457 Zdenek Kabelac
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
417 59fe111e Michael Niedermayer
{\
418
        int i;\
419
        const uint64_t a= LD64(pixels  );\
420
        const uint64_t b= LD64(pixels+1);\
421
        uint64_t l0=  (a&0x0303030303030303ULL)\
422
                    + (b&0x0303030303030303ULL)\
423
                    + 0x0202020202020202ULL;\
424
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
425
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
426
        uint64_t l1,h1;\
427
\
428
        pixels+=line_size;\
429
        for(i=0; i<h; i+=2){\
430
            uint64_t a= LD64(pixels  );\
431
            uint64_t b= LD64(pixels+1);\
432
            l1=  (a&0x0303030303030303ULL)\
433
               + (b&0x0303030303030303ULL);\
434
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
435
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
436
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
437
            pixels+=line_size;\
438
            block +=line_size;\
439
            a= LD64(pixels  );\
440
            b= LD64(pixels+1);\
441
            l0=  (a&0x0303030303030303ULL)\
442
               + (b&0x0303030303030303ULL)\
443
               + 0x0202020202020202ULL;\
444
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
445
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
446
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
447
            pixels+=line_size;\
448
            block +=line_size;\
449
        }\
450
}\
451
\
452 45553457 Zdenek Kabelac
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
453 59fe111e Michael Niedermayer
{\
454
        int i;\
455
        const uint64_t a= LD64(pixels  );\
456
        const uint64_t b= LD64(pixels+1);\
457
        uint64_t l0=  (a&0x0303030303030303ULL)\
458
                    + (b&0x0303030303030303ULL)\
459
                    + 0x0101010101010101ULL;\
460
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
461
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
462
        uint64_t l1,h1;\
463
\
464
        pixels+=line_size;\
465
        for(i=0; i<h; i+=2){\
466
            uint64_t a= LD64(pixels  );\
467
            uint64_t b= LD64(pixels+1);\
468
            l1=  (a&0x0303030303030303ULL)\
469
               + (b&0x0303030303030303ULL);\
470
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
471
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
472
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
473
            pixels+=line_size;\
474
            block +=line_size;\
475
            a= LD64(pixels  );\
476
            b= LD64(pixels+1);\
477
            l0=  (a&0x0303030303030303ULL)\
478
               + (b&0x0303030303030303ULL)\
479
               + 0x0101010101010101ULL;\
480
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
481
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
482
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
483
            pixels+=line_size;\
484
            block +=line_size;\
485
        }\
486
}\
487
\
488 45553457 Zdenek Kabelac
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
489
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
490
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
491
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
492
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
493
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
494
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
495 59fe111e Michael Niedermayer

496
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
497
#else // 64 bit variant
498
499
#define PIXOP2(OPNAME, OP) \
500 669ac79c Michael Niedermayer
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
501
    int i;\
502
    for(i=0; i<h; i++){\
503
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
504
        pixels+=line_size;\
505
        block +=line_size;\
506
    }\
507
}\
508 0da71265 Michael Niedermayer
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
509
    int i;\
510
    for(i=0; i<h; i++){\
511
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
512
        pixels+=line_size;\
513
        block +=line_size;\
514
    }\
515
}\
516 45553457 Zdenek Kabelac
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
517 59fe111e Michael Niedermayer
    int i;\
518
    for(i=0; i<h; i++){\
519
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
520
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
521
        pixels+=line_size;\
522
        block +=line_size;\
523
    }\
524
}\
525 45553457 Zdenek Kabelac
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
526
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
527 b3184779 Michael Niedermayer
}\
528 59fe111e Michael Niedermayer
\
529 b3184779 Michael Niedermayer
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
530
                                                int src_stride1, int src_stride2, int h){\
531 59fe111e Michael Niedermayer
    int i;\
532
    for(i=0; i<h; i++){\
533 b3184779 Michael Niedermayer
        uint32_t a,b;\
534
        a= LD32(&src1[i*src_stride1  ]);\
535
        b= LD32(&src2[i*src_stride2  ]);\
536 d8085ea7 Michael Niedermayer
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
537 b3184779 Michael Niedermayer
        a= LD32(&src1[i*src_stride1+4]);\
538
        b= LD32(&src2[i*src_stride2+4]);\
539 d8085ea7 Michael Niedermayer
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
540 59fe111e Michael Niedermayer
    }\
541
}\
542
\
543 b3184779 Michael Niedermayer
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
544
                                                int src_stride1, int src_stride2, int h){\
545 59fe111e Michael Niedermayer
    int i;\
546
    for(i=0; i<h; i++){\
547 b3184779 Michael Niedermayer
        uint32_t a,b;\
548
        a= LD32(&src1[i*src_stride1  ]);\
549
        b= LD32(&src2[i*src_stride2  ]);\
550 d8085ea7 Michael Niedermayer
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
551 b3184779 Michael Niedermayer
        a= LD32(&src1[i*src_stride1+4]);\
552
        b= LD32(&src2[i*src_stride2+4]);\
553 d8085ea7 Michael Niedermayer
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
554 59fe111e Michael Niedermayer
    }\
555
}\
556
\
557 0da71265 Michael Niedermayer
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
558
                                                int src_stride1, int src_stride2, int h){\
559
    int i;\
560
    for(i=0; i<h; i++){\
561
        uint32_t a,b;\
562
        a= LD32(&src1[i*src_stride1  ]);\
563
        b= LD32(&src2[i*src_stride2  ]);\
564 d8085ea7 Michael Niedermayer
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
565 0da71265 Michael Niedermayer
    }\
566
}\
567
\
568 669ac79c Michael Niedermayer
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
569
                                                int src_stride1, int src_stride2, int h){\
570
    int i;\
571
    for(i=0; i<h; i++){\
572
        uint32_t a,b;\
573
        a= LD16(&src1[i*src_stride1  ]);\
574
        b= LD16(&src2[i*src_stride2  ]);\
575
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
576
    }\
577
}\
578
\
579 b3184779 Michael Niedermayer
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
580
                                                int src_stride1, int src_stride2, int h){\
581
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
582
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
583
}\
584
\
585
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
586
                                                int src_stride1, int src_stride2, int h){\
587
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
588
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
589
}\
590
\
591 45553457 Zdenek Kabelac
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
592 b3184779 Michael Niedermayer
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
593
}\
594
\
595 45553457 Zdenek Kabelac
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
596 b3184779 Michael Niedermayer
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
597
}\
598
\
599 45553457 Zdenek Kabelac
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
600 b3184779 Michael Niedermayer
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
601
}\
602
\
603 45553457 Zdenek Kabelac
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
604 b3184779 Michael Niedermayer
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
605
}\
606
\
607
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
608
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
609 59fe111e Michael Niedermayer
    int i;\
610
    for(i=0; i<h; i++){\
611 b3184779 Michael Niedermayer
        uint32_t a, b, c, d, l0, l1, h0, h1;\
612
        a= LD32(&src1[i*src_stride1]);\
613
        b= LD32(&src2[i*src_stride2]);\
614
        c= LD32(&src3[i*src_stride3]);\
615
        d= LD32(&src4[i*src_stride4]);\
616
        l0=  (a&0x03030303UL)\
617
           + (b&0x03030303UL)\
618
           + 0x02020202UL;\
619
        h0= ((a&0xFCFCFCFCUL)>>2)\
620
          + ((b&0xFCFCFCFCUL)>>2);\
621
        l1=  (c&0x03030303UL)\
622
           + (d&0x03030303UL);\
623
        h1= ((c&0xFCFCFCFCUL)>>2)\
624
          + ((d&0xFCFCFCFCUL)>>2);\
625
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
626
        a= LD32(&src1[i*src_stride1+4]);\
627
        b= LD32(&src2[i*src_stride2+4]);\
628
        c= LD32(&src3[i*src_stride3+4]);\
629
        d= LD32(&src4[i*src_stride4+4]);\
630
        l0=  (a&0x03030303UL)\
631
           + (b&0x03030303UL)\
632
           + 0x02020202UL;\
633
        h0= ((a&0xFCFCFCFCUL)>>2)\
634
          + ((b&0xFCFCFCFCUL)>>2);\
635
        l1=  (c&0x03030303UL)\
636
           + (d&0x03030303UL);\
637
        h1= ((c&0xFCFCFCFCUL)>>2)\
638
          + ((d&0xFCFCFCFCUL)>>2);\
639
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
640 59fe111e Michael Niedermayer
    }\
641
}\
642 669ac79c Michael Niedermayer
\
643
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
644
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
645
}\
646
\
647
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
648
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
649
}\
650
\
651
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
652
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
653
}\
654
\
655
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
656
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
657
}\
658
\
659 b3184779 Michael Niedermayer
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
660
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
661 59fe111e Michael Niedermayer
    int i;\
662
    for(i=0; i<h; i++){\
663 b3184779 Michael Niedermayer
        uint32_t a, b, c, d, l0, l1, h0, h1;\
664
        a= LD32(&src1[i*src_stride1]);\
665
        b= LD32(&src2[i*src_stride2]);\
666
        c= LD32(&src3[i*src_stride3]);\
667
        d= LD32(&src4[i*src_stride4]);\
668
        l0=  (a&0x03030303UL)\
669
           + (b&0x03030303UL)\
670
           + 0x01010101UL;\
671
        h0= ((a&0xFCFCFCFCUL)>>2)\
672
          + ((b&0xFCFCFCFCUL)>>2);\
673
        l1=  (c&0x03030303UL)\
674
           + (d&0x03030303UL);\
675
        h1= ((c&0xFCFCFCFCUL)>>2)\
676
          + ((d&0xFCFCFCFCUL)>>2);\
677
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
678
        a= LD32(&src1[i*src_stride1+4]);\
679
        b= LD32(&src2[i*src_stride2+4]);\
680
        c= LD32(&src3[i*src_stride3+4]);\
681
        d= LD32(&src4[i*src_stride4+4]);\
682
        l0=  (a&0x03030303UL)\
683
           + (b&0x03030303UL)\
684
           + 0x01010101UL;\
685
        h0= ((a&0xFCFCFCFCUL)>>2)\
686
          + ((b&0xFCFCFCFCUL)>>2);\
687
        l1=  (c&0x03030303UL)\
688
           + (d&0x03030303UL);\
689
        h1= ((c&0xFCFCFCFCUL)>>2)\
690
          + ((d&0xFCFCFCFCUL)>>2);\
691
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
692 59fe111e Michael Niedermayer
    }\
693
}\
694 b3184779 Michael Niedermayer
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
695
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
696
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
697
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
698
}\
699
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
700
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
701
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
702
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
703
}\
704 59fe111e Michael Niedermayer
\
705 669ac79c Michael Niedermayer
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
706
{\
707
        int i, a0, b0, a1, b1;\
708
        a0= pixels[0];\
709
        b0= pixels[1] + 2;\
710
        a0 += b0;\
711
        b0 += pixels[2];\
712
\
713
        pixels+=line_size;\
714
        for(i=0; i<h; i+=2){\
715
            a1= pixels[0];\
716
            b1= pixels[1];\
717
            a1 += b1;\
718
            b1 += pixels[2];\
719
\
720
            block[0]= (a1+a0)>>2; /* FIXME non put */\
721
            block[1]= (b1+b0)>>2;\
722
\
723
            pixels+=line_size;\
724
            block +=line_size;\
725
\
726
            a0= pixels[0];\
727
            b0= pixels[1] + 2;\
728
            a0 += b0;\
729
            b0 += pixels[2];\
730
\
731
            block[0]= (a1+a0)>>2;\
732
            block[1]= (b1+b0)>>2;\
733
            pixels+=line_size;\
734
            block +=line_size;\
735
        }\
736
}\
737
\
738
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
739
{\
740
        int i;\
741
        const uint32_t a= LD32(pixels  );\
742
        const uint32_t b= LD32(pixels+1);\
743
        uint32_t l0=  (a&0x03030303UL)\
744
                    + (b&0x03030303UL)\
745
                    + 0x02020202UL;\
746
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
747
                   + ((b&0xFCFCFCFCUL)>>2);\
748
        uint32_t l1,h1;\
749
\
750
        pixels+=line_size;\
751
        for(i=0; i<h; i+=2){\
752
            uint32_t a= LD32(pixels  );\
753
            uint32_t b= LD32(pixels+1);\
754
            l1=  (a&0x03030303UL)\
755
               + (b&0x03030303UL);\
756
            h1= ((a&0xFCFCFCFCUL)>>2)\
757
              + ((b&0xFCFCFCFCUL)>>2);\
758
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
759
            pixels+=line_size;\
760
            block +=line_size;\
761
            a= LD32(pixels  );\
762
            b= LD32(pixels+1);\
763
            l0=  (a&0x03030303UL)\
764
               + (b&0x03030303UL)\
765
               + 0x02020202UL;\
766
            h0= ((a&0xFCFCFCFCUL)>>2)\
767
              + ((b&0xFCFCFCFCUL)>>2);\
768
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
769
            pixels+=line_size;\
770
            block +=line_size;\
771
        }\
772
}\
773
\
774 45553457 Zdenek Kabelac
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
775 59fe111e Michael Niedermayer
{\
776
    int j;\
777
    for(j=0; j<2; j++){\
778
        int i;\
779
        const uint32_t a= LD32(pixels  );\
780
        const uint32_t b= LD32(pixels+1);\
781
        uint32_t l0=  (a&0x03030303UL)\
782
                    + (b&0x03030303UL)\
783
                    + 0x02020202UL;\
784
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
785
                   + ((b&0xFCFCFCFCUL)>>2);\
786
        uint32_t l1,h1;\
787
\
788
        pixels+=line_size;\
789
        for(i=0; i<h; i+=2){\
790
            uint32_t a= LD32(pixels  );\
791
            uint32_t b= LD32(pixels+1);\
792
            l1=  (a&0x03030303UL)\
793
               + (b&0x03030303UL);\
794
            h1= ((a&0xFCFCFCFCUL)>>2)\
795
              + ((b&0xFCFCFCFCUL)>>2);\
796
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
797
            pixels+=line_size;\
798
            block +=line_size;\
799
            a= LD32(pixels  );\
800
            b= LD32(pixels+1);\
801
            l0=  (a&0x03030303UL)\
802
               + (b&0x03030303UL)\
803
               + 0x02020202UL;\
804
            h0= ((a&0xFCFCFCFCUL)>>2)\
805
              + ((b&0xFCFCFCFCUL)>>2);\
806
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
807
            pixels+=line_size;\
808
            block +=line_size;\
809
        }\
810
        pixels+=4-line_size*(h+1);\
811
        block +=4-line_size*h;\
812
    }\
813
}\
814
\
815 45553457 Zdenek Kabelac
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
816 59fe111e Michael Niedermayer
{\
817
    int j;\
818
    for(j=0; j<2; j++){\
819
        int i;\
820
        const uint32_t a= LD32(pixels  );\
821
        const uint32_t b= LD32(pixels+1);\
822
        uint32_t l0=  (a&0x03030303UL)\
823
                    + (b&0x03030303UL)\
824
                    + 0x01010101UL;\
825
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
826
                   + ((b&0xFCFCFCFCUL)>>2);\
827
        uint32_t l1,h1;\
828
\
829
        pixels+=line_size;\
830
        for(i=0; i<h; i+=2){\
831
            uint32_t a= LD32(pixels  );\
832
            uint32_t b= LD32(pixels+1);\
833
            l1=  (a&0x03030303UL)\
834
               + (b&0x03030303UL);\
835
            h1= ((a&0xFCFCFCFCUL)>>2)\
836
              + ((b&0xFCFCFCFCUL)>>2);\
837
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
838
            pixels+=line_size;\
839
            block +=line_size;\
840
            a= LD32(pixels  );\
841
            b= LD32(pixels+1);\
842
            l0=  (a&0x03030303UL)\
843
               + (b&0x03030303UL)\
844
               + 0x01010101UL;\
845
            h0= ((a&0xFCFCFCFCUL)>>2)\
846
              + ((b&0xFCFCFCFCUL)>>2);\
847
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
848
            pixels+=line_size;\
849
            block +=line_size;\
850
        }\
851
        pixels+=4-line_size*(h+1);\
852
        block +=4-line_size*h;\
853
    }\
854
}\
855
\
856 45553457 Zdenek Kabelac
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
857
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
858
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
859
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
860
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
861
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
862
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
863
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
864 b3184779 Michael Niedermayer
865 d8085ea7 Michael Niedermayer
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif /* NOTE(review): closes an #if that begins above this chunk -- confirm against full file */
#define op_put(a, b) a = b

/* Instantiate the PIXOP2 function family twice: the "avg" variants fold the
 * computed value into the destination with rnd_avg32, the "put" variants
 * overwrite it.  The op_* helper macros are undefined immediately after so
 * the names can be reused further down. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

/* Rounding 2- and 4-sample averages used by the qpel/gmc helpers below. */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
877 c0a0170c Michael Niedermayer
/* Thin adapter: 16-pixel-wide no-rounding two-source average using one common
 * stride for dst and both sources (the l2 core takes three separate strides). */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
880
881
/* Thin adapter: 8-pixel-wide no-rounding two-source average using one common
 * stride for dst and both sources (the l2 core takes three separate strides). */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
884 073b013d Michael Niedermayer
885 0c1a9eda Zdenek Kabelac
/**
 * One-warp-point global motion compensation (MPEG-4 GMC) over an
 * 8-pixel-wide block of h rows.  x16/y16 are 1/16-pel fractional offsets;
 * each output pixel is the bilinear blend of the four surrounding source
 * pixels with weights A..D (which sum to 256), rounded via `rounder` and
 * scaled back with >>8.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int row;

    for (row = 0; row < h; row++) {
        int col;
        /* same weighted sum the original computed with an unrolled row */
        for (col = 0; col < 8; col++) {
            dst[col] = (A * src[col]
                      + B * src[col + 1]
                      + C * src[stride + col]
                      + D * src[stride + col + 1]
                      + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
907
908 0c1a9eda Zdenek Kabelac
/**
 * Full affine global motion compensation (MPEG-4 GMC with several warp
 * points) for an 8-column block of h rows.
 *
 * The source position is tracked in 16.16 fixed point (vx/vy); dxx/dyx
 * advance it per output column, dxy/dyy per output row.  Each position is
 * split into an integer sample coordinate and a fraction with denominator
 * s = 1<<shift, then bilinearly interpolated; `r` is the rounding constant
 * added before the final >>(2*shift).
 *
 * width/height are decremented up front, so `(unsigned)src_x < width`
 * simultaneously rejects negative coordinates (via the unsigned wrap) and
 * guarantees src_x+1 is still inside the source; coordinates that fail the
 * test are clamped to the border with clip() and the corresponding
 * interpolation axis collapses to a single tap.
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, 
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;
    
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;
  
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2-D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* vertically outside: clamp y, interpolate horizontally only */
                    index= src_x + clip(src_y, 0, height)*stride;                    
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x) 
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: clamp x, interpolate vertically only */
                    index= clip(src_x, 0, width) + src_y*stride;                    
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y) 
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* outside on both axes: nearest clamped sample, no filtering */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;                    
                    dst[y*stride + x]=    src[index         ];
                }
            }
            
            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
965 669ac79c Michael Niedermayer
966
/* Integer-position (no interpolation) copy: dispatch to the plain
 * put_pixels helper matching the requested block width.  Unsupported
 * widths are silently ignored, exactly as the original switch did. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
974
975
/* Third-pel horizontal interpolation, phase 1/3:
 * dst = round((2*a + b)/3) using the fixed-point constant 683/2048. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (683 * (2 * s[col] + s[col + 1] + 1)) >> 11;
    }
}
985
986
/* Third-pel horizontal interpolation, phase 2/3:
 * dst = round((a + 2*b)/3) using the fixed-point constant 683/2048. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (683 * (s[col] + 2 * s[col + 1] + 1)) >> 11;
    }
}
996
    
997
/* Third-pel vertical interpolation, phase 1/3:
 * dst = round((2*top + bottom)/3) using the fixed-point constant 683/2048. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (683 * (2 * s[col] + s[col + stride] + 1)) >> 11;
    }
}
1007
    
1008
/* Third-pel diagonal interpolation at (1/3, 1/3):
 * dst = round((4a + 3b + 3c + 2d)/12) using the constant 2731/32768. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (2731 * (4 * s[col] + 3 * s[col + 1]
                            + 3 * s[col + stride] + 2 * s[col + stride + 1] + 6)) >> 15;
    }
}
1018
1019
/* Third-pel diagonal interpolation at (1/3, 2/3):
 * dst = round((3a + 2b + 4c + 3d)/12) using the constant 2731/32768. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (2731 * (3 * s[col] + 2 * s[col + 1]
                            + 4 * s[col + stride] + 3 * s[col + stride + 1] + 6)) >> 15;
    }
}
1029
1030
/* Third-pel vertical interpolation, phase 2/3:
 * dst = round((top + 2*bottom)/3) using the fixed-point constant 683/2048. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (683 * (s[col] + 2 * s[col + stride] + 1)) >> 11;
    }
}
1040
1041
/* Third-pel diagonal interpolation at (2/3, 1/3):
 * dst = round((3a + 4b + 2c + 3d)/12) using the constant 2731/32768. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (2731 * (3 * s[col] + 4 * s[col + 1]
                            + 2 * s[col + stride] + 3 * s[col + stride + 1] + 6)) >> 15;
    }
}
1051
1052
/* Third-pel diagonal interpolation at (2/3, 2/3):
 * dst = round((2a + 3b + 3c + 4d)/12) using the constant 2731/32768. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (2731 * (2 * s[col] + 3 * s[col + 1]
                            + 3 * s[col + stride] + 4 * s[col + stride + 1] + 6)) >> 15;
    }
}
1062 da3b9756 Mike Melanson
1063
/* Integer-position averaging copy: dispatch to the avg_pixels helper
 * matching the requested block width.  Unsupported widths are silently
 * ignored, exactly as the original switch did. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1071
1072
/* Averaging variant of mc10: interpolate at horizontal phase 1/3, then
 * take the rounding average with the existing destination pixel. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((683 * (2 * s[col] + s[col + 1] + 1)) >> 11) + 1) >> 1;
    }
}
1082
1083
/* Averaging variant of mc20: interpolate at horizontal phase 2/3, then
 * take the rounding average with the existing destination pixel. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((683 * (s[col] + 2 * s[col + 1] + 1)) >> 11) + 1) >> 1;
    }
}
1093
    
1094
/* Averaging variant of mc01: interpolate at vertical phase 1/3, then
 * take the rounding average with the existing destination pixel. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((683 * (2 * s[col] + s[col + stride] + 1)) >> 11) + 1) >> 1;
    }
}
1104
    
1105
/* Averaging variant of mc11: bilinear third-pel at (1/3, 1/3), then
 * rounding average with the existing destination pixel. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((2731 * (4 * s[col] + 3 * s[col + 1]
                                       + 3 * s[col + stride] + 2 * s[col + stride + 1] + 6)) >> 15) + 1) >> 1;
    }
}
1115
1116
/* Averaging variant of mc12: bilinear third-pel at (1/3, 2/3), then
 * rounding average with the existing destination pixel. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((2731 * (3 * s[col] + 2 * s[col + 1]
                                       + 4 * s[col + stride] + 3 * s[col + stride + 1] + 6)) >> 15) + 1) >> 1;
    }
}
1126
1127
/* Averaging variant of mc02: interpolate at vertical phase 2/3, then
 * take the rounding average with the existing destination pixel. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((683 * (s[col] + 2 * s[col + stride] + 1)) >> 11) + 1) >> 1;
    }
}
1137
1138
/* Averaging variant of mc21: bilinear third-pel at (2/3, 1/3), then
 * rounding average with the existing destination pixel. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((2731 * (3 * s[col] + 4 * s[col + 1]
                                       + 2 * s[col + stride] + 3 * s[col + stride + 1] + 6)) >> 15) + 1) >> 1;
    }
}
1148
1149
/* Averaging variant of mc22: bilinear third-pel at (2/3, 2/3), then
 * rounding average with the existing destination pixel. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((2731 * (2 * s[col] + 3 * s[col + 1]
                                       + 3 * s[col + stride] + 4 * s[col + stride + 1] + 6)) >> 15) + 1) >> 1;
    }
}
1159 669ac79c Michael Niedermayer
#if 0
1160
#define TPEL_WIDTH(width)\
1161
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1162
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1163
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1164
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1165
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1166
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1167
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1168
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1169
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1170
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1171
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1172
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1173
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1174
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1175
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1176
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1177
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1178
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1179
#endif
1180
1181 0da71265 Michael Niedermayer
/**
 * H.264 chroma motion compensation, 2/4/8-pixel-wide variants.
 * x and y are the 1/8-pel fractional offsets (0..7, enforced by the
 * assert); A..D are the bilinear weights of the four neighbouring source
 * pixels and sum to 64.  OP performs the final rounding/shift (and, for
 * the avg flavour, the blend with the destination) -- see the op_put /
 * op_avg definitions that follow the macro.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1243
1244
/* Chroma MC rounding: the 6-bit weighted sum is rounded with +32 and shifted
 * down by 6; op_avg additionally takes the rounding average with the pixel
 * already in the destination.  The macros are undefined right after the two
 * instantiations so the names stay local. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1251
1252
/* Copy a 4-pixel-wide column of h rows using the 32-bit load/store macros. */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h--) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
1262
1263
/* Copy an 8-pixel-wide column of h rows as two 32-bit words per row. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h--) {
        ST32(dst,   LD32(src));
        ST32(dst+4, LD32(src+4));
        dst += dstStride;
        src += srcStride;
    }
}
1274
1275
/* Copy a 16-pixel-wide column of h rows as four 32-bit words per row. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h--) {
        ST32(dst,    LD32(src));
        ST32(dst+4,  LD32(src+4));
        ST32(dst+8,  LD32(src+8));
        ST32(dst+12, LD32(src+12));
        dst += dstStride;
        src += srcStride;
    }
}
1288 073b013d Michael Niedermayer
1289 0c1a9eda Zdenek Kabelac
/* Copy a 17-pixel-wide column of h rows: four 32-bit words plus one
 * trailing byte per row (used for qpel edge-extended sources). */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h--) {
        ST32(dst,    LD32(src));
        ST32(dst+4,  LD32(src+4));
        ST32(dst+8,  LD32(src+8));
        ST32(dst+12, LD32(src+12));
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}
1303
1304 0c1a9eda Zdenek Kabelac
/* Copy a 9-pixel-wide column of h rows: two 32-bit words plus one
 * trailing byte per row (used for qpel edge-extended sources). */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h--) {
        ST32(dst,   LD32(src));
        ST32(dst+4, LD32(src+4));
        dst[8] = src[8];
        dst += dstStride;
        src += srcStride;
    }
}
1316
1317 826f429a Michael Niedermayer
1318 b3184779 Michael Niedermayer
#define QPEL_MC(r, OPNAME, RND, OP) \
1319 0c1a9eda Zdenek Kabelac
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1320
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1321 b3184779 Michael Niedermayer
    int i;\
1322
    for(i=0; i<h; i++)\
1323
    {\
1324
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1325
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1326
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1327
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1328
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1329
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1330
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1331
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1332
        dst+=dstStride;\
1333
        src+=srcStride;\
1334
    }\
1335 44eb4951 Michael Niedermayer
}\
1336
\
1337 0c1a9eda Zdenek Kabelac
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1338 db794953 Michael Niedermayer
    const int w=8;\
1339 0c1a9eda Zdenek Kabelac
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1340 b3184779 Michael Niedermayer
    int i;\
1341
    for(i=0; i<w; i++)\
1342
    {\
1343
        const int src0= src[0*srcStride];\
1344
        const int src1= src[1*srcStride];\
1345
        const int src2= src[2*srcStride];\
1346
        const int src3= src[3*srcStride];\
1347
        const int src4= src[4*srcStride];\
1348
        const int src5= src[5*srcStride];\
1349
        const int src6= src[6*srcStride];\
1350
        const int src7= src[7*srcStride];\
1351
        const int src8= src[8*srcStride];\
1352
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1353
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1354
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1355
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1356
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1357
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1358
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1359
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1360
        dst++;\
1361
        src++;\
1362
    }\
1363
}\
1364
\
1365 0c1a9eda Zdenek Kabelac
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1366
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1367 b3184779 Michael Niedermayer
    int i;\
1368 826f429a Michael Niedermayer
    \
1369 b3184779 Michael Niedermayer
    for(i=0; i<h; i++)\
1370
    {\
1371
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1372
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1373
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1374
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1375
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1376
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1377
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1378
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1379
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1380
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1381
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1382
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1383
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1384
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1385
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1386
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1387
        dst+=dstStride;\
1388
        src+=srcStride;\
1389
    }\
1390
}\
1391
\
1392 0c1a9eda Zdenek Kabelac
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    /* Vertical lowpass filter for a 16-pixel-wide block: for each of the 16    */\
    /* columns, reads a 17-sample source column (src[0..16*srcStride]) and      */\
    /* writes 16 output rows through OP using the (20,-6,3,-1) tap pattern:     */\
    /*   (a+b)*20 - (c+d)*6 + (e+f)*3 - (g+h).                                  */\
    /* OP is the QPEL_MC store operation (put/avg, rounded or not, clamped      */\
    /* through cm).                                                             */\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        /* load one full 17-sample column into locals */\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        /* bottom rows reuse mirrored samples (src16, src15, src14) instead of  */\
        /* reading past the 17 available source rows, matching the edge         */\
        /* mirroring used by the horizontal filter above.                       */\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
1435
\
1436 0c1a9eda Zdenek Kabelac
/* qpel8_mcXY: 8x8 quarter-pel motion compensation entry points. The XY in the */\
/* name is conventionally the (x,y) quarter-sample offset; each function       */\
/* builds the needed half-pel intermediates with the put-variant lowpass       */\
/* filters, then combines/stores through the OPNAME (put/avg) pixel ops.      */\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    /* integer position: plain 8x8 copy/average */\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    /* horizontal half-pel into half, then average with the unshifted source */\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    /* horizontal half-pel written directly to dst */\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    /* like mc10 but averaged with the source shifted one pixel right */\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    /* copy 9 source rows to a 16-wide scratch, vertical half-pel into half, */\
    /* then average with the top-aligned copy                                */\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    /* vertical half-pel written directly to dst */\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    /* like mc01 but averaged with the copy shifted one row down (full+16) */\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
1477 0c1a9eda Zdenek Kabelac
/* Diagonal quarter-pel positions. The exported ff_*_old_c variants average   */\
/* four planes (source, halfH, halfV, halfHV) with pixels8_l4; the static     */\
/* current variants instead fold the source into halfH first (pixels8_l2)     */\
/* and then need only a two-plane average at the end.                         */\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    /* halfH := avg(halfH, full) before the vertical pass */\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    /* right-side diagonal: vertical pass starts one pixel right (full+1) */\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    /* bottom diagonal: source and halfH taken one row down */\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    /* bottom-right diagonal: full+17 = one row down and one pixel right */\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
1561 0c1a9eda Zdenek Kabelac
/* Half-pel-centered columns/rows of the 8x8 quarter-pel grid. No copy_block9 */\
/* is needed where only the horizontal filter touches the source directly.   */\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    /* avg of horizontal half-pel and full HV half-pel */\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    /* like mc21 but halfH taken one row down (halfH+8) */\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    /* fold source into halfH, then run the vertical filter straight to dst */\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    /* right column: vertical pass starts at full+1 */\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    /* center: horizontal then vertical half-pel, written directly to dst */\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
1618 0c1a9eda Zdenek Kabelac
/* 16x16 versions of the quarter-pel entry points; identical structure to the */\
/* 8x8 ones but with a 24-wide, 17-row scratch block and 256-byte halves.     */\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    /* integer position: plain 16x16 copy/average */\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    /* horizontal half-pel, averaged with the unshifted source */\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    /* horizontal half-pel written directly to dst */\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    /* like mc10 but averaged with the source shifted one pixel right */\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    /* 17 rows copied to a 24-wide scratch, vertical half-pel, averaged with  */\
    /* the top-aligned copy                                                   */\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    /* vertical half-pel written directly to dst */\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    /* like mc01 but averaged with the copy shifted one row down (full+24) */\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
1659 0c1a9eda Zdenek Kabelac
/* 16x16 diagonal quarter-pel positions; same old-vs-current split as the     */\
/* 8x8 variants (four-plane pixels16_l4 vs folded halfH + two-plane average). */\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    /* halfH := avg(halfH, full) before the vertical pass */\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    /* right-side diagonal: vertical pass starts one pixel right (full+1) */\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    /* bottom diagonal: source and halfH taken one row down */\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    /* bottom-right diagonal: full+25 = one row down and one pixel right */\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
1743 0c1a9eda Zdenek Kabelac
/* 16x16 half-pel-centered columns/rows; last group of the QPEL_MC macro body. */\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    /* avg of horizontal half-pel and full HV half-pel */\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    /* like mc21 but halfH taken one row down (halfH+16) */\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    /* fold source into halfH, then run the vertical filter straight to dst */\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    /* right column: vertical pass starts at full+1 */\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    /* center: horizontal then vertical half-pel, written directly to dst */\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
1800 44eb4951 Michael Niedermayer
1801 b3184779 Michael Niedermayer
/* Store operations passed to QPEL_MC as OP:                                  */
/*  - op_put*:  clamp the filtered sum through cm (cropTbl) after >>5;        */
/*    "+16" rounds to nearest, "+15" is the no-rounding variant.              */
/*  - op_avg*:  same clamp, then average with the existing dst pixel          */
/*    ("+1" rounds the average; the no_rnd variant does not).                 */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Instantiate the whole qpel function family for put, put_no_rnd and avg.    */
/* The avg_no_rnd variant is intentionally not generated (commented out).     */
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
1814 44eb4951 Michael Niedermayer
1815 0da71265 Michael Niedermayer
#if 1
1816
#define H264_LOWPASS(OPNAME, OP, OP2) \
1817
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1818
    const int h=4;\
1819
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1820
    int i;\
1821
    for(i=0; i<h; i++)\
1822
    {\
1823
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
1824
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
1825
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
1826
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
1827
        dst+=dstStride;\
1828
        src+=srcStride;\
1829
    }\
1830
}\
1831
\
1832
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1833
    const int w=4;\
1834
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1835
    int i;\
1836
    for(i=0; i<w; i++)\
1837
    {\
1838
        const int srcB= src[-2*srcStride];\
1839
        const int srcA= src[-1*srcStride];\
1840
        const int src0= src[0 *srcStride];\
1841
        const int src1= src[1 *srcStride];\
1842
        const int src2= src[2 *srcStride];\
1843
        const int src3= src[3 *srcStride];\
1844
        const int src4= src[4 *srcStride];\
1845
        const int src5= src[5 *srcStride];\
1846
        const int src6= src[6 *srcStride];\
1847
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1848
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1849
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1850
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1851
        dst++;\
1852
        src++;\
1853
    }\
1854
}\
1855
\
1856
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1857
    const int h=4;\
1858
    const int w=4;\
1859
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1860
    int i;\
1861
    src -= 2*srcStride;\
1862
    for(i=0; i<h+5; i++)\
1863
    {\
1864
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
1865
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
1866
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
1867
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
1868
        tmp+=tmpStride;\
1869
        src+=srcStride;\
1870
    }\
1871
    tmp -= tmpStride*(h+5-2);\
1872
    for(i=0; i<w; i++)\
1873
    {\
1874
        const int tmpB= tmp[-2*tmpStride];\
1875
        const int tmpA= tmp[-1*tmpStride];\
1876
        const int tmp0= tmp[0 *tmpStride];\
1877
        const int tmp1= tmp[1 *tmpStride];\
1878
        const int tmp2= tmp[2 *tmpStride];\
1879
        const int tmp3= tmp[3 *tmpStride];\
1880
        const int tmp4= tmp[4 *tmpStride];\
1881
        const int tmp5= tmp[5 *tmpStride];\
1882
        const int tmp6= tmp[6 *tmpStride];\
1883
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1884
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1885
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1886
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
1887
        dst++;\
1888
        tmp++;\
1889
    }\
1890
}\
1891
\
1892
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1893
    const int h=8;\
1894
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1895
    int i;\
1896
    for(i=0; i<h; i++)\
1897
    {\
1898
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
1899
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
1900
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
1901
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
1902
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
1903
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
1904
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
1905
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
1906
        dst+=dstStride;\
1907
        src+=srcStride;\
1908
    }\
1909
}\
1910
\
1911
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1912
    const int w=8;\
1913
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1914
    int i;\
1915
    for(i=0; i<w; i++)\
1916
    {\
1917
        const int srcB= src[-2*srcStride];\
1918
        const int srcA= src[-1*srcStride];\
1919
        const int src0= src[0 *srcStride];\
1920
        const int src1= src[1 *srcStride];\
1921
        const int src2= src[2 *srcStride];\
1922
        const int src3= src[3 *srcStride];\
1923
        const int src4= src[4 *srcStride];\
1924
        const int src5= src[5 *srcStride];\
1925
        const int src6= src[6 *srcStride];\
1926
        const int src7= src[7 *srcStride];\
1927
        const int src8= src[8 *srcStride];\
1928
        const int src9= src[9 *srcStride];\
1929
        const int src10=src[10*srcStride];\
1930
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1931
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1932
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1933
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1934
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
1935
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
1936
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
1937
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
1938
        dst++;\
1939
        src++;\
1940
    }\
1941
}\
1942
\
1943
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1944
    const int h=8;\
1945
    const int w=8;\
1946
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1947
    int i;\
1948
    src -= 2*srcStride;\
1949
    for(i=0; i<h+5; i++)\
1950
    {\
1951
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
1952
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
1953
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
1954
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
1955
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
1956
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
1957
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
1958
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
1959
        tmp+=tmpStride;\
1960
        src+=srcStride;\
1961
    }\
1962
    tmp -= tmpStride*(h+5-2);\
1963
    for(i=0; i<w; i++)\
1964
    {\
1965
        const int tmpB= tmp[-2*tmpStride];\
1966
        const int tmpA= tmp[-1*tmpStride];\
1967
        const int tmp0= tmp[0 *tmpStride];\
1968
        const int tmp1= tmp[1 *tmpStride];\
1969
        const int tmp2= tmp[2 *tmpStride];\
1970
        const int tmp3= tmp[3 *tmpStride];\
1971
        const int tmp4= tmp[4 *tmpStride];\
1972
        const int tmp5= tmp[5 *tmpStride];\
1973
        const int tmp6= tmp[6 *tmpStride];\
1974
        const int tmp7= tmp[7 *tmpStride];\
1975
        const int tmp8= tmp[8 *tmpStride];\
1976
        const int tmp9= tmp[9 *tmpStride];\
1977
        const int tmp10=tmp[10*tmpStride];\
1978
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1979
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1980
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1981
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
1982
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
1983
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
1984
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
1985
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
1986
        dst++;\
1987
        tmp++;\
1988
    }\
1989
}\
1990
\
1991
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1992
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
1993
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
1994
    src += 8*srcStride;\
1995
    dst += 8*dstStride;\
1996
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
1997
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
1998
}\
1999
\
2000
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2001
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2002
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2003
    src += 8*srcStride;\
2004
    dst += 8*dstStride;\
2005
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2006
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2007
}\
2008
\
2009
/* 16x16 combined h+v lowpass built from four 8x8 quadrant calls */\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    tmp += 8*tmpStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

/*
 * Template generating the 16 H.264 quarter-pel motion compensation functions
 * (_mcXY_c, X/Y = quarter-pel fraction 0..3) for one block SIZE.  Each is
 * built from the h/v/hv lowpass filters above plus pixel averaging (_l2).
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2157
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2158
#define op_put(a, b)  a = cm[((b) + 16)>>5]
2159
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2160
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
2161
2162
H264_LOWPASS(put_       , op_put, op2_put)
2163
H264_LOWPASS(avg_       , op_avg, op2_avg)
2164
H264_MC(put_, 4)
2165
H264_MC(put_, 8)
2166
H264_MC(put_, 16)
2167
H264_MC(avg_, 4)
2168
H264_MC(avg_, 8)
2169
H264_MC(avg_, 16)
2170
2171
#undef op_avg
2172
#undef op_put
2173
#undef op2_avg
2174
#undef op2_put
2175
#endif
2176
2177 1457ab52 Michael Niedermayer
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2178
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
2179
    int i;
2180
2181
    for(i=0; i<h; i++){
2182
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2183
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2184
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2185
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2186
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2187
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2188
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2189
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2190
        dst+=dstStride;
2191
        src+=srcStride;        
2192
    }
2193
}
2194
2195
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2196
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
2197
    int i;
2198
2199
    for(i=0; i<w; i++){
2200
        const int src_1= src[ -srcStride];
2201
        const int src0 = src[0          ];
2202
        const int src1 = src[  srcStride];
2203
        const int src2 = src[2*srcStride];
2204
        const int src3 = src[3*srcStride];
2205
        const int src4 = src[4*srcStride];
2206
        const int src5 = src[5*srcStride];
2207
        const int src6 = src[6*srcStride];
2208
        const int src7 = src[7*srcStride];
2209
        const int src8 = src[8*srcStride];
2210
        const int src9 = src[9*srcStride];
2211
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2212
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2213
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2214
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2215
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2216
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2217
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2218
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2219
        src++;
2220
        dst++;
2221
    }
2222
}
2223
2224
/** Fullpel (0,0) WMV2 MC: plain 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

/** Quarter-pel (1,0): average of the source and the horizontal halfpel filter. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

/** Halfpel (2,0): horizontal lowpass straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

/** Quarter-pel (3,0): average of src+1 and the horizontal halfpel filter. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

/** Halfpel (0,2): vertical lowpass straight into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

/** (1,2): average of the vertical halfpel and the h+v halfpel result. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];   /* 8x11: horizontal filter of 11 rows starting one above */
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/** (3,2): like mc12 but the vertical halfpel is taken one pixel to the right. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];   /* 8x11: horizontal filter of 11 rows starting one above */
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/** Halfpel (2,2): horizontal then vertical lowpass. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];   /* 8x11 intermediate from the horizontal pass */
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}

static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2273
    int x;
2274
    const int strength= ff_h263_loop_filter_strength[qscale];
2275
    
2276
    for(x=0; x<8; x++){
2277
        int d1, d2, ad1;
2278
        int p0= src[x-2*stride];
2279
        int p1= src[x-1*stride];
2280
        int p2= src[x+0*stride];
2281
        int p3= src[x+1*stride];
2282
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2283
2284
        if     (d<-2*strength) d1= 0;
2285
        else if(d<-  strength) d1=-2*strength - d;
2286
        else if(d<   strength) d1= d;
2287
        else if(d< 2*strength) d1= 2*strength - d;
2288
        else                   d1= 0;
2289
        
2290
        p1 += d1;
2291
        p2 -= d1;
2292
        if(p1&256) p1= ~(p1>>31);
2293
        if(p2&256) p2= ~(p2>>31);
2294
        
2295
        src[x-1*stride] = p1;
2296
        src[x+0*stride] = p2;
2297
2298 5b5404e3 Michael Niedermayer
        ad1= ABS(d1)>>1;
2299 332f9ac4 Michael Niedermayer
        
2300
        d2= clip((p0-p3)/4, -ad1, ad1);
2301
        
2302
        src[x-2*stride] = p0 - d2;
2303
        src[x+  stride] = p3 + d2;
2304
    }
2305
}
2306
2307
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2308
    int y;
2309
    const int strength= ff_h263_loop_filter_strength[qscale];
2310
    
2311
    for(y=0; y<8; y++){
2312
        int d1, d2, ad1;
2313
        int p0= src[y*stride-2];
2314
        int p1= src[y*stride-1];
2315
        int p2= src[y*stride+0];
2316
        int p3= src[y*stride+1];
2317
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2318
2319
        if     (d<-2*strength) d1= 0;
2320
        else if(d<-  strength) d1=-2*strength - d;
2321
        else if(d<   strength) d1= d;
2322
        else if(d< 2*strength) d1= 2*strength - d;
2323
        else                   d1= 0;
2324
        
2325
        p1 += d1;
2326
        p2 -= d1;
2327
        if(p1&256) p1= ~(p1>>31);
2328
        if(p2&256) p2= ~(p2>>31);
2329
        
2330
        src[y*stride-1] = p1;
2331
        src[y*stride+0] = p2;
2332
2333
        ad1= ABS(d1)>>1;
2334
        
2335
        d2= clip((p0-p3)/4, -ad1, ad1);
2336
        
2337
        src[y*stride-2] = p0 - d2;
2338
        src[y*stride+1] = p3 + d2;
2339
    }
2340
}
2341 1457ab52 Michael Niedermayer
2342 bb198e19 Michael Niedermayer
/**
 * Sum of absolute differences over a 16-wide block of height h.
 * @param v unused context pointer (me_cmp_func signature)
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

/**
 * SAD of a 16-wide block against the horizontal halfpel (avg2 of each pixel
 * and its right neighbour) of pix2.
 * @param v unused context pointer (me_cmp_func signature)
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

/**
 * SAD of a 16-wide block against the vertical halfpel (avg2 of each pixel
 * and the pixel one line below) of pix2.
 * @param v unused context pointer (me_cmp_func signature)
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

/**
 * SAD of a 16-wide block against the diagonal halfpel (avg4 of the 2x2
 * neighbourhood) of pix2.
 * @param v unused context pointer (me_cmp_func signature)
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

/**
 * Sum of absolute differences over an 8-wide block of height h.
 * @param v unused context pointer (me_cmp_func signature)
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

/**
 * SAD of an 8-wide block against the horizontal halfpel (avg2) of pix2.
 * @param v unused context pointer (me_cmp_func signature)
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

/**
 * SAD of an 8-wide block against the vertical halfpel (avg2) of pix2.
 * @param v unused context pointer (me_cmp_func signature)
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

/**
 * SAD of an 8-wide block against the diagonal halfpel (avg4) of pix2.
 * @param v unused context pointer (me_cmp_func signature)
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2543
    int i;
2544
    unsigned int sum=0;
2545
2546
    for(i=0; i<8*8; i++){
2547
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
2548
        int w= weight[i];
2549
        b>>= RECON_SHIFT;
2550
        assert(-512<b && b<512);
2551
2552
        sum += (w*b)*(w*b)>>4;
2553
    }
2554
    return sum>>2;
2555
}
2556
2557
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
2558
    int i;
2559
2560
    for(i=0; i<8*8; i++){
2561
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2562
    }    
2563
}
2564
2565 a9badb51 Michael Niedermayer
/**
2566
 * permutes an 8x8 block.
2567 2a5700de Michael Niedermayer
 * @param block the block which will be permuted according to the given permutation vector
2568 a9badb51 Michael Niedermayer
 * @param permutation the permutation vector
2569
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2570 2a5700de Michael Niedermayer
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not 
2571
 *                  (inverse) permutated to scantable order!
2572 a9badb51 Michael Niedermayer
 */
2573 0c1a9eda Zdenek Kabelac
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2574 d962f6fd Arpi
{
2575 7801d21d Michael Niedermayer
    int i;
2576 477ab036 Michael Niedermayer
    DCTELEM temp[64];
2577 7801d21d Michael Niedermayer
    
2578
    if(last<=0) return;
2579 9a7b310d Zdenek Kabelac
    //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2580 d962f6fd Arpi
2581 7801d21d Michael Niedermayer
    for(i=0; i<=last; i++){
2582
        const int j= scantable[i];
2583
        temp[j]= block[j];
2584
        block[j]=0;
2585
    }
2586
    
2587
    for(i=0; i<=last; i++){
2588
        const int j= scantable[i];
2589
        const int perm_j= permutation[j];
2590
        block[perm_j]= temp[j];
2591
    }
2592 d962f6fd Arpi
}
2593 e0eac44e Fabrice Bellard
2594 622348f9 Michael Niedermayer
/** Dummy compare function for FF_CMP_ZERO: always reports a perfect match. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}

/**
 * Fills the 5-entry cmp[] array with the compare functions selected by the
 * low byte of type (an FF_CMP_* value) from the DSPContext; unknown values
 * log an error and leave the entry zeroed.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*5);

    for(i=0; i<5; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}

/**
2642
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
2643
 */
2644 eb4b3dd3 Zdenek Kabelac
static void clear_blocks_c(DCTELEM *blocks)
2645 649c00c9 Michael Niedermayer
{
2646
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
2647
}
2648
2649 11f18faf Michael Niedermayer
/**
 * dst[i] += src[i] for i in [0, w).
 * The main loop steps eight bytes at a time; a scalar loop handles the
 * remaining 0..7 bytes.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i = 0;

    while (i + 7 < w) {
        int k;
        for (k = 0; k < 8; k++)
            dst[i + k] += src[i + k];
        i += 8;
    }
    while (i < w) {
        dst[i] += src[i];
        i++;
    }
}
2664
2665
/**
 * dst[i] = src1[i] - src2[i] for i in [0, w)  (modulo-256 byte subtraction).
 * Eight bytes per main-loop step, scalar tail for the remainder.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i = 0;

    while (i + 7 < w) {
        int k;
        for (k = 0; k < 8; k++)
            dst[i + k] = src1[i + k] - src2[i + k];
        i += 8;
    }
    while (i < w) {
        dst[i] = src1[i] - src2[i];
        i++;
    }
}
2680
2681 84705403 Michael Niedermayer
/**
 * HuffYUV median-prediction residual:
 * for each i, predict src2[i] from the left sample (running left value),
 * the sample above (src1[i]) and the top-left sample via mid_pred
 * (defined elsewhere in this file — presumably the median of the three;
 * confirm against its definition), and store the difference in dst.
 * The running left / top-left values are read from and written back to
 * *left and *left_top so rows can be chained.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t left_val = *left;
    uint8_t top_left = *left_top;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(left_val, src1[i], (left_val + src1[i] - top_left) & 0xFF);
        top_left = src1[i];
        left_val = src2[i];
        dst[i] = left_val - pred;
    }

    *left = left_val;
    *left_top = top_left;
}
2698
2699 1457ab52 Michael Niedermayer
/* Radix-2 butterfly into two output lvalues: o1 = i1 + i2, o2 = i1 - i2.
   NOTE(review): expands to TWO separate statements (no do-while(0) guard),
   so it must not be used as the body of an unbraced if/else. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place butterfly: replaces (x, y) with (x + y, x - y).
   Each argument is evaluated twice (once read, once written), so the
   arguments must be side-effect-free lvalues. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Absolute-value butterfly: |x + y| + |x - y|; evaluates x and y twice. */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2713
2714 bb198e19 Michael Niedermayer
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2715 1457ab52 Michael Niedermayer
    int i;
2716
    int temp[64];
2717
    int sum=0;
2718 bb198e19 Michael Niedermayer
    
2719
    assert(h==8);
2720 1457ab52 Michael Niedermayer
2721
    for(i=0; i<8; i++){
2722
        //FIXME try pointer walks
2723
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2724
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2725
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2726
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
2727
        
2728
        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2729
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2730
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2731
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2732
        
2733
        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2734
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2735
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2736
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2737
    }
2738
2739
    for(i=0; i<8; i++){