Statistics
| Branch: | Revision:

ffmpeg / libavcodec / sh4 / qpel.c @ 2912e87a

History | View | Annotate | Download (60.7 KB)

1
/*
2
 * This is optimized for sh, which have post increment addressing (*p++).
3
 * Some CPU may be index (p[n]) faster than post increment (*p++).
4
 *
5
 * copyright (c) 2001-2003 BERO <bero@geocities.co.jp>
6
 *
7
 * This file is part of Libav.
8
 *
9
 * Libav is free software; you can redistribute it and/or
10
 * modify it under the terms of the GNU Lesser General Public
11
 * License as published by the Free Software Foundation; either
12
 * version 2.1 of the License, or (at your option) any later version.
13
 *
14
 * Libav is distributed in the hope that it will be useful,
15
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
 * Lesser General Public License for more details.
18
 *
19
 * You should have received a copy of the GNU Lesser General Public
20
 * License along with Libav; if not, write to the Free Software
21
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22
 */
23

    
24
#define PIXOP2(OPNAME, OP) \
25
\
26
static inline void OPNAME ## _pixels4_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
27
{\
28
        do {\
29
                OP(LP(dst  ),rnd_avg32(LPC(src1  ),LPC(src2  )) ); \
30
                src1+=src_stride1; \
31
                src2+=src_stride2; \
32
                dst+=dst_stride; \
33
        } while(--h); \
34
}\
35
\
36
static inline void OPNAME ## _pixels4_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
37
{\
38
        do {\
39
                OP(LP(dst  ),rnd_avg32(AV_RN32(src1  ),LPC(src2  )) ); \
40
                src1+=src_stride1; \
41
                src2+=src_stride2; \
42
                dst+=dst_stride; \
43
        } while(--h); \
44
}\
45
\
46
static inline void OPNAME ## _no_rnd_pixels16_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
47
{\
48
        do {\
49
                OP(LP(dst  ),no_rnd_avg32(AV_RN32(src1  ),LPC(src2  )) ); \
50
                OP(LP(dst+4),no_rnd_avg32(AV_RN32(src1+4),LPC(src2+4)) ); \
51
                OP(LP(dst+8),no_rnd_avg32(AV_RN32(src1+8),LPC(src2+8)) ); \
52
                OP(LP(dst+12),no_rnd_avg32(AV_RN32(src1+12),LPC(src2+12)) ); \
53
                src1+=src_stride1; \
54
                src2+=src_stride2; \
55
                dst+=dst_stride; \
56
        } while(--h); \
57
}\
58
\
59
static inline void OPNAME ## _pixels16_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
60
{\
61
        do {\
62
                OP(LP(dst  ),rnd_avg32(AV_RN32(src1  ),LPC(src2  )) ); \
63
                OP(LP(dst+4),rnd_avg32(AV_RN32(src1+4),LPC(src2+4)) ); \
64
                OP(LP(dst+8),rnd_avg32(AV_RN32(src1+8),LPC(src2+8)) ); \
65
                OP(LP(dst+12),rnd_avg32(AV_RN32(src1+12),LPC(src2+12)) ); \
66
                src1+=src_stride1; \
67
                src2+=src_stride2; \
68
                dst+=dst_stride; \
69
        } while(--h); \
70
}\
71
\
72
static inline void OPNAME ## _no_rnd_pixels8_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
73
{\
74
        do { /* onlye src2 aligned */\
75
                OP(LP(dst  ),no_rnd_avg32(AV_RN32(src1  ),LPC(src2  )) ); \
76
                OP(LP(dst+4),no_rnd_avg32(AV_RN32(src1+4),LPC(src2+4)) ); \
77
                src1+=src_stride1; \
78
                src2+=src_stride2; \
79
                dst+=dst_stride; \
80
        } while(--h); \
81
}\
82
\
83
static inline void OPNAME ## _pixels8_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
84
{\
85
        do {\
86
                OP(LP(dst  ),rnd_avg32(AV_RN32(src1  ),LPC(src2  )) ); \
87
                OP(LP(dst+4),rnd_avg32(AV_RN32(src1+4),LPC(src2+4)) ); \
88
                src1+=src_stride1; \
89
                src2+=src_stride2; \
90
                dst+=dst_stride; \
91
        } while(--h); \
92
}\
93
\
94
static inline void OPNAME ## _no_rnd_pixels8_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
95
{\
96
        do {\
97
                OP(LP(dst  ),no_rnd_avg32(LPC(src1  ),LPC(src2  )) ); \
98
                OP(LP(dst+4),no_rnd_avg32(LPC(src1+4),LPC(src2+4)) ); \
99
                src1+=src_stride1; \
100
                src2+=src_stride2; \
101
                dst+=dst_stride; \
102
        } while(--h); \
103
}\
104
\
105
static inline void OPNAME ## _pixels8_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
106
{\
107
        do {\
108
                OP(LP(dst  ),rnd_avg32(LPC(src1  ),LPC(src2  )) ); \
109
                OP(LP(dst+4),rnd_avg32(LPC(src1+4),LPC(src2+4)) ); \
110
                src1+=src_stride1; \
111
                src2+=src_stride2; \
112
                dst+=dst_stride; \
113
        } while(--h); \
114
}\
115
\
116
static inline void OPNAME ## _no_rnd_pixels16_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
117
{\
118
        do {\
119
                OP(LP(dst  ),no_rnd_avg32(LPC(src1  ),LPC(src2  )) ); \
120
                OP(LP(dst+4),no_rnd_avg32(LPC(src1+4),LPC(src2+4)) ); \
121
                OP(LP(dst+8),no_rnd_avg32(LPC(src1+8),LPC(src2+8)) ); \
122
                OP(LP(dst+12),no_rnd_avg32(LPC(src1+12),LPC(src2+12)) ); \
123
                src1+=src_stride1; \
124
                src2+=src_stride2; \
125
                dst+=dst_stride; \
126
        } while(--h); \
127
}\
128
\
129
static inline void OPNAME ## _pixels16_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
130
{\
131
        do {\
132
                OP(LP(dst  ),rnd_avg32(LPC(src1  ),LPC(src2  )) ); \
133
                OP(LP(dst+4),rnd_avg32(LPC(src1+4),LPC(src2+4)) ); \
134
                OP(LP(dst+8),rnd_avg32(LPC(src1+8),LPC(src2+8)) ); \
135
                OP(LP(dst+12),rnd_avg32(LPC(src1+12),LPC(src2+12)) ); \
136
                src1+=src_stride1; \
137
                src2+=src_stride2; \
138
                dst+=dst_stride; \
139
        } while(--h); \
140
}\
141
\
142
static inline void OPNAME ## _no_rnd_pixels16_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
143
{ OPNAME ## _no_rnd_pixels16_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
144
\
145
static inline void OPNAME ## _pixels16_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
146
{ OPNAME ## _pixels16_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
147
\
148
static inline void OPNAME ## _no_rnd_pixels8_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
149
{ OPNAME ## _no_rnd_pixels8_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
150
\
151
static inline void OPNAME ## _pixels8_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
152
{ OPNAME ## _pixels8_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
153
\
154
static inline void OPNAME ## _pixels8_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
155
        do { \
156
                uint32_t a0,a1,a2,a3; \
157
                UNPACK(a0,a1,LPC(src1),LPC(src2)); \
158
                UNPACK(a2,a3,LPC(src3),LPC(src4)); \
159
                OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
160
                UNPACK(a0,a1,LPC(src1+4),LPC(src2+4)); \
161
                UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
162
                OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \
163
                src1+=src_stride1;\
164
                src2+=src_stride2;\
165
                src3+=src_stride3;\
166
                src4+=src_stride4;\
167
                dst+=dst_stride;\
168
        } while(--h); \
169
} \
170
\
171
static inline void OPNAME ## _no_rnd_pixels8_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
172
        do { \
173
                uint32_t a0,a1,a2,a3; \
174
                UNPACK(a0,a1,LPC(src1),LPC(src2)); \
175
                UNPACK(a2,a3,LPC(src3),LPC(src4)); \
176
                OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
177
                UNPACK(a0,a1,LPC(src1+4),LPC(src2+4)); \
178
                UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
179
                OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
180
                src1+=src_stride1;\
181
                src2+=src_stride2;\
182
                src3+=src_stride3;\
183
                src4+=src_stride4;\
184
                dst+=dst_stride;\
185
        } while(--h); \
186
} \
187
\
188
static inline void OPNAME ## _pixels8_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
189
        do { \
190
                uint32_t a0,a1,a2,a3; /* src1 only not aligned */\
191
                UNPACK(a0,a1,AV_RN32(src1),LPC(src2)); \
192
                UNPACK(a2,a3,LPC(src3),LPC(src4)); \
193
                OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
194
                UNPACK(a0,a1,AV_RN32(src1+4),LPC(src2+4)); \
195
                UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
196
                OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \
197
                src1+=src_stride1;\
198
                src2+=src_stride2;\
199
                src3+=src_stride3;\
200
                src4+=src_stride4;\
201
                dst+=dst_stride;\
202
        } while(--h); \
203
} \
204
\
205
static inline void OPNAME ## _no_rnd_pixels8_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
206
        do { \
207
                uint32_t a0,a1,a2,a3; \
208
                UNPACK(a0,a1,AV_RN32(src1),LPC(src2)); \
209
                UNPACK(a2,a3,LPC(src3),LPC(src4)); \
210
                OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
211
                UNPACK(a0,a1,AV_RN32(src1+4),LPC(src2+4)); \
212
                UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
213
                OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
214
                src1+=src_stride1;\
215
                src2+=src_stride2;\
216
                src3+=src_stride3;\
217
                src4+=src_stride4;\
218
                dst+=dst_stride;\
219
        } while(--h); \
220
} \
221
\
222
static inline void OPNAME ## _pixels16_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
223
        do { \
224
                uint32_t a0,a1,a2,a3; \
225
                UNPACK(a0,a1,LPC(src1),LPC(src2)); \
226
                UNPACK(a2,a3,LPC(src3),LPC(src4)); \
227
                OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
228
                UNPACK(a0,a1,LPC(src1+4),LPC(src2+4)); \
229
                UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
230
                OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \
231
                UNPACK(a0,a1,LPC(src1+8),LPC(src2+8)); \
232
                UNPACK(a2,a3,LPC(src3+8),LPC(src4+8)); \
233
                OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \
234
                UNPACK(a0,a1,LPC(src1+12),LPC(src2+12)); \
235
                UNPACK(a2,a3,LPC(src3+12),LPC(src4+12)); \
236
                OP(LP(dst+12),rnd_PACK(a0,a1,a2,a3)); \
237
                src1+=src_stride1;\
238
                src2+=src_stride2;\
239
                src3+=src_stride3;\
240
                src4+=src_stride4;\
241
                dst+=dst_stride;\
242
        } while(--h); \
243
} \
244
\
245
static inline void OPNAME ## _no_rnd_pixels16_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
246
        do { \
247
                uint32_t a0,a1,a2,a3; \
248
                UNPACK(a0,a1,LPC(src1),LPC(src2)); \
249
                UNPACK(a2,a3,LPC(src3),LPC(src4)); \
250
                OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
251
                UNPACK(a0,a1,LPC(src1+4),LPC(src2+4)); \
252
                UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
253
                OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
254
                UNPACK(a0,a1,LPC(src1+8),LPC(src2+8)); \
255
                UNPACK(a2,a3,LPC(src3+8),LPC(src4+8)); \
256
                OP(LP(dst+8),no_rnd_PACK(a0,a1,a2,a3)); \
257
                UNPACK(a0,a1,LPC(src1+12),LPC(src2+12)); \
258
                UNPACK(a2,a3,LPC(src3+12),LPC(src4+12)); \
259
                OP(LP(dst+12),no_rnd_PACK(a0,a1,a2,a3)); \
260
                src1+=src_stride1;\
261
                src2+=src_stride2;\
262
                src3+=src_stride3;\
263
                src4+=src_stride4;\
264
                dst+=dst_stride;\
265
        } while(--h); \
266
} \
267
\
268
static inline void OPNAME ## _pixels16_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
269
        do { /* src1 is unaligned */\
270
                uint32_t a0,a1,a2,a3; \
271
                UNPACK(a0,a1,AV_RN32(src1),LPC(src2)); \
272
                UNPACK(a2,a3,LPC(src3),LPC(src4)); \
273
                OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
274
                UNPACK(a0,a1,AV_RN32(src1+4),LPC(src2+4)); \
275
                UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
276
                OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \
277
                UNPACK(a0,a1,AV_RN32(src1+8),LPC(src2+8)); \
278
                UNPACK(a2,a3,LPC(src3+8),LPC(src4+8)); \
279
                OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \
280
                UNPACK(a0,a1,AV_RN32(src1+12),LPC(src2+12)); \
281
                UNPACK(a2,a3,LPC(src3+12),LPC(src4+12)); \
282
                OP(LP(dst+12),rnd_PACK(a0,a1,a2,a3)); \
283
                src1+=src_stride1;\
284
                src2+=src_stride2;\
285
                src3+=src_stride3;\
286
                src4+=src_stride4;\
287
                dst+=dst_stride;\
288
        } while(--h); \
289
} \
290
\
291
static inline void OPNAME ## _no_rnd_pixels16_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
292
        do { \
293
                uint32_t a0,a1,a2,a3; \
294
                UNPACK(a0,a1,AV_RN32(src1),LPC(src2)); \
295
                UNPACK(a2,a3,LPC(src3),LPC(src4)); \
296
                OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
297
                UNPACK(a0,a1,AV_RN32(src1+4),LPC(src2+4)); \
298
                UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
299
                OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
300
                UNPACK(a0,a1,AV_RN32(src1+8),LPC(src2+8)); \
301
                UNPACK(a2,a3,LPC(src3+8),LPC(src4+8)); \
302
                OP(LP(dst+8),no_rnd_PACK(a0,a1,a2,a3)); \
303
                UNPACK(a0,a1,AV_RN32(src1+12),LPC(src2+12)); \
304
                UNPACK(a2,a3,LPC(src3+12),LPC(src4+12)); \
305
                OP(LP(dst+12),no_rnd_PACK(a0,a1,a2,a3)); \
306
                src1+=src_stride1;\
307
                src2+=src_stride2;\
308
                src3+=src_stride3;\
309
                src4+=src_stride4;\
310
                dst+=dst_stride;\
311
        } while(--h); \
312
} \
313
\
314

    
315
#define op_avg(a, b) a = rnd_avg32(a,b)
316
#define op_put(a, b) a = b
317

    
318
PIXOP2(avg, op_avg)
319
PIXOP2(put, op_put)
320
#undef op_avg
321
#undef op_put
322

    
323
#define avg2(a,b) ((a+b+1)>>1)
324
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
325

    
326

    
327
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
328
{
329
    const int A=(16-x16)*(16-y16);
330
    const int B=(   x16)*(16-y16);
331
    const int C=(16-x16)*(   y16);
332
    const int D=(   x16)*(   y16);
333

    
334
    do {
335
        int t0,t1,t2,t3;
336
        uint8_t *s0 = src;
337
        uint8_t *s1 = src+stride;
338
        t0 = *s0++; t2 = *s1++;
339
        t1 = *s0++; t3 = *s1++;
340
        dst[0]= (A*t0 + B*t1 + C*t2 + D*t3 + rounder)>>8;
341
        t0 = *s0++; t2 = *s1++;
342
        dst[1]= (A*t1 + B*t0 + C*t3 + D*t2 + rounder)>>8;
343
        t1 = *s0++; t3 = *s1++;
344
        dst[2]= (A*t0 + B*t1 + C*t2 + D*t3 + rounder)>>8;
345
        t0 = *s0++; t2 = *s1++;
346
        dst[3]= (A*t1 + B*t0 + C*t3 + D*t2 + rounder)>>8;
347
        t1 = *s0++; t3 = *s1++;
348
        dst[4]= (A*t0 + B*t1 + C*t2 + D*t3 + rounder)>>8;
349
        t0 = *s0++; t2 = *s1++;
350
        dst[5]= (A*t1 + B*t0 + C*t3 + D*t2 + rounder)>>8;
351
        t1 = *s0++; t3 = *s1++;
352
        dst[6]= (A*t0 + B*t1 + C*t2 + D*t3 + rounder)>>8;
353
        t0 = *s0++; t2 = *s1++;
354
        dst[7]= (A*t1 + B*t0 + C*t3 + D*t2 + rounder)>>8;
355
        dst+= stride;
356
        src+= stride;
357
    }while(--h);
358
}
359

    
360
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
361
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
362
{
363
    int y, vx, vy;
364
    const int s= 1<<shift;
365

    
366
    width--;
367
    height--;
368

    
369
    for(y=0; y<h; y++){
370
        int x;
371

    
372
        vx= ox;
373
        vy= oy;
374
        for(x=0; x<8; x++){ //XXX FIXME optimize
375
            int src_x, src_y, frac_x, frac_y, index;
376

    
377
            src_x= vx>>16;
378
            src_y= vy>>16;
379
            frac_x= src_x&(s-1);
380
            frac_y= src_y&(s-1);
381
            src_x>>=shift;
382
            src_y>>=shift;
383

    
384
            if((unsigned)src_x < width){
385
                if((unsigned)src_y < height){
386
                    index= src_x + src_y*stride;
387
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
388
                                           + src[index       +1]*   frac_x )*(s-frac_y)
389
                                        + (  src[index+stride  ]*(s-frac_x)
390
                                           + src[index+stride+1]*   frac_x )*   frac_y
391
                                        + r)>>(shift*2);
392
                }else{
393
                    index= src_x + av_clip(src_y, 0, height)*stride;
394
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
395
                                          + src[index       +1]*   frac_x )*s
396
                                        + r)>>(shift*2);
397
                }
398
            }else{
399
                if((unsigned)src_y < height){
400
                    index= av_clip(src_x, 0, width) + src_y*stride;
401
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
402
                                           + src[index+stride  ]*   frac_y )*s
403
                                        + r)>>(shift*2);
404
                }else{
405
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
406
                    dst[y*stride + x]=    src[index         ];
407
                }
408
            }
409

    
410
            vx+= dxx;
411
            vy+= dyx;
412
        }
413
        ox += dxy;
414
        oy += dyy;
415
    }
416
}
417
#define H264_CHROMA_MC(OPNAME, OP)\
418
static void OPNAME ## h264_chroma_mc2_sh4(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
419
    const int A=(8-x)*(8-y);\
420
    const int B=(  x)*(8-y);\
421
    const int C=(8-x)*(  y);\
422
    const int D=(  x)*(  y);\
423
    \
424
    assert(x<8 && y<8 && x>=0 && y>=0);\
425
\
426
    do {\
427
        int t0,t1,t2,t3; \
428
        uint8_t *s0 = src; \
429
        uint8_t *s1 = src+stride; \
430
        t0 = *s0++; t2 = *s1++; \
431
        t1 = *s0++; t3 = *s1++; \
432
        OP(dst[0], (A*t0 + B*t1 + C*t2 + D*t3));\
433
        t0 = *s0++; t2 = *s1++; \
434
        OP(dst[1], (A*t1 + B*t0 + C*t3 + D*t2));\
435
        dst+= stride;\
436
        src+= stride;\
437
    }while(--h);\
438
}\
439
\
440
static void OPNAME ## h264_chroma_mc4_sh4(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
441
    const int A=(8-x)*(8-y);\
442
    const int B=(  x)*(8-y);\
443
    const int C=(8-x)*(  y);\
444
    const int D=(  x)*(  y);\
445
    \
446
    assert(x<8 && y<8 && x>=0 && y>=0);\
447
\
448
    do {\
449
        int t0,t1,t2,t3; \
450
        uint8_t *s0 = src; \
451
        uint8_t *s1 = src+stride; \
452
        t0 = *s0++; t2 = *s1++; \
453
        t1 = *s0++; t3 = *s1++; \
454
        OP(dst[0], (A*t0 + B*t1 + C*t2 + D*t3));\
455
        t0 = *s0++; t2 = *s1++; \
456
        OP(dst[1], (A*t1 + B*t0 + C*t3 + D*t2));\
457
        t1 = *s0++; t3 = *s1++; \
458
        OP(dst[2], (A*t0 + B*t1 + C*t2 + D*t3));\
459
        t0 = *s0++; t2 = *s1++; \
460
        OP(dst[3], (A*t1 + B*t0 + C*t3 + D*t2));\
461
        dst+= stride;\
462
        src+= stride;\
463
    }while(--h);\
464
}\
465
\
466
static void OPNAME ## h264_chroma_mc8_sh4(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
467
    const int A=(8-x)*(8-y);\
468
    const int B=(  x)*(8-y);\
469
    const int C=(8-x)*(  y);\
470
    const int D=(  x)*(  y);\
471
    \
472
    assert(x<8 && y<8 && x>=0 && y>=0);\
473
\
474
    do {\
475
        int t0,t1,t2,t3; \
476
        uint8_t *s0 = src; \
477
        uint8_t *s1 = src+stride; \
478
        t0 = *s0++; t2 = *s1++; \
479
        t1 = *s0++; t3 = *s1++; \
480
        OP(dst[0], (A*t0 + B*t1 + C*t2 + D*t3));\
481
        t0 = *s0++; t2 = *s1++; \
482
        OP(dst[1], (A*t1 + B*t0 + C*t3 + D*t2));\
483
        t1 = *s0++; t3 = *s1++; \
484
        OP(dst[2], (A*t0 + B*t1 + C*t2 + D*t3));\
485
        t0 = *s0++; t2 = *s1++; \
486
        OP(dst[3], (A*t1 + B*t0 + C*t3 + D*t2));\
487
        t1 = *s0++; t3 = *s1++; \
488
        OP(dst[4], (A*t0 + B*t1 + C*t2 + D*t3));\
489
        t0 = *s0++; t2 = *s1++; \
490
        OP(dst[5], (A*t1 + B*t0 + C*t3 + D*t2));\
491
        t1 = *s0++; t3 = *s1++; \
492
        OP(dst[6], (A*t0 + B*t1 + C*t2 + D*t3));\
493
        t0 = *s0++; t2 = *s1++; \
494
        OP(dst[7], (A*t1 + B*t0 + C*t3 + D*t2));\
495
        dst+= stride;\
496
        src+= stride;\
497
    }while(--h);\
498
}
499

    
500
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
501
#define op_put(a, b) a = (((b) + 32)>>6)
502

    
503
H264_CHROMA_MC(put_       , op_put)
504
H264_CHROMA_MC(avg_       , op_avg)
505
#undef op_avg
506
#undef op_put
507

    
508
#define QPEL_MC(r, OPNAME, RND, OP) \
509
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
510
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
511
    do {\
512
        uint8_t *s = src; \
513
        int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
514
        src0= *s++;\
515
        src1= *s++;\
516
        src2= *s++;\
517
        src3= *s++;\
518
        src4= *s++;\
519
        OP(dst[0], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
520
        src5= *s++;\
521
        OP(dst[1], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
522
        src6= *s++;\
523
        OP(dst[2], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
524
        src7= *s++;\
525
        OP(dst[3], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
526
        src8= *s++;\
527
        OP(dst[4], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
528
        OP(dst[5], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
529
        OP(dst[6], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
530
        OP(dst[7], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
531
        dst+=dstStride;\
532
        src+=srcStride;\
533
    }while(--h);\
534
}\
535
\
536
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
537
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
538
    int w=8;\
539
    do{\
540
        uint8_t *s = src, *d=dst;\
541
        int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
542
        src0 = *s; s+=srcStride; \
543
        src1 = *s; s+=srcStride; \
544
        src2 = *s; s+=srcStride; \
545
        src3 = *s; s+=srcStride; \
546
        src4 = *s; s+=srcStride; \
547
        OP(*d, (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));d+=dstStride;\
548
        src5 = *s; s+=srcStride; \
549
        OP(*d, (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));d+=dstStride;\
550
        src6 = *s; s+=srcStride; \
551
        OP(*d, (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));d+=dstStride;\
552
        src7 = *s; s+=srcStride; \
553
        OP(*d, (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));d+=dstStride;\
554
        src8 = *s; \
555
        OP(*d, (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));d+=dstStride;\
556
        OP(*d, (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));d+=dstStride;\
557
        OP(*d, (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));d+=dstStride;\
558
        OP(*d, (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
559
        dst++;\
560
        src++;\
561
    }while(--w);\
562
}\
563
\
564
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
565
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
566
    do {\
567
        uint8_t *s = src;\
568
        int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
569
        int src9,src10,src11,src12,src13,src14,src15,src16;\
570
        src0= *s++;\
571
        src1= *s++;\
572
        src2= *s++;\
573
        src3= *s++;\
574
        src4= *s++;\
575
        OP(dst[ 0], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
576
        src5= *s++;\
577
        OP(dst[ 1], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
578
        src6= *s++;\
579
        OP(dst[ 2], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
580
        src7= *s++;\
581
        OP(dst[ 3], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
582
        src8= *s++;\
583
        OP(dst[ 4], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
584
        src9= *s++;\
585
        OP(dst[ 5], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
586
        src10= *s++;\
587
        OP(dst[ 6], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
588
        src11= *s++;\
589
        OP(dst[ 7], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
590
        src12= *s++;\
591
        OP(dst[ 8], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
592
        src13= *s++;\
593
        OP(dst[ 9], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
594
        src14= *s++;\
595
        OP(dst[10], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
596
        src15= *s++;\
597
        OP(dst[11], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
598
        src16= *s++;\
599
        OP(dst[12], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
600
        OP(dst[13], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
601
        OP(dst[14], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
602
        OP(dst[15], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
603
        dst+=dstStride;\
604
        src+=srcStride;\
605
    }while(--h);\
606
}\
607
\
608
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
609
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
610
    int w=16;\
611
    do {\
612
        uint8_t *s = src, *d=dst;\
613
        int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
614
        int src9,src10,src11,src12,src13,src14,src15,src16;\
615
        src0 = *s; s+=srcStride; \
616
        src1 = *s; s+=srcStride; \
617
        src2 = *s; s+=srcStride; \
618
        src3 = *s; s+=srcStride; \
619
        src4 = *s; s+=srcStride; \
620
        OP(*d, (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));d+=dstStride;\
621
        src5 = *s; s+=srcStride; \
622
        OP(*d, (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));d+=dstStride;\
623
        src6 = *s; s+=srcStride; \
624
        OP(*d, (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));d+=dstStride;\
625
        src7 = *s; s+=srcStride; \
626
        OP(*d, (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));d+=dstStride;\
627
        src8 = *s; s+=srcStride; \
628
        OP(*d, (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));d+=dstStride;\
629
        src9 = *s; s+=srcStride; \
630
        OP(*d, (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));d+=dstStride;\
631
        src10 = *s; s+=srcStride; \
632
        OP(*d, (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));d+=dstStride;\
633
        src11 = *s; s+=srcStride; \
634
        OP(*d, (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));d+=dstStride;\
635
        src12 = *s; s+=srcStride; \
636
        OP(*d, (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));d+=dstStride;\
637
        src13 = *s; s+=srcStride; \
638
        OP(*d, (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));d+=dstStride;\
639
        src14 = *s; s+=srcStride; \
640
        OP(*d, (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));d+=dstStride;\
641
        src15 = *s; s+=srcStride; \
642
        OP(*d, (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));d+=dstStride;\
643
        src16 = *s; \
644
        OP(*d, (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));d+=dstStride;\
645
        OP(*d, (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));d+=dstStride;\
646
        OP(*d, (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));d+=dstStride;\
647
        OP(*d, (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
648
        dst++;\
649
        src++;\
650
    }while(--w);\
651
}\
652
\
653
static void OPNAME ## qpel8_mc00_sh4 (uint8_t *dst, uint8_t *src, int stride){\
654
    OPNAME ## pixels8_c(dst, src, stride, 8);\
655
}\
656
\
657
static void OPNAME ## qpel8_mc10_sh4(uint8_t *dst, uint8_t *src, int stride){\
658
    uint8_t half[64];\
659
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
660
    OPNAME ## pixels8_l2_aligned2(dst, src, half, stride, stride, 8, 8);\
661
}\
662
\
663
static void OPNAME ## qpel8_mc20_sh4(uint8_t *dst, uint8_t *src, int stride){\
664
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
665
}\
666
\
667
static void OPNAME ## qpel8_mc30_sh4(uint8_t *dst, uint8_t *src, int stride){\
668
    uint8_t half[64];\
669
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
670
    OPNAME ## pixels8_l2_aligned2(dst, src+1, half, stride, stride, 8, 8);\
671
}\
672
\
673
static void OPNAME ## qpel8_mc01_sh4(uint8_t *dst, uint8_t *src, int stride){\
674
    uint8_t full[16*9];\
675
    uint8_t half[64];\
676
    copy_block9(full, src, 16, stride, 9);\
677
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
678
    OPNAME ## pixels8_l2_aligned(dst, full, half, stride, 16, 8, 8);\
679
}\
680
\
681
static void OPNAME ## qpel8_mc02_sh4(uint8_t *dst, uint8_t *src, int stride){\
682
    uint8_t full[16*9];\
683
    copy_block9(full, src, 16, stride, 9);\
684
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
685
}\
686
\
687
static void OPNAME ## qpel8_mc03_sh4(uint8_t *dst, uint8_t *src, int stride){\
688
    uint8_t full[16*9];\
689
    uint8_t half[64];\
690
    copy_block9(full, src, 16, stride, 9);\
691
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
692
    OPNAME ## pixels8_l2_aligned(dst, full+16, half, stride, 16, 8, 8);\
693
}\
694
static void OPNAME ## qpel8_mc11_sh4(uint8_t *dst, uint8_t *src, int stride){\
695
    uint8_t full[16*9];\
696
    uint8_t halfH[72];\
697
    uint8_t halfHV[64];\
698
    copy_block9(full, src, 16, stride, 9);\
699
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
700
    put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9);\
701
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
702
    OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
703
}\
704
static void OPNAME ## qpel8_mc31_sh4(uint8_t *dst, uint8_t *src, int stride){\
705
    uint8_t full[16*9];\
706
    uint8_t halfH[72];\
707
    uint8_t halfHV[64];\
708
    copy_block9(full, src, 16, stride, 9);\
709
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
710
    put ## RND ## pixels8_l2_aligned1(halfH, halfH, full+1, 8, 8, 16, 9);\
711
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
712
    OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
713
}\
714
static void OPNAME ## qpel8_mc13_sh4(uint8_t *dst, uint8_t *src, int stride){\
715
    uint8_t full[16*9];\
716
    uint8_t halfH[72];\
717
    uint8_t halfHV[64];\
718
    copy_block9(full, src, 16, stride, 9);\
719
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
720
    put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9);\
721
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
722
    OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
723
}\
724
static void OPNAME ## qpel8_mc33_sh4(uint8_t *dst, uint8_t *src, int stride){\
725
    uint8_t full[16*9];\
726
    uint8_t halfH[72];\
727
    uint8_t halfHV[64];\
728
    copy_block9(full, src, 16, stride, 9);\
729
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
730
    put ## RND ## pixels8_l2_aligned1(halfH, halfH, full+1, 8, 8, 16, 9);\
731
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
732
    OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
733
}\
734
static void OPNAME ## qpel8_mc21_sh4(uint8_t *dst, uint8_t *src, int stride){\
735
    uint8_t halfH[72];\
736
    uint8_t halfHV[64];\
737
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
738
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
739
    OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
740
}\
741
static void OPNAME ## qpel8_mc23_sh4(uint8_t *dst, uint8_t *src, int stride){\
742
    uint8_t halfH[72];\
743
    uint8_t halfHV[64];\
744
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
745
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
746
    OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
747
}\
748
static void OPNAME ## qpel8_mc12_sh4(uint8_t *dst, uint8_t *src, int stride){\
749
    uint8_t full[16*9];\
750
    uint8_t halfH[72];\
751
    copy_block9(full, src, 16, stride, 9);\
752
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
753
    put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9);\
754
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
755
}\
756
static void OPNAME ## qpel8_mc32_sh4(uint8_t *dst, uint8_t *src, int stride){\
757
    uint8_t full[16*9];\
758
    uint8_t halfH[72];\
759
    copy_block9(full, src, 16, stride, 9);\
760
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
761
    put ## RND ## pixels8_l2_aligned1(halfH, halfH, full+1, 8, 8, 16, 9);\
762
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
763
}\
764
static void OPNAME ## qpel8_mc22_sh4(uint8_t *dst, uint8_t *src, int stride){\
765
    uint8_t halfH[72];\
766
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
767
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
768
}\
769
static void OPNAME ## qpel16_mc00_sh4 (uint8_t *dst, uint8_t *src, int stride){\
770
    OPNAME ## pixels16_c(dst, src, stride, 16);\
771
}\
772
\
773
static void OPNAME ## qpel16_mc10_sh4(uint8_t *dst, uint8_t *src, int stride){\
774
    uint8_t half[256];\
775
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
776
    OPNAME ## pixels16_l2_aligned2(dst, src, half, stride, stride, 16, 16);\
777
}\
778
\
779
static void OPNAME ## qpel16_mc20_sh4(uint8_t *dst, uint8_t *src, int stride){\
780
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
781
}\
782
\
783
static void OPNAME ## qpel16_mc30_sh4(uint8_t *dst, uint8_t *src, int stride){\
784
    uint8_t half[256];\
785
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
786
    OPNAME ## pixels16_l2_aligned2(dst, src+1, half, stride, stride, 16, 16);\
787
}\
788
\
789
static void OPNAME ## qpel16_mc01_sh4(uint8_t *dst, uint8_t *src, int stride){\
790
    uint8_t full[24*17];\
791
    uint8_t half[256];\
792
    copy_block17(full, src, 24, stride, 17);\
793
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
794
    OPNAME ## pixels16_l2_aligned(dst, full, half, stride, 24, 16, 16);\
795
}\
796
\
797
static void OPNAME ## qpel16_mc02_sh4(uint8_t *dst, uint8_t *src, int stride){\
798
    uint8_t full[24*17];\
799
    copy_block17(full, src, 24, stride, 17);\
800
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
801
}\
802
\
803
static void OPNAME ## qpel16_mc03_sh4(uint8_t *dst, uint8_t *src, int stride){\
804
    uint8_t full[24*17];\
805
    uint8_t half[256];\
806
    copy_block17(full, src, 24, stride, 17);\
807
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
808
    OPNAME ## pixels16_l2_aligned(dst, full+24, half, stride, 24, 16, 16);\
809
}\
810
static void OPNAME ## qpel16_mc11_sh4(uint8_t *dst, uint8_t *src, int stride){\
811
    uint8_t full[24*17];\
812
    uint8_t halfH[272];\
813
    uint8_t halfHV[256];\
814
    copy_block17(full, src, 24, stride, 17);\
815
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
816
    put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17);\
817
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
818
    OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
819
}\
820
static void OPNAME ## qpel16_mc31_sh4(uint8_t *dst, uint8_t *src, int stride){\
821
    uint8_t full[24*17];\
822
    uint8_t halfH[272];\
823
    uint8_t halfHV[256];\
824
    copy_block17(full, src, 24, stride, 17);\
825
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826
    put ## RND ## pixels16_l2_aligned1(halfH, halfH, full+1, 16, 16, 24, 17);\
827
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
828
    OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
829
}\
830
static void OPNAME ## qpel16_mc13_sh4(uint8_t *dst, uint8_t *src, int stride){\
831
    uint8_t full[24*17];\
832
    uint8_t halfH[272];\
833
    uint8_t halfHV[256];\
834
    copy_block17(full, src, 24, stride, 17);\
835
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
836
    put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17);\
837
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
838
    OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
839
}\
840
static void OPNAME ## qpel16_mc33_sh4(uint8_t *dst, uint8_t *src, int stride){\
841
    uint8_t full[24*17];\
842
    uint8_t halfH[272];\
843
    uint8_t halfHV[256];\
844
    copy_block17(full, src, 24, stride, 17);\
845
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
846
    put ## RND ## pixels16_l2_aligned1(halfH, halfH, full+1, 16, 16, 24, 17);\
847
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
848
    OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
849
}\
850
static void OPNAME ## qpel16_mc21_sh4(uint8_t *dst, uint8_t *src, int stride){\
851
    uint8_t halfH[272];\
852
    uint8_t halfHV[256];\
853
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
854
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
855
    OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
856
}\
857
static void OPNAME ## qpel16_mc23_sh4(uint8_t *dst, uint8_t *src, int stride){\
858
    uint8_t halfH[272];\
859
    uint8_t halfHV[256];\
860
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
861
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
862
    OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
863
}\
864
static void OPNAME ## qpel16_mc12_sh4(uint8_t *dst, uint8_t *src, int stride){\
865
    uint8_t full[24*17];\
866
    uint8_t halfH[272];\
867
    copy_block17(full, src, 24, stride, 17);\
868
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
869
    put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17);\
870
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
871
}\
872
static void OPNAME ## qpel16_mc32_sh4(uint8_t *dst, uint8_t *src, int stride){\
873
    uint8_t full[24*17];\
874
    uint8_t halfH[272];\
875
    copy_block17(full, src, 24, stride, 17);\
876
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
877
    put ## RND ## pixels16_l2_aligned1(halfH, halfH, full+1, 16, 16, 24, 17);\
878
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
879
}\
880
static void OPNAME ## qpel16_mc22_sh4(uint8_t *dst, uint8_t *src, int stride){\
881
    uint8_t halfH[272];\
882
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
883
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
884
}
885

    
886
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
887
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
888
#define op_put(a, b) a = cm[((b) + 16)>>5]
889
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
890

    
891
QPEL_MC(0, put_       , _       , op_put)
892
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
893
QPEL_MC(0, avg_       , _       , op_avg)
894
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
895
#undef op_avg
896
#undef op_avg_no_rnd
897
#undef op_put
898
#undef op_put_no_rnd
899

    
900
#if 1
901
#define H264_LOWPASS(OPNAME, OP, OP2) \
902
static inline void OPNAME ## h264_qpel_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride,int w,int h){\
903
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
904
    do {\
905
        int srcB,srcA,src0,src1,src2,src3,src4,src5,src6;\
906
        uint8_t *s = src-2;\
907
        srcB = *s++;\
908
        srcA = *s++;\
909
        src0 = *s++;\
910
        src1 = *s++;\
911
        src2 = *s++;\
912
        src3 = *s++;\
913
        OP(dst[0], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
914
        src4 = *s++;\
915
        OP(dst[1], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
916
        src5 = *s++;\
917
        OP(dst[2], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
918
        src6 = *s++;\
919
        OP(dst[3], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
920
      if (w>4) { /* it optimized */ \
921
        int src7,src8,src9,src10; \
922
        src7 = *s++;\
923
        OP(dst[4], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
924
        src8 = *s++;\
925
        OP(dst[5], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
926
        src9 = *s++;\
927
        OP(dst[6], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
928
        src10 = *s++;\
929
        OP(dst[7], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
930
       if (w>8) { \
931
        int src11,src12,src13,src14,src15,src16,src17,src18; \
932
        src11 = *s++;\
933
        OP(dst[8] , (src8 +src9 )*20 - (src7 +src10)*5 + (src6 +src11));\
934
        src12 = *s++;\
935
        OP(dst[9] , (src9 +src10)*20 - (src8 +src11)*5 + (src7 +src12));\
936
        src13 = *s++;\
937
        OP(dst[10], (src10+src11)*20 - (src9 +src12)*5 + (src8 +src13));\
938
        src14 = *s++;\
939
        OP(dst[11], (src11+src12)*20 - (src10+src13)*5 + (src9 +src14));\
940
        src15 = *s++;\
941
        OP(dst[12], (src12+src13)*20 - (src11+src14)*5 + (src10+src15));\
942
        src16 = *s++;\
943
        OP(dst[13], (src13+src14)*20 - (src12+src15)*5 + (src11+src16));\
944
        src17 = *s++;\
945
        OP(dst[14], (src14+src15)*20 - (src13+src16)*5 + (src12+src17));\
946
        src18 = *s++;\
947
        OP(dst[15], (src15+src16)*20 - (src14+src17)*5 + (src13+src18));\
948
       } \
949
      } \
950
        dst+=dstStride;\
951
        src+=srcStride;\
952
    }while(--h);\
953
}\
954
\
955
static inline void OPNAME ## h264_qpel_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride,int w,int h){\
956
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
957
    do{\
958
        int srcB,srcA,src0,src1,src2,src3,src4,src5,src6;\
959
        uint8_t *s = src-2*srcStride,*d=dst;\
960
        srcB = *s; s+=srcStride;\
961
        srcA = *s; s+=srcStride;\
962
        src0 = *s; s+=srcStride;\
963
        src1 = *s; s+=srcStride;\
964
        src2 = *s; s+=srcStride;\
965
        src3 = *s; s+=srcStride;\
966
        OP(*d, (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));d+=dstStride;\
967
        src4 = *s; s+=srcStride;\
968
        OP(*d, (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));d+=dstStride;\
969
        src5 = *s; s+=srcStride;\
970
        OP(*d, (src2+src3)*20 - (src1+src4)*5 + (src0+src5));d+=dstStride;\
971
        src6 = *s; s+=srcStride;\
972
        OP(*d, (src3+src4)*20 - (src2+src5)*5 + (src1+src6));d+=dstStride;\
973
      if (h>4) { \
974
        int src7,src8,src9,src10; \
975
        src7 = *s; s+=srcStride;\
976
        OP(*d, (src4+src5)*20 - (src3+src6)*5 + (src2+src7));d+=dstStride;\
977
        src8 = *s; s+=srcStride;\
978
        OP(*d, (src5+src6)*20 - (src4+src7)*5 + (src3+src8));d+=dstStride;\
979
        src9 = *s; s+=srcStride;\
980
        OP(*d, (src6+src7)*20 - (src5+src8)*5 + (src4+src9));d+=dstStride;\
981
        src10 = *s; s+=srcStride;\
982
        OP(*d, (src7+src8)*20 - (src6+src9)*5 + (src5+src10));d+=dstStride;\
983
       if (h>8) { \
984
        int src11,src12,src13,src14,src15,src16,src17,src18; \
985
        src11 = *s; s+=srcStride;\
986
        OP(*d , (src8 +src9 )*20 - (src7 +src10)*5 + (src6 +src11));d+=dstStride;\
987
        src12 = *s; s+=srcStride;\
988
        OP(*d , (src9 +src10)*20 - (src8 +src11)*5 + (src7 +src12));d+=dstStride;\
989
        src13 = *s; s+=srcStride;\
990
        OP(*d, (src10+src11)*20 - (src9 +src12)*5 + (src8 +src13));d+=dstStride;\
991
        src14 = *s; s+=srcStride;\
992
        OP(*d, (src11+src12)*20 - (src10+src13)*5 + (src9 +src14));d+=dstStride;\
993
        src15 = *s; s+=srcStride;\
994
        OP(*d, (src12+src13)*20 - (src11+src14)*5 + (src10+src15));d+=dstStride;\
995
        src16 = *s; s+=srcStride;\
996
        OP(*d, (src13+src14)*20 - (src12+src15)*5 + (src11+src16));d+=dstStride;\
997
        src17 = *s; s+=srcStride;\
998
        OP(*d, (src14+src15)*20 - (src13+src16)*5 + (src12+src17));d+=dstStride;\
999
        src18 = *s; s+=srcStride;\
1000
        OP(*d, (src15+src16)*20 - (src14+src17)*5 + (src13+src18));d+=dstStride;\
1001
       } \
1002
      } \
1003
        dst++;\
1004
        src++;\
1005
    }while(--w);\
1006
}\
1007
\
1008
static inline void OPNAME ## h264_qpel_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride,int w,int h){\
1009
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1010
    int i;\
1011
    src -= 2*srcStride;\
1012
    i= h+5; \
1013
    do {\
1014
        int srcB,srcA,src0,src1,src2,src3,src4,src5,src6;\
1015
        uint8_t *s = src-2;\
1016
        srcB = *s++;\
1017
        srcA = *s++;\
1018
        src0 = *s++;\
1019
        src1 = *s++;\
1020
        src2 = *s++;\
1021
        src3 = *s++;\
1022
        tmp[0] = ((src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1023
        src4 = *s++;\
1024
        tmp[1] = ((src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1025
        src5 = *s++;\
1026
        tmp[2] = ((src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1027
        src6 = *s++;\
1028
        tmp[3] = ((src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1029
      if (w>4) { /* it optimized */ \
1030
        int src7,src8,src9,src10; \
1031
        src7 = *s++;\
1032
        tmp[4] = ((src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
1033
        src8 = *s++;\
1034
        tmp[5] = ((src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
1035
        src9 = *s++;\
1036
        tmp[6] = ((src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
1037
        src10 = *s++;\
1038
        tmp[7] = ((src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
1039
       if (w>8) { \
1040
        int src11,src12,src13,src14,src15,src16,src17,src18; \
1041
        src11 = *s++;\
1042
        tmp[8] = ((src8 +src9 )*20 - (src7 +src10)*5 + (src6 +src11));\
1043
        src12 = *s++;\
1044
        tmp[9] = ((src9 +src10)*20 - (src8 +src11)*5 + (src7 +src12));\
1045
        src13 = *s++;\
1046
        tmp[10] = ((src10+src11)*20 - (src9 +src12)*5 + (src8 +src13));\
1047
        src14 = *s++;\
1048
        tmp[11] = ((src11+src12)*20 - (src10+src13)*5 + (src9 +src14));\
1049
        src15 = *s++;\
1050
        tmp[12] = ((src12+src13)*20 - (src11+src14)*5 + (src10+src15));\
1051
        src16 = *s++;\
1052
        tmp[13] = ((src13+src14)*20 - (src12+src15)*5 + (src11+src16));\
1053
        src17 = *s++;\
1054
        tmp[14] = ((src14+src15)*20 - (src13+src16)*5 + (src12+src17));\
1055
        src18 = *s++;\
1056
        tmp[15] = ((src15+src16)*20 - (src14+src17)*5 + (src13+src18));\
1057
       } \
1058
      } \
1059
        tmp+=tmpStride;\
1060
        src+=srcStride;\
1061
    }while(--i);\
1062
    tmp -= tmpStride*(h+5-2);\
1063
    i = w; \
1064
    do {\
1065
        int tmpB,tmpA,tmp0,tmp1,tmp2,tmp3,tmp4,tmp5,tmp6;\
1066
        int16_t *s = tmp-2*tmpStride; \
1067
        uint8_t *d=dst;\
1068
        tmpB = *s; s+=tmpStride;\
1069
        tmpA = *s; s+=tmpStride;\
1070
        tmp0 = *s; s+=tmpStride;\
1071
        tmp1 = *s; s+=tmpStride;\
1072
        tmp2 = *s; s+=tmpStride;\
1073
        tmp3 = *s; s+=tmpStride;\
1074
        OP2(*d, (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));d+=dstStride;\
1075
        tmp4 = *s; s+=tmpStride;\
1076
        OP2(*d, (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));d+=dstStride;\
1077
        tmp5 = *s; s+=tmpStride;\
1078
        OP2(*d, (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));d+=dstStride;\
1079
        tmp6 = *s; s+=tmpStride;\
1080
        OP2(*d, (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));d+=dstStride;\
1081
      if (h>4) { \
1082
        int tmp7,tmp8,tmp9,tmp10; \
1083
        tmp7 = *s; s+=tmpStride;\
1084
        OP2(*d, (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));d+=dstStride;\
1085
        tmp8 = *s; s+=tmpStride;\
1086
        OP2(*d, (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));d+=dstStride;\
1087
        tmp9 = *s; s+=tmpStride;\
1088
        OP2(*d, (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));d+=dstStride;\
1089
        tmp10 = *s; s+=tmpStride;\
1090
        OP2(*d, (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));d+=dstStride;\
1091
       if (h>8) { \
1092
        int tmp11,tmp12,tmp13,tmp14,tmp15,tmp16,tmp17,tmp18; \
1093
        tmp11 = *s; s+=tmpStride;\
1094
        OP2(*d , (tmp8 +tmp9 )*20 - (tmp7 +tmp10)*5 + (tmp6 +tmp11));d+=dstStride;\
1095
        tmp12 = *s; s+=tmpStride;\
1096
        OP2(*d , (tmp9 +tmp10)*20 - (tmp8 +tmp11)*5 + (tmp7 +tmp12));d+=dstStride;\
1097
        tmp13 = *s; s+=tmpStride;\
1098
        OP2(*d, (tmp10+tmp11)*20 - (tmp9 +tmp12)*5 + (tmp8 +tmp13));d+=dstStride;\
1099
        tmp14 = *s; s+=tmpStride;\
1100
        OP2(*d, (tmp11+tmp12)*20 - (tmp10+tmp13)*5 + (tmp9 +tmp14));d+=dstStride;\
1101
        tmp15 = *s; s+=tmpStride;\
1102
        OP2(*d, (tmp12+tmp13)*20 - (tmp11+tmp14)*5 + (tmp10+tmp15));d+=dstStride;\
1103
        tmp16 = *s; s+=tmpStride;\
1104
        OP2(*d, (tmp13+tmp14)*20 - (tmp12+tmp15)*5 + (tmp11+tmp16));d+=dstStride;\
1105
        tmp17 = *s; s+=tmpStride;\
1106
        OP2(*d, (tmp14+tmp15)*20 - (tmp13+tmp16)*5 + (tmp12+tmp17));d+=dstStride;\
1107
        tmp18 = *s; s+=tmpStride;\
1108
        OP2(*d, (tmp15+tmp16)*20 - (tmp14+tmp17)*5 + (tmp13+tmp18));d+=dstStride;\
1109
       } \
1110
      } \
1111
        dst++;\
1112
        tmp++;\
1113
    }while(--i);\
1114
}\
1115
\
1116
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1117
    OPNAME ## h264_qpel_h_lowpass(dst,src,dstStride,srcStride,4,4); \
1118
}\
1119
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1120
   OPNAME ## h264_qpel_h_lowpass(dst,src,dstStride,srcStride,8,8); \
1121
}\
1122
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1123
   OPNAME ## h264_qpel_h_lowpass(dst,src,dstStride,srcStride,16,16); \
1124
}\
1125
\
1126
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1127
   OPNAME ## h264_qpel_v_lowpass(dst,src,dstStride,srcStride,4,4); \
1128
}\
1129
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1130
   OPNAME ## h264_qpel_v_lowpass(dst,src,dstStride,srcStride,8,8); \
1131
}\
1132
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1133
   OPNAME ## h264_qpel_v_lowpass(dst,src,dstStride,srcStride,16,16); \
1134
}\
1135
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1136
   OPNAME ## h264_qpel_hv_lowpass(dst,tmp,src,dstStride,tmpStride,srcStride,4,4); \
1137
}\
1138
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1139
   OPNAME ## h264_qpel_hv_lowpass(dst,tmp,src,dstStride,tmpStride,srcStride,8,8); \
1140
}\
1141
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1142
   OPNAME ## h264_qpel_hv_lowpass(dst,tmp,src,dstStride,tmpStride,srcStride,16,16); \
1143
}\
1144

    
1145
#define H264_MC(OPNAME, SIZE) \
1146
static void OPNAME ## h264_qpel ## SIZE ## _mc00_sh4 (uint8_t *dst, uint8_t *src, int stride){\
1147
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
1148
}\
1149
\
1150
static void OPNAME ## h264_qpel ## SIZE ## _mc10_sh4(uint8_t *dst, uint8_t *src, int stride){\
1151
    uint8_t half[SIZE*SIZE];\
1152
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
1153
    OPNAME ## pixels ## SIZE ## _l2_aligned2(dst, src, half, stride, stride, SIZE, SIZE);\
1154
}\
1155
\
1156
static void OPNAME ## h264_qpel ## SIZE ## _mc20_sh4(uint8_t *dst, uint8_t *src, int stride){\
1157
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
1158
}\
1159
\
1160
static void OPNAME ## h264_qpel ## SIZE ## _mc30_sh4(uint8_t *dst, uint8_t *src, int stride){\
1161
    uint8_t half[SIZE*SIZE];\
1162
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
1163
    OPNAME ## pixels ## SIZE ## _l2_aligned2(dst, src+1, half, stride, stride, SIZE, SIZE);\
1164
}\
1165
\
1166
static void OPNAME ## h264_qpel ## SIZE ## _mc01_sh4(uint8_t *dst, uint8_t *src, int stride){\
1167
    uint8_t full[SIZE*(SIZE+5)];\
1168
    uint8_t * const full_mid= full + SIZE*2;\
1169
    uint8_t half[SIZE*SIZE];\
1170
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
1171
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
1172
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
1173
}\
1174
\
1175
static void OPNAME ## h264_qpel ## SIZE ## _mc02_sh4(uint8_t *dst, uint8_t *src, int stride){\
1176
    uint8_t full[SIZE*(SIZE+5)];\
1177
    uint8_t * const full_mid= full + SIZE*2;\
1178
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
1179
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
1180
}\
1181
\
1182
static void OPNAME ## h264_qpel ## SIZE ## _mc03_sh4(uint8_t *dst, uint8_t *src, int stride){\
1183
    uint8_t full[SIZE*(SIZE+5)];\
1184
    uint8_t * const full_mid= full + SIZE*2;\
1185
    uint8_t half[SIZE*SIZE];\
1186
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
1187
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
1188
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
1189
}\
1190
\
1191
static void OPNAME ## h264_qpel ## SIZE ## _mc11_sh4(uint8_t *dst, uint8_t *src, int stride){\
1192
    uint8_t full[SIZE*(SIZE+5)];\
1193
    uint8_t * const full_mid= full + SIZE*2;\
1194
    uint8_t halfH[SIZE*SIZE];\
1195
    uint8_t halfV[SIZE*SIZE];\
1196
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
1197
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
1198
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
1199
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
1200
}\
1201
\
1202
static void OPNAME ## h264_qpel ## SIZE ## _mc31_sh4(uint8_t *dst, uint8_t *src, int stride){\
1203
    uint8_t full[SIZE*(SIZE+5)];\
1204
    uint8_t * const full_mid= full + SIZE*2;\
1205
    uint8_t halfH[SIZE*SIZE];\
1206
    uint8_t halfV[SIZE*SIZE];\
1207
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
1208
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
1209
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
1210
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
1211
}\
1212
\
1213
static void OPNAME ## h264_qpel ## SIZE ## _mc13_sh4(uint8_t *dst, uint8_t *src, int stride){\
1214
    uint8_t full[SIZE*(SIZE+5)];\
1215
    uint8_t * const full_mid= full + SIZE*2;\
1216
    uint8_t halfH[SIZE*SIZE];\
1217
    uint8_t halfV[SIZE*SIZE];\
1218
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
1219
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
1220
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
1221
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
1222
}\
1223
\
1224
static void OPNAME ## h264_qpel ## SIZE ## _mc33_sh4(uint8_t *dst, uint8_t *src, int stride){\
1225
    uint8_t full[SIZE*(SIZE+5)];\
1226
    uint8_t * const full_mid= full + SIZE*2;\
1227
    uint8_t halfH[SIZE*SIZE];\
1228
    uint8_t halfV[SIZE*SIZE];\
1229
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
1230
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
1231
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
1232
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
1233
}\
1234
\
1235
static void OPNAME ## h264_qpel ## SIZE ## _mc22_sh4(uint8_t *dst, uint8_t *src, int stride){\
1236
    int16_t tmp[SIZE*(SIZE+5)];\
1237
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
1238
}\
1239
\
1240
static void OPNAME ## h264_qpel ## SIZE ## _mc21_sh4(uint8_t *dst, uint8_t *src, int stride){\
1241
    int16_t tmp[SIZE*(SIZE+5)];\
1242
    uint8_t halfH[SIZE*SIZE];\
1243
    uint8_t halfHV[SIZE*SIZE];\
1244
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
1245
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
1246
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
1247
}\
1248
\
1249
static void OPNAME ## h264_qpel ## SIZE ## _mc23_sh4(uint8_t *dst, uint8_t *src, int stride){\
1250
    int16_t tmp[SIZE*(SIZE+5)];\
1251
    uint8_t halfH[SIZE*SIZE];\
1252
    uint8_t halfHV[SIZE*SIZE];\
1253
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
1254
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
1255
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
1256
}\
1257
\
1258
static void OPNAME ## h264_qpel ## SIZE ## _mc12_sh4(uint8_t *dst, uint8_t *src, int stride){\
1259
    uint8_t full[SIZE*(SIZE+5)];\
1260
    uint8_t * const full_mid= full + SIZE*2;\
1261
    int16_t tmp[SIZE*(SIZE+5)];\
1262
    uint8_t halfV[SIZE*SIZE];\
1263
    uint8_t halfHV[SIZE*SIZE];\
1264
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
1265
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
1266
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
1267
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
1268
}\
1269
\
1270
static void OPNAME ## h264_qpel ## SIZE ## _mc32_sh4(uint8_t *dst, uint8_t *src, int stride){\
1271
    uint8_t full[SIZE*(SIZE+5)];\
1272
    uint8_t * const full_mid= full + SIZE*2;\
1273
    int16_t tmp[SIZE*(SIZE+5)];\
1274
    uint8_t halfV[SIZE*SIZE];\
1275
    uint8_t halfHV[SIZE*SIZE];\
1276
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
1277
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
1278
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
1279
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
1280
}\
1281

    
1282
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1283
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
1284
#define op_put(a, b)  a = cm[((b) + 16)>>5]
1285
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
1286
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
1287

    
1288
H264_LOWPASS(put_       , op_put, op2_put)
1289
H264_LOWPASS(avg_       , op_avg, op2_avg)
1290
H264_MC(put_, 4)
1291
H264_MC(put_, 8)
1292
H264_MC(put_, 16)
1293
H264_MC(avg_, 4)
1294
H264_MC(avg_, 8)
1295
H264_MC(avg_, 16)
1296

    
1297
#undef op_avg
1298
#undef op_put
1299
#undef op2_avg
1300
#undef op2_put
1301
#endif
1302

    
1303
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1304
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1305

    
1306
    do{
1307
        int src_1,src0,src1,src2,src3,src4,src5,src6,src7,src8,src9;
1308
        uint8_t *s = src;
1309
        src_1 = s[-1];
1310
        src0 = *s++;
1311
        src1 = *s++;
1312
        src2 = *s++;
1313
        dst[0]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1314
        src3 = *s++;
1315
        dst[1]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1316
        src4 = *s++;
1317
        dst[2]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1318
        src5 = *s++;
1319
        dst[3]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1320
        src6 = *s++;
1321
        dst[4]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1322
        src7 = *s++;
1323
        dst[5]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1324
        src8 = *s++;
1325
        dst[6]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1326
        src9 = *s++;
1327
        dst[7]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
1328
        dst+=dstStride;
1329
        src+=srcStride;
1330
    }while(--h);
1331
}
1332

    
1333
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1334
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1335

    
1336
    do{
1337
        int src_1,src0,src1,src2,src3,src4,src5,src6,src7,src8,src9;
1338
        uint8_t *s = src,*d = dst;
1339
        src_1 = *(s-srcStride);
1340
        src0 = *s; s+=srcStride;
1341
        src1 = *s; s+=srcStride;
1342
        src2 = *s; s+=srcStride;
1343
        *d= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4]; d+=dstStride;
1344
        src3 = *s; s+=srcStride;
1345
        *d= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4]; d+=dstStride;
1346
        src4 = *s; s+=srcStride;
1347
        *d= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4]; d+=dstStride;
1348
        src5 = *s; s+=srcStride;
1349
        *d= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4]; d+=dstStride;
1350
        src6 = *s; s+=srcStride;
1351
        *d= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4]; d+=dstStride;
1352
        src7 = *s; s+=srcStride;
1353
        *d= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4]; d+=dstStride;
1354
        src8 = *s; s+=srcStride;
1355
        *d= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4]; d+=dstStride;
1356
        src9 = *s;
1357
        *d= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4]; d+=dstStride;
1358
        src++;
1359
        dst++;
1360
    }while(--w);
1361
}
1362

    
1363
static void put_mspel8_mc00_sh4 (uint8_t *dst, uint8_t *src, int stride){
1364
    put_pixels8_c(dst, src, stride, 8);
1365
}
1366

    
1367
static void put_mspel8_mc10_sh4(uint8_t *dst, uint8_t *src, int stride){
1368
    uint8_t half[64];
1369
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1370
    put_pixels8_l2_aligned2(dst, src, half, stride, stride, 8, 8);
1371
}
1372

    
1373
static void put_mspel8_mc20_sh4(uint8_t *dst, uint8_t *src, int stride){
1374
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
1375
}
1376

    
1377
static void put_mspel8_mc30_sh4(uint8_t *dst, uint8_t *src, int stride){
1378
    uint8_t half[64];
1379
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1380
    put_pixels8_l2_aligned2(dst, src+1, half, stride, stride, 8, 8);
1381
}
1382

    
1383
static void put_mspel8_mc02_sh4(uint8_t *dst, uint8_t *src, int stride){
1384
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
1385
}
1386

    
1387
static void put_mspel8_mc12_sh4(uint8_t *dst, uint8_t *src, int stride){
1388
    uint8_t halfH[88];
1389
    uint8_t halfV[64];
1390
    uint8_t halfHV[64];
1391
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1392
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1393
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1394
    put_pixels8_l2_aligned(dst, halfV, halfHV, stride, 8, 8, 8);
1395
}
1396
static void put_mspel8_mc32_sh4(uint8_t *dst, uint8_t *src, int stride){
1397
    uint8_t halfH[88];
1398
    uint8_t halfV[64];
1399
    uint8_t halfHV[64];
1400
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1401
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1402
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1403
    put_pixels8_l2_aligned(dst, halfV, halfHV, stride, 8, 8, 8);
1404
}
1405
static void put_mspel8_mc22_sh4(uint8_t *dst, uint8_t *src, int stride){
1406
    uint8_t halfH[88];
1407
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1408
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
1409
}