Statistics
| Branch: | Revision:

ffmpeg / libavcodec / sh4 / qpel.c @ 55fde95e

History | View | Annotate | Download (70.5 KB)

1
/*
2
 * This is optimized for sh, which have post increment addressing (*p++).
3
 * Some CPU may be index (p[n]) faster than post increment (*p++).
4
 *
5
 * copyright (c) 2001-2003 BERO <bero@geocities.co.jp>
6
 *
7
 * This file is part of FFmpeg.
8
 *
9
 * FFmpeg is free software; you can redistribute it and/or
10
 * modify it under the terms of the GNU Lesser General Public
11
 * License as published by the Free Software Foundation; either
12
 * version 2.1 of the License, or (at your option) any later version.
13
 *
14
 * FFmpeg is distributed in the hope that it will be useful,
15
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
 * Lesser General Public License for more details.
18
 *
19
 * You should have received a copy of the GNU Lesser General Public
20
 * License along with FFmpeg; if not, write to the Free Software
21
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22
 */
23

    
24
/* Unaligned(?) 32-bit load via pointer cast.  NOTE(review): this
 * technically violates strict aliasing and alignment rules; it relies on
 * the SH4 target / surrounding dsputil conventions tolerating it — the
 * LD32/LP macros used below presumably come from the same family
 * (defined elsewhere; confirm against dsputil headers). */
#define         LD(adr) *(uint32_t*)(adr)
25

    
26
/*
 * PIXOP2(OPNAME, OP): instantiates the OPNAME-prefixed pixel primitives
 * (put/avg) used for motion compensation:
 *   - *_l2_*  : rounded / non-rounded average of two source rows
 *   - *_l4_*  : rounded / non-rounded average of four source rows
 * in 4/8/16-pixel widths and for various source alignment combinations
 * ("aligned"  = all sources 32-bit aligned, "aligned0"/"aligned1"/
 * "aligned2" = the numbered source may be unaligned and is loaded with
 * LD32 instead of LP).  OP stores or averages one 32-bit word at a time.
 */
#define PIXOP2(OPNAME, OP) \
27
/*static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
28
{\
29
        do {\
30
                OP(LP(dst  ),no_rnd_avg32(LD32(src1  ),LD32(src2  )) ); \
31
                OP(LP(dst+4),no_rnd_avg32(LD32(src1+4),LD32(src2+4)) ); \
32
                src1+=src_stride1; \
33
                src2+=src_stride2; \
34
                dst+=dst_stride; \
35
        } while(--h); \
36
}\
37
\
38
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
39
{\
40
        do {\
41
                OP(LP(dst  ),rnd_avg32(LD32(src1  ),LD32(src2  )) ); \
42
                OP(LP(dst+4),rnd_avg32(LD32(src1+4),LD32(src2+4)) ); \
43
                src1+=src_stride1; \
44
                src2+=src_stride2; \
45
                dst+=dst_stride; \
46
        } while(--h); \
47
}\
48
\
49
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
50
{\
51
        do {\
52
                OP(LP(dst  ),rnd_avg32(LD32(src1  ),LD32(src2  )) ); \
53
                src1+=src_stride1; \
54
                src2+=src_stride2; \
55
                dst+=dst_stride; \
56
        } while(--h); \
57
}\
58
\
59
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
60
{\
61
        do {\
62
                OP(LP(dst  ),no_rnd_avg32(LD32(src1  ),LD32(src2  )) ); \
63
                OP(LP(dst+4),no_rnd_avg32(LD32(src1+4),LD32(src2+4)) ); \
64
                OP(LP(dst+8),no_rnd_avg32(LD32(src1+8),LD32(src2+8)) ); \
65
                OP(LP(dst+12),no_rnd_avg32(LD32(src1+12),LD32(src2+12)) ); \
66
                src1+=src_stride1; \
67
                src2+=src_stride2; \
68
                dst+=dst_stride; \
69
        } while(--h); \
70
}\
71
\
72
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
73
{\
74
        do {\
75
                OP(LP(dst  ),rnd_avg32(LD32(src1  ),LD32(src2  )) ); \
76
                OP(LP(dst+4),rnd_avg32(LD32(src1+4),LD32(src2+4)) ); \
77
                OP(LP(dst+8),rnd_avg32(LD32(src1+8),LD32(src2+8)) ); \
78
                OP(LP(dst+12),rnd_avg32(LD32(src1+12),LD32(src2+12)) ); \
79
                src1+=src_stride1; \
80
                src2+=src_stride2; \
81
                dst+=dst_stride; \
82
        } while(--h); \
83
}*/\
84
\
85
static inline void OPNAME ## _pixels4_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
86
{\
87
        do {\
88
                OP(LP(dst  ),rnd_avg32(LP(src1  ),LP(src2  )) ); \
89
                src1+=src_stride1; \
90
                src2+=src_stride2; \
91
                dst+=dst_stride; \
92
        } while(--h); \
93
}\
94
\
95
static inline void OPNAME ## _pixels4_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
96
{\
97
        do {\
98
                OP(LP(dst  ),rnd_avg32(LD32(src1  ),LP(src2  )) ); \
99
                src1+=src_stride1; \
100
                src2+=src_stride2; \
101
                dst+=dst_stride; \
102
        } while(--h); \
103
}\
104
\
105
static inline void OPNAME ## _no_rnd_pixels16_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
106
{\
107
        do {\
108
                OP(LP(dst  ),no_rnd_avg32(LD32(src1  ),LP(src2  )) ); \
109
                OP(LP(dst+4),no_rnd_avg32(LD32(src1+4),LP(src2+4)) ); \
110
                OP(LP(dst+8),no_rnd_avg32(LD32(src1+8),LP(src2+8)) ); \
111
                OP(LP(dst+12),no_rnd_avg32(LD32(src1+12),LP(src2+12)) ); \
112
                src1+=src_stride1; \
113
                src2+=src_stride2; \
114
                dst+=dst_stride; \
115
        } while(--h); \
116
}\
117
\
118
static inline void OPNAME ## _pixels16_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
119
{\
120
        do {\
121
                OP(LP(dst  ),rnd_avg32(LD32(src1  ),LP(src2  )) ); \
122
                OP(LP(dst+4),rnd_avg32(LD32(src1+4),LP(src2+4)) ); \
123
                OP(LP(dst+8),rnd_avg32(LD32(src1+8),LP(src2+8)) ); \
124
                OP(LP(dst+12),rnd_avg32(LD32(src1+12),LP(src2+12)) ); \
125
                src1+=src_stride1; \
126
                src2+=src_stride2; \
127
                dst+=dst_stride; \
128
        } while(--h); \
129
}\
130
\
131
static inline void OPNAME ## _no_rnd_pixels8_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{\
        do { /* only src2 aligned */\
                OP(LP(dst  ),no_rnd_avg32(LD32(src1  ),LP(src2  )) ); \
                OP(LP(dst+4),no_rnd_avg32(LD32(src1+4),LP(src2+4)) ); \
                src1+=src_stride1; \
                src2+=src_stride2; \
                dst+=dst_stride; \
        } while(--h); \
}\
141
\
142
static inline void OPNAME ## _pixels8_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
143
{\
144
        do {\
145
                OP(LP(dst  ),rnd_avg32(LD32(src1  ),LP(src2  )) ); \
146
                OP(LP(dst+4),rnd_avg32(LD32(src1+4),LP(src2+4)) ); \
147
                src1+=src_stride1; \
148
                src2+=src_stride2; \
149
                dst+=dst_stride; \
150
        } while(--h); \
151
}\
152
\
153
static inline void OPNAME ## _no_rnd_pixels8_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
154
{\
155
        do {\
156
                OP(LP(dst  ),no_rnd_avg32(LP(src1  ),LP(src2  )) ); \
157
                OP(LP(dst+4),no_rnd_avg32(LP(src1+4),LP(src2+4)) ); \
158
                src1+=src_stride1; \
159
                src2+=src_stride2; \
160
                dst+=dst_stride; \
161
        } while(--h); \
162
}\
163
\
164
static inline void OPNAME ## _pixels8_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
165
{\
166
        do {\
167
                OP(LP(dst  ),rnd_avg32(LP(src1  ),LP(src2  )) ); \
168
                OP(LP(dst+4),rnd_avg32(LP(src1+4),LP(src2+4)) ); \
169
                src1+=src_stride1; \
170
                src2+=src_stride2; \
171
                dst+=dst_stride; \
172
        } while(--h); \
173
}\
174
\
175
static inline void OPNAME ## _no_rnd_pixels16_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
176
{\
177
        do {\
178
                OP(LP(dst  ),no_rnd_avg32(LP(src1  ),LP(src2  )) ); \
179
                OP(LP(dst+4),no_rnd_avg32(LP(src1+4),LP(src2+4)) ); \
180
                OP(LP(dst+8),no_rnd_avg32(LP(src1+8),LP(src2+8)) ); \
181
                OP(LP(dst+12),no_rnd_avg32(LP(src1+12),LP(src2+12)) ); \
182
                src1+=src_stride1; \
183
                src2+=src_stride2; \
184
                dst+=dst_stride; \
185
        } while(--h); \
186
}\
187
\
188
static inline void OPNAME ## _pixels16_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
189
{\
190
        do {\
191
                OP(LP(dst  ),rnd_avg32(LP(src1  ),LP(src2  )) ); \
192
                OP(LP(dst+4),rnd_avg32(LP(src1+4),LP(src2+4)) ); \
193
                OP(LP(dst+8),rnd_avg32(LP(src1+8),LP(src2+8)) ); \
194
                OP(LP(dst+12),rnd_avg32(LP(src1+12),LP(src2+12)) ); \
195
                src1+=src_stride1; \
196
                src2+=src_stride2; \
197
                dst+=dst_stride; \
198
        } while(--h); \
199
}\
200
\
201
static inline void OPNAME ## _no_rnd_pixels16_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
202
{ OPNAME ## _no_rnd_pixels16_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
203
\
204
static inline void OPNAME ## _pixels16_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
205
{ OPNAME ## _pixels16_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
206
\
207
static inline void OPNAME ## _no_rnd_pixels8_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
208
{ OPNAME ## _no_rnd_pixels8_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
209
\
210
static inline void OPNAME ## _pixels8_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
211
{ OPNAME ## _pixels8_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
212
\
213
static inline void OPNAME ## _pixels8_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
214
        do { \
215
                uint32_t a0,a1,a2,a3; \
216
                UNPACK(a0,a1,LP(src1),LP(src2)); \
217
                UNPACK(a2,a3,LP(src3),LP(src4)); \
218
                OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
219
                UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \
220
                UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
221
                OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \
222
                src1+=src_stride1;\
223
                src2+=src_stride2;\
224
                src3+=src_stride3;\
225
                src4+=src_stride4;\
226
                dst+=dst_stride;\
227
        } while(--h); \
228
} \
229
\
230
static inline void OPNAME ## _no_rnd_pixels8_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
231
        do { \
232
                uint32_t a0,a1,a2,a3; \
233
                UNPACK(a0,a1,LP(src1),LP(src2)); \
234
                UNPACK(a2,a3,LP(src3),LP(src4)); \
235
                OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
236
                UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \
237
                UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
238
                OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
239
                src1+=src_stride1;\
240
                src2+=src_stride2;\
241
                src3+=src_stride3;\
242
                src4+=src_stride4;\
243
                dst+=dst_stride;\
244
        } while(--h); \
245
} \
246
\
247
static inline void OPNAME ## _pixels8_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
248
        do { \
249
                uint32_t a0,a1,a2,a3; /* src1 only not aligned */\
250
                UNPACK(a0,a1,LD32(src1),LP(src2)); \
251
                UNPACK(a2,a3,LP(src3),LP(src4)); \
252
                OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
253
                UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \
254
                UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
255
                OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \
256
                src1+=src_stride1;\
257
                src2+=src_stride2;\
258
                src3+=src_stride3;\
259
                src4+=src_stride4;\
260
                dst+=dst_stride;\
261
        } while(--h); \
262
} \
263
\
264
static inline void OPNAME ## _no_rnd_pixels8_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
265
        do { \
266
                uint32_t a0,a1,a2,a3; \
267
                UNPACK(a0,a1,LD32(src1),LP(src2)); \
268
                UNPACK(a2,a3,LP(src3),LP(src4)); \
269
                OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
270
                UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \
271
                UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
272
                OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
273
                src1+=src_stride1;\
274
                src2+=src_stride2;\
275
                src3+=src_stride3;\
276
                src4+=src_stride4;\
277
                dst+=dst_stride;\
278
        } while(--h); \
279
} \
280
\
281
static inline void OPNAME ## _pixels16_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
        /* 16-pixel-wide rounded 4-source average; all sources aligned. */\
        do { \
                uint32_t a0,a1,a2,a3; \
                UNPACK(a0,a1,LP(src1),LP(src2)); \
                UNPACK(a2,a3,LP(src3),LP(src4)); \
                OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
                UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \
                UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
                /* FIX: was OP(LP(dst+8),...) — left bytes 4..7 unwritten
                 * and stored the second word over the third; the
                 * no_rnd variant below uses the correct dst+4. */\
                OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \
                UNPACK(a0,a1,LP(src1+8),LP(src2+8)); \
                UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \
                OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \
                UNPACK(a0,a1,LP(src1+12),LP(src2+12)); \
                UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \
                OP(LP(dst+12),rnd_PACK(a0,a1,a2,a3)); \
                src1+=src_stride1;\
                src2+=src_stride2;\
                src3+=src_stride3;\
                src4+=src_stride4;\
                dst+=dst_stride;\
        } while(--h); \
} \
303
\
304
static inline void OPNAME ## _no_rnd_pixels16_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
305
        do { \
306
                uint32_t a0,a1,a2,a3; \
307
                UNPACK(a0,a1,LP(src1),LP(src2)); \
308
                UNPACK(a2,a3,LP(src3),LP(src4)); \
309
                OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
310
                UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \
311
                UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
312
                OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
313
                UNPACK(a0,a1,LP(src1+8),LP(src2+8)); \
314
                UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \
315
                OP(LP(dst+8),no_rnd_PACK(a0,a1,a2,a3)); \
316
                UNPACK(a0,a1,LP(src1+12),LP(src2+12)); \
317
                UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \
318
                OP(LP(dst+12),no_rnd_PACK(a0,a1,a2,a3)); \
319
                src1+=src_stride1;\
320
                src2+=src_stride2;\
321
                src3+=src_stride3;\
322
                src4+=src_stride4;\
323
                dst+=dst_stride;\
324
        } while(--h); \
325
} \
326
\
327
static inline void OPNAME ## _pixels16_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
        do { /* src1 is unaligned */\
                uint32_t a0,a1,a2,a3; \
                UNPACK(a0,a1,LD32(src1),LP(src2)); \
                UNPACK(a2,a3,LP(src3),LP(src4)); \
                OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
                UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \
                UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
                /* FIX: was OP(LP(dst+8),...) — second output word must go
                 * to dst+4 (matches the no_rnd variant of this routine). */\
                OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \
                UNPACK(a0,a1,LD32(src1+8),LP(src2+8)); \
                UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \
                OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \
                UNPACK(a0,a1,LD32(src1+12),LP(src2+12)); \
                UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \
                OP(LP(dst+12),rnd_PACK(a0,a1,a2,a3)); \
                src1+=src_stride1;\
                src2+=src_stride2;\
                src3+=src_stride3;\
                src4+=src_stride4;\
                dst+=dst_stride;\
        } while(--h); \
} \
349
\
350
static inline void OPNAME ## _no_rnd_pixels16_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
351
        do { \
352
                uint32_t a0,a1,a2,a3; \
353
                UNPACK(a0,a1,LD32(src1),LP(src2)); \
354
                UNPACK(a2,a3,LP(src3),LP(src4)); \
355
                OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
356
                UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \
357
                UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
358
                OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
359
                UNPACK(a0,a1,LD32(src1+8),LP(src2+8)); \
360
                UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \
361
                OP(LP(dst+8),no_rnd_PACK(a0,a1,a2,a3)); \
362
                UNPACK(a0,a1,LD32(src1+12),LP(src2+12)); \
363
                UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \
364
                OP(LP(dst+12),no_rnd_PACK(a0,a1,a2,a3)); \
365
                src1+=src_stride1;\
366
                src2+=src_stride2;\
367
                src3+=src_stride3;\
368
                src4+=src_stride4;\
369
                dst+=dst_stride;\
370
        } while(--h); \
371
} \
372
\
373

    
374
/* OP plugs for PIXOP2: average the new word into dst, or plain store. */
#define op_avg(a, b) a = rnd_avg32(a,b)
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

/* Scalar rounded averages of 2 and 4 pixel values. */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
384

    
385

    
386
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    /* Bilinear interpolation for one-MV global motion compensation:
     * every output pixel is the weighted sum of its 2x2 source
     * neighbourhood, with weights derived from the fractional position
     * (x16, y16) in 1/16-pel units.  Weights sum to 256, hence >>8. */
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int x;

    do {
        /* The original hand-unrolled t0..t3 register dance reads exactly
         * src[x], src[x+1], src[x+stride], src[x+stride+1] per pixel. */
        for (x = 0; x < 8; x++)
            dst[x] = (A * src[x]              + B * src[x + 1]
                    + C * src[x + stride]     + D * src[x + stride + 1]
                    + rounder) >> 8;
        dst += stride;
        src += stride;
    } while (--h);
}
418

    
419
/*
 * Global motion compensation with an affine per-pixel motion field.
 * (ox,oy) is the 16.16 fixed-point source position of the first output
 * pixel; (dxx,dyx) advance it per output column, (dxy,dyy) per output
 * row.  Produces an 8-pixel-wide block of h rows with bilinear
 * interpolation at 1/(1<<shift)-pel precision; r is the rounding
 * constant.  Samples outside [0,width)x[0,height) are clamped to the
 * picture edge via clip() (defined elsewhere in this library).
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;  /* one full pel in sub-pel units */

    /* convert to last-valid-index for the edge clamps below */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            /* fractional part first, then reduce to integer pel */
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* unsigned compare folds the <0 and >=width tests into one */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2x2 bilinear */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* off the top/bottom: clamp y, interpolate in x only */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* off the left/right: clamp x, interpolate in y only */
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* corner: nearest clamped sample, no interpolation */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
476
/*
 * H264_CHROMA_MC(OPNAME, OP): instantiates the 2/4/8-pixel-wide bilinear
 * chroma motion-compensation routines.  (x, y) is the fractional sample
 * position in 1/8-pel units; A..D are the four bilinear weights (they sum
 * to 64).  OP receives the raw weighted sum; the op_put/op_avg plugs
 * below apply the +32 rounding and >>6 normalization.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    do {\
        int t0,t1,t2,t3; \
        uint8_t *s0 = src; \
        uint8_t *s1 = src+stride; \
        t0 = *s0++; t2 = *s1++; \
        t1 = *s0++; t3 = *s1++; \
        OP(dst[0], (A*t0 + B*t1 + C*t2 + D*t3));\
        t0 = *s0++; t2 = *s1++; \
        OP(dst[1], (A*t1 + B*t0 + C*t3 + D*t2));\
        dst+= stride;\
        src+= stride;\
    }while(--h);\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    do {\
        int t0,t1,t2,t3; \
        uint8_t *s0 = src; \
        uint8_t *s1 = src+stride; \
        t0 = *s0++; t2 = *s1++; \
        t1 = *s0++; t3 = *s1++; \
        OP(dst[0], (A*t0 + B*t1 + C*t2 + D*t3));\
        t0 = *s0++; t2 = *s1++; \
        OP(dst[1], (A*t1 + B*t0 + C*t3 + D*t2));\
        t1 = *s0++; t3 = *s1++; \
        OP(dst[2], (A*t0 + B*t1 + C*t2 + D*t3));\
        t0 = *s0++; t2 = *s1++; \
        OP(dst[3], (A*t1 + B*t0 + C*t3 + D*t2));\
        dst+= stride;\
        src+= stride;\
    }while(--h);\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    do {\
        int t0,t1,t2,t3; \
        uint8_t *s0 = src; \
        uint8_t *s1 = src+stride; \
        t0 = *s0++; t2 = *s1++; \
        t1 = *s0++; t3 = *s1++; \
        OP(dst[0], (A*t0 + B*t1 + C*t2 + D*t3));\
        t0 = *s0++; t2 = *s1++; \
        OP(dst[1], (A*t1 + B*t0 + C*t3 + D*t2));\
        t1 = *s0++; t3 = *s1++; \
        OP(dst[2], (A*t0 + B*t1 + C*t2 + D*t3));\
        t0 = *s0++; t2 = *s1++; \
        OP(dst[3], (A*t1 + B*t0 + C*t3 + D*t2));\
        t1 = *s0++; t3 = *s1++; \
        OP(dst[4], (A*t0 + B*t1 + C*t2 + D*t3));\
        t0 = *s0++; t2 = *s1++; \
        OP(dst[5], (A*t1 + B*t0 + C*t3 + D*t2));\
        t1 = *s0++; t3 = *s1++; \
        OP(dst[6], (A*t0 + B*t1 + C*t2 + D*t3));\
        t0 = *s0++; t2 = *s1++; \
        OP(dst[7], (A*t1 + B*t0 + C*t3 + D*t2));\
        dst+= stride;\
        src+= stride;\
    }while(--h);\
}
558

    
559
/* Chroma OP plugs: b is the raw 6-bit-weighted sum; (+32)>>6 rounds and
 * normalizes it, op_avg additionally averages with the existing dst. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
566

    
567
/* not yet optimized */
568
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy a 4-pixel-wide block of h rows, one 32-bit word per row. */
    while (h--) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
578

    
579
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy an 8-pixel-wide block of h rows, two 32-bit words per row. */
    while (h--) {
        ST32(dst,     LD32(src));
        ST32(dst + 4, LD32(src + 4));
        dst += dstStride;
        src += srcStride;
    }
}
590

    
591
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy a 16-pixel-wide block of h rows, four 32-bit words per row. */
    while (h--) {
        ST32(dst,      LD32(src));
        ST32(dst + 4,  LD32(src + 4));
        ST32(dst + 8,  LD32(src + 8));
        ST32(dst + 12, LD32(src + 12));
        dst += dstStride;
        src += srcStride;
    }
}
604

    
605
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* 17-pixel-wide copy (qpel edge case): four words plus one byte. */
    while (h--) {
        ST32(dst,      LD32(src));
        ST32(dst + 4,  LD32(src + 4));
        ST32(dst + 8,  LD32(src + 8));
        ST32(dst + 12, LD32(src + 12));
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}
619

    
620
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* 9-pixel-wide copy (qpel edge case): two words plus one byte. */
    while (h--) {
        ST32(dst,     LD32(src));
        ST32(dst + 4, LD32(src + 4));
        dst[8] = src[8];
        dst += dstStride;
        src += srcStride;
    }
}
632
/* end not optimized */
633

    
634
#define QPEL_MC(r, OPNAME, RND, OP) \
635
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
636
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
637
    do {\
638
        uint8_t *s = src; \
639
        int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
640
        src0= *s++;\
641
        src1= *s++;\
642
        src2= *s++;\
643
        src3= *s++;\
644
        src4= *s++;\
645
        OP(dst[0], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
646
        src5= *s++;\
647
        OP(dst[1], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
648
        src6= *s++;\
649
        OP(dst[2], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
650
        src7= *s++;\
651
        OP(dst[3], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
652
        src8= *s++;\
653
        OP(dst[4], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
654
        OP(dst[5], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
655
        OP(dst[6], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
656
        OP(dst[7], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
657
        dst+=dstStride;\
658
        src+=srcStride;\
659
    }while(--h);\
660
}\
661
\
662
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
663
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
664
    int w=8;\
665
    do{\
666
        uint8_t *s = src, *d=dst;\
667
        int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
668
        src0 = *s; s+=srcStride; \
669
        src1 = *s; s+=srcStride; \
670
        src2 = *s; s+=srcStride; \
671
        src3 = *s; s+=srcStride; \
672
        src4 = *s; s+=srcStride; \
673
        OP(*d, (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));d+=dstStride;\
674
        src5 = *s; s+=srcStride; \
675
        OP(*d, (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));d+=dstStride;\
676
        src6 = *s; s+=srcStride; \
677
        OP(*d, (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));d+=dstStride;\
678
        src7 = *s; s+=srcStride; \
679
        OP(*d, (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));d+=dstStride;\
680
        src8 = *s; \
681
        OP(*d, (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));d+=dstStride;\
682
        OP(*d, (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));d+=dstStride;\
683
        OP(*d, (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));d+=dstStride;\
684
        OP(*d, (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
685
        dst++;\
686
        src++;\
687
    }while(--w);\
688
}\
689
\
690
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
691
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
692
    do {\
693
        uint8_t *s = src;\
694
        int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
695
        int src9,src10,src11,src12,src13,src14,src15,src16;\
696
        src0= *s++;\
697
        src1= *s++;\
698
        src2= *s++;\
699
        src3= *s++;\
700
        src4= *s++;\
701
        OP(dst[ 0], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
702
        src5= *s++;\
703
        OP(dst[ 1], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
704
        src6= *s++;\
705
        OP(dst[ 2], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
706
        src7= *s++;\
707
        OP(dst[ 3], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
708
        src8= *s++;\
709
        OP(dst[ 4], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
710
        src9= *s++;\
711
        OP(dst[ 5], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
712
        src10= *s++;\
713
        OP(dst[ 6], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
714
        src11= *s++;\
715
        OP(dst[ 7], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
716
        src12= *s++;\
717
        OP(dst[ 8], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
718
        src13= *s++;\
719
        OP(dst[ 9], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
720
        src14= *s++;\
721
        OP(dst[10], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
722
        src15= *s++;\
723
        OP(dst[11], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
724
        src16= *s++;\
725
        OP(dst[12], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
726
        OP(dst[13], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
727
        OP(dst[14], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
728
        OP(dst[15], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
729
        dst+=dstStride;\
730
        src+=srcStride;\
731
    }while(--h);\
732
}\
733
\
734
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
735
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
736
    int w=16;\
737
    do {\
738
        uint8_t *s = src, *d=dst;\
739
        int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
740
        int src9,src10,src11,src12,src13,src14,src15,src16;\
741
        src0 = *s; s+=srcStride; \
742
        src1 = *s; s+=srcStride; \
743
        src2 = *s; s+=srcStride; \
744
        src3 = *s; s+=srcStride; \
745
        src4 = *s; s+=srcStride; \
746
        OP(*d, (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));d+=dstStride;\
747
        src5 = *s; s+=srcStride; \
748
        OP(*d, (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));d+=dstStride;\
749
        src6 = *s; s+=srcStride; \
750
        OP(*d, (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));d+=dstStride;\
751
        src7 = *s; s+=srcStride; \
752
        OP(*d, (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));d+=dstStride;\
753
        src8 = *s; s+=srcStride; \
754
        OP(*d, (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));d+=dstStride;\
755
        src9 = *s; s+=srcStride; \
756
        OP(*d, (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));d+=dstStride;\
757
        src10 = *s; s+=srcStride; \
758
        OP(*d, (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));d+=dstStride;\
759
        src11 = *s; s+=srcStride; \
760
        OP(*d, (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));d+=dstStride;\
761
        src12 = *s; s+=srcStride; \
762
        OP(*d, (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));d+=dstStride;\
763
        src13 = *s; s+=srcStride; \
764
        OP(*d, (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));d+=dstStride;\
765
        src14 = *s; s+=srcStride; \
766
        OP(*d, (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));d+=dstStride;\
767
        src15 = *s; s+=srcStride; \
768
        OP(*d, (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));d+=dstStride;\
769
        src16 = *s; \
770
        OP(*d, (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));d+=dstStride;\
771
        OP(*d, (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));d+=dstStride;\
772
        OP(*d, (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));d+=dstStride;\
773
        OP(*d, (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
774
        dst++;\
775
        src++;\
776
    }while(--w);\
777
}\
778
\
779
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
780
    OPNAME ## pixels8_c(dst, src, stride, 8);\
781
}\
782
\
783
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
784
    uint8_t half[64];\
785
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
786
    OPNAME ## pixels8_l2_aligned2(dst, src, half, stride, stride, 8, 8);\
787
}\
788
\
789
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
790
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
791
}\
792
\
793
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
794
    uint8_t half[64];\
795
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
796
    OPNAME ## pixels8_l2_aligned2(dst, src+1, half, stride, stride, 8, 8);\
797
}\
798
\
799
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
800
    uint8_t full[16*9];\
801
    uint8_t half[64];\
802
    copy_block9(full, src, 16, stride, 9);\
803
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
804
    OPNAME ## pixels8_l2_aligned(dst, full, half, stride, 16, 8, 8);\
805
}\
806
\
807
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
808
    uint8_t full[16*9];\
809
    copy_block9(full, src, 16, stride, 9);\
810
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
811
}\
812
\
813
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
814
    uint8_t full[16*9];\
815
    uint8_t half[64];\
816
    copy_block9(full, src, 16, stride, 9);\
817
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
818
    OPNAME ## pixels8_l2_aligned(dst, full+16, half, stride, 16, 8, 8);\
819
}\
820
static void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
821
    uint8_t full[16*9];\
822
    uint8_t halfH[72];\
823
    uint8_t halfV[64];\
824
    uint8_t halfHV[64];\
825
    copy_block9(full, src, 16, stride, 9);\
826
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
827
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
828
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
829
    OPNAME ## pixels8_l4_aligned(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
830
}\
831
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
832
    uint8_t full[16*9];\
833
    uint8_t halfH[72];\
834
    uint8_t halfHV[64];\
835
    copy_block9(full, src, 16, stride, 9);\
836
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
837
    put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9);\
838
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
839
    OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
840
}\
841
static void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
842
    uint8_t full[16*9];\
843
    uint8_t halfH[72];\
844
    uint8_t halfV[64];\
845
    uint8_t halfHV[64];\
846
    copy_block9(full, src, 16, stride, 9);\
847
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
848
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
849
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
850
    OPNAME ## pixels8_l4_aligned0(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
851
}\
852
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
853
    uint8_t full[16*9];\
854
    uint8_t halfH[72];\
855
    uint8_t halfHV[64];\
856
    copy_block9(full, src, 16, stride, 9);\
857
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
858
    put ## RND ## pixels8_l2_aligned1(halfH, halfH, full+1, 8, 8, 16, 9);\
859
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
860
    OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
861
}\
862
static void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
863
    uint8_t full[16*9];\
864
    uint8_t halfH[72];\
865
    uint8_t halfV[64];\
866
    uint8_t halfHV[64];\
867
    copy_block9(full, src, 16, stride, 9);\
868
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
869
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
870
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
871
    OPNAME ## pixels8_l4_aligned(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
872
}\
873
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
874
    uint8_t full[16*9];\
875
    uint8_t halfH[72];\
876
    uint8_t halfHV[64];\
877
    copy_block9(full, src, 16, stride, 9);\
878
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
879
    put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9);\
880
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
881
    OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
882
}\
883
static void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
884
    uint8_t full[16*9];\
885
    uint8_t halfH[72];\
886
    uint8_t halfV[64];\
887
    uint8_t halfHV[64];\
888
    copy_block9(full, src, 16, stride, 9);\
889
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
890
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
891
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
892
    OPNAME ## pixels8_l4_aligned0(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
893
}\
894
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
895
    uint8_t full[16*9];\
896
    uint8_t halfH[72];\
897
    uint8_t halfHV[64];\
898
    copy_block9(full, src, 16, stride, 9);\
899
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
900
    put ## RND ## pixels8_l2_aligned1(halfH, halfH, full+1, 8, 8, 16, 9);\
901
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
902
    OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
903
}\
904
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
905
    uint8_t halfH[72];\
906
    uint8_t halfHV[64];\
907
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
908
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
909
    OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
910
}\
911
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
912
    uint8_t halfH[72];\
913
    uint8_t halfHV[64];\
914
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
915
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
916
    OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
917
}\
918
static void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
919
    uint8_t full[16*9];\
920
    uint8_t halfH[72];\
921
    uint8_t halfV[64];\
922
    uint8_t halfHV[64];\
923
    copy_block9(full, src, 16, stride, 9);\
924
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
925
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
926
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
927
    OPNAME ## pixels8_l2_aligned(dst, halfV, halfHV, stride, 8, 8, 8);\
928
}\
929
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
930
    uint8_t full[16*9];\
931
    uint8_t halfH[72];\
932
    copy_block9(full, src, 16, stride, 9);\
933
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
934
    put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9);\
935
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
936
}\
937
static void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
938
    uint8_t full[16*9];\
939
    uint8_t halfH[72];\
940
    uint8_t halfV[64];\
941
    uint8_t halfHV[64];\
942
    copy_block9(full, src, 16, stride, 9);\
943
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
944
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
945
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
946
    OPNAME ## pixels8_l2_aligned(dst, halfV, halfHV, stride, 8, 8, 8);\
947
}\
948
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
949
    uint8_t full[16*9];\
950
    uint8_t halfH[72];\
951
    copy_block9(full, src, 16, stride, 9);\
952
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
953
    put ## RND ## pixels8_l2_aligned1(halfH, halfH, full+1, 8, 8, 16, 9);\
954
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
955
}\
956
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
957
    uint8_t halfH[72];\
958
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
959
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
960
}\
961
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
962
    OPNAME ## pixels16_c(dst, src, stride, 16);\
963
}\
964
\
965
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
966
    uint8_t half[256];\
967
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
968
    OPNAME ## pixels16_l2_aligned2(dst, src, half, stride, stride, 16, 16);\
969
}\
970
\
971
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
972
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
973
}\
974
\
975
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
976
    uint8_t half[256];\
977
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
978
    OPNAME ## pixels16_l2_aligned2(dst, src+1, half, stride, stride, 16, 16);\
979
}\
980
\
981
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
982
    uint8_t full[24*17];\
983
    uint8_t half[256];\
984
    copy_block17(full, src, 24, stride, 17);\
985
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
986
    OPNAME ## pixels16_l2_aligned(dst, full, half, stride, 24, 16, 16);\
987
}\
988
\
989
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
990
    uint8_t full[24*17];\
991
    copy_block17(full, src, 24, stride, 17);\
992
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
993
}\
994
\
995
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
996
    uint8_t full[24*17];\
997
    uint8_t half[256];\
998
    copy_block17(full, src, 24, stride, 17);\
999
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1000
    OPNAME ## pixels16_l2_aligned(dst, full+24, half, stride, 24, 16, 16);\
1001
}\
1002
static void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1003
    uint8_t full[24*17];\
1004
    uint8_t halfH[272];\
1005
    uint8_t halfV[256];\
1006
    uint8_t halfHV[256];\
1007
    copy_block17(full, src, 24, stride, 17);\
1008
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1009
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1010
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1011
    OPNAME ## pixels16_l4_aligned(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1012
}\
1013
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1014
    uint8_t full[24*17];\
1015
    uint8_t halfH[272];\
1016
    uint8_t halfHV[256];\
1017
    copy_block17(full, src, 24, stride, 17);\
1018
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1019
    put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17);\
1020
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1021
    OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
1022
}\
1023
static void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1024
    uint8_t full[24*17];\
1025
    uint8_t halfH[272];\
1026
    uint8_t halfV[256];\
1027
    uint8_t halfHV[256];\
1028
    copy_block17(full, src, 24, stride, 17);\
1029
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1030
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1031
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1032
    OPNAME ## pixels16_l4_aligned0(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1033
}\
1034
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1035
    uint8_t full[24*17];\
1036
    uint8_t halfH[272];\
1037
    uint8_t halfHV[256];\
1038
    copy_block17(full, src, 24, stride, 17);\
1039
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1040
    put ## RND ## pixels16_l2_aligned1(halfH, halfH, full+1, 16, 16, 24, 17);\
1041
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1042
    OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
1043
}\
1044
static void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1045
    uint8_t full[24*17];\
1046
    uint8_t halfH[272];\
1047
    uint8_t halfV[256];\
1048
    uint8_t halfHV[256];\
1049
    copy_block17(full, src, 24, stride, 17);\
1050
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1051
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1052
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1053
    OPNAME ## pixels16_l4_aligned(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1054
}\
1055
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1056
    uint8_t full[24*17];\
1057
    uint8_t halfH[272];\
1058
    uint8_t halfHV[256];\
1059
    copy_block17(full, src, 24, stride, 17);\
1060
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1061
    put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17);\
1062
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1063
    OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1064
}\
1065
static void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1066
    uint8_t full[24*17];\
1067
    uint8_t halfH[272];\
1068
    uint8_t halfV[256];\
1069
    uint8_t halfHV[256];\
1070
    copy_block17(full, src, 24, stride, 17);\
1071
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1072
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1073
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1074
    OPNAME ## pixels16_l4_aligned0(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1075
}\
1076
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1077
    uint8_t full[24*17];\
1078
    uint8_t halfH[272];\
1079
    uint8_t halfHV[256];\
1080
    copy_block17(full, src, 24, stride, 17);\
1081
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1082
    put ## RND ## pixels16_l2_aligned1(halfH, halfH, full+1, 16, 16, 24, 17);\
1083
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1084
    OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1085
}\
1086
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1087
    uint8_t halfH[272];\
1088
    uint8_t halfHV[256];\
1089
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1090
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1091
    OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
1092
}\
1093
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1094
    uint8_t halfH[272];\
1095
    uint8_t halfHV[256];\
1096
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1097
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1098
    OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1099
}\
1100
static void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1101
    uint8_t full[24*17];\
1102
    uint8_t halfH[272];\
1103
    uint8_t halfV[256];\
1104
    uint8_t halfHV[256];\
1105
    copy_block17(full, src, 24, stride, 17);\
1106
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1107
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1108
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1109
    OPNAME ## pixels16_l2_aligned(dst, halfV, halfHV, stride, 16, 16, 16);\
1110
}\
1111
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1112
    uint8_t full[24*17];\
1113
    uint8_t halfH[272];\
1114
    copy_block17(full, src, 24, stride, 17);\
1115
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1116
    put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17);\
1117
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1118
}\
1119
static void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1120
    uint8_t full[24*17];\
1121
    uint8_t halfH[272];\
1122
    uint8_t halfV[256];\
1123
    uint8_t halfHV[256];\
1124
    copy_block17(full, src, 24, stride, 17);\
1125
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1126
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1127
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1128
    OPNAME ## pixels16_l2_aligned(dst, halfV, halfHV, stride, 16, 16, 16);\
1129
}\
1130
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1131
    uint8_t full[24*17];\
1132
    uint8_t halfH[272];\
1133
    copy_block17(full, src, 24, stride, 17);\
1134
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1135
    put ## RND ## pixels16_l2_aligned1(halfH, halfH, full+1, 16, 16, 24, 17);\
1136
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1137
}\
1138
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1139
    uint8_t halfH[272];\
1140
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1141
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1142
}
1143

    
1144
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1145
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1146
#define op_put(a, b) a = cm[((b) + 16)>>5]
1147
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1148

    
1149
QPEL_MC(0, put_       , _       , op_put)
1150
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1151
QPEL_MC(0, avg_       , _       , op_avg)
1152
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
1153
#undef op_avg
1154
#undef op_avg_no_rnd
1155
#undef op_put
1156
#undef op_put_no_rnd
1157

    
1158
#if 1
1159
#define H264_LOWPASS(OPNAME, OP, OP2) \
/* Horizontal H.264 6-tap half-pel filter (1,-5,20,20,-5,1): per row,
 * reads w+5 pixels starting at src-2 and writes w outputs.  The w>4 /
 * w>8 branches unroll the 8- and 16-wide tails. */\
static inline void OPNAME ## h264_qpel_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride,int w,int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    do {\
        int srcB,srcA,src0,src1,src2,src3,src4,src5,src6;\
        uint8_t *s = src-2;\
        srcB = *s++;\
        srcA = *s++;\
        src0 = *s++;\
        src1 = *s++;\
        src2 = *s++;\
        src3 = *s++;\
        OP(dst[0], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        src4 = *s++;\
        OP(dst[1], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        src5 = *s++;\
        OP(dst[2], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        src6 = *s++;\
        OP(dst[3], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
      if (w>4) { /* unrolled tail for the 8-wide case */ \
        int src7,src8,src9,src10; \
        src7 = *s++;\
        OP(dst[4], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        src8 = *s++;\
        OP(dst[5], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        src9 = *s++;\
        OP(dst[6], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        src10 = *s++;\
        OP(dst[7], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
       if (w>8) { /* unrolled tail for the 16-wide case */ \
        int src11,src12,src13,src14,src15,src16,src17,src18; \
        src11 = *s++;\
        OP(dst[8] , (src8 +src9 )*20 - (src7 +src10)*5 + (src6 +src11));\
        src12 = *s++;\
        OP(dst[9] , (src9 +src10)*20 - (src8 +src11)*5 + (src7 +src12));\
        src13 = *s++;\
        OP(dst[10], (src10+src11)*20 - (src9 +src12)*5 + (src8 +src13));\
        src14 = *s++;\
        OP(dst[11], (src11+src12)*20 - (src10+src13)*5 + (src9 +src14));\
        src15 = *s++;\
        OP(dst[12], (src12+src13)*20 - (src11+src14)*5 + (src10+src15));\
        src16 = *s++;\
        OP(dst[13], (src13+src14)*20 - (src12+src15)*5 + (src11+src16));\
        src17 = *s++;\
        OP(dst[14], (src14+src15)*20 - (src13+src16)*5 + (src12+src17));\
        src18 = *s++;\
        OP(dst[15], (src15+src16)*20 - (src14+src17)*5 + (src13+src18));\
       } \
      } \
        dst+=dstStride;\
        src+=srcStride;\
    }while(--h);\
}\
\
static inline void OPNAME ## h264_qpel_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride,int w,int h){\
1214
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1215
    do{\
1216
        int srcB,srcA,src0,src1,src2,src3,src4,src5,src6;\
1217
        uint8_t *s = src-2*srcStride,*d=dst;\
1218
        srcB = *s; s+=srcStride;\
1219
        srcA = *s; s+=srcStride;\
1220
        src0 = *s; s+=srcStride;\
1221
        src1 = *s; s+=srcStride;\
1222
        src2 = *s; s+=srcStride;\
1223
        src3 = *s; s+=srcStride;\
1224
        OP(*d, (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));d+=dstStride;\
1225
        src4 = *s; s+=srcStride;\
1226
        OP(*d, (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));d+=dstStride;\
1227
        src5 = *s; s+=srcStride;\
1228
        OP(*d, (src2+src3)*20 - (src1+src4)*5 + (src0+src5));d+=dstStride;\
1229
        src6 = *s; s+=srcStride;\
1230
        OP(*d, (src3+src4)*20 - (src2+src5)*5 + (src1+src6));d+=dstStride;\
1231
      if (h>4) { \
1232
        int src7,src8,src9,src10; \
1233
        src7 = *s; s+=srcStride;\
1234
        OP(*d, (src4+src5)*20 - (src3+src6)*5 + (src2+src7));d+=dstStride;\
1235
        src8 = *s; s+=srcStride;\
1236
        OP(*d, (src5+src6)*20 - (src4+src7)*5 + (src3+src8));d+=dstStride;\
1237
        src9 = *s; s+=srcStride;\
1238
        OP(*d, (src6+src7)*20 - (src5+src8)*5 + (src4+src9));d+=dstStride;\
1239
        src10 = *s; s+=srcStride;\
1240
        OP(*d, (src7+src8)*20 - (src6+src9)*5 + (src5+src10));d+=dstStride;\
1241
       if (h>8) { \
1242
        int src11,src12,src13,src14,src15,src16,src17,src18; \
1243
        src11 = *s; s+=srcStride;\
1244
        OP(*d , (src8 +src9 )*20 - (src7 +src10)*5 + (src6 +src11));d+=dstStride;\
1245
        src12 = *s; s+=srcStride;\
1246
        OP(*d , (src9 +src10)*20 - (src8 +src11)*5 + (src7 +src12));d+=dstStride;\
1247
        src13 = *s; s+=srcStride;\
1248
        OP(*d, (src10+src11)*20 - (src9 +src12)*5 + (src8 +src13));d+=dstStride;\
1249
        src14 = *s; s+=srcStride;\
1250
        OP(*d, (src11+src12)*20 - (src10+src13)*5 + (src9 +src14));d+=dstStride;\
1251
        src15 = *s; s+=srcStride;\
1252
        OP(*d, (src12+src13)*20 - (src11+src14)*5 + (src10+src15));d+=dstStride;\
1253
        src16 = *s; s+=srcStride;\
1254
        OP(*d, (src13+src14)*20 - (src12+src15)*5 + (src11+src16));d+=dstStride;\
1255
        src17 = *s; s+=srcStride;\
1256
        OP(*d, (src14+src15)*20 - (src13+src16)*5 + (src12+src17));d+=dstStride;\
1257
        src18 = *s; s+=srcStride;\
1258
        OP(*d, (src15+src16)*20 - (src14+src17)*5 + (src13+src18));d+=dstStride;\
1259
       } \
1260
      } \
1261
        dst++;\
1262
        src++;\
1263
    }while(--w);\
1264
}\
1265
\
1266
static inline void OPNAME ## h264_qpel_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride,int w,int h){\
1267
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1268
    int i;\
1269
    src -= 2*srcStride;\
1270
    i= h+5; \
1271
    do {\
1272
        int srcB,srcA,src0,src1,src2,src3,src4,src5,src6;\
1273
        uint8_t *s = src-2;\
1274
        srcB = *s++;\
1275
        srcA = *s++;\
1276
        src0 = *s++;\
1277
        src1 = *s++;\
1278
        src2 = *s++;\
1279
        src3 = *s++;\
1280
        tmp[0] = ((src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1281
        src4 = *s++;\
1282
        tmp[1] = ((src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1283
        src5 = *s++;\
1284
        tmp[2] = ((src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1285
        src6 = *s++;\
1286
        tmp[3] = ((src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1287
      if (w>4) { /* it optimized */ \
1288
        int src7,src8,src9,src10; \
1289
        src7 = *s++;\
1290
        tmp[4] = ((src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
1291
        src8 = *s++;\
1292
        tmp[5] = ((src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
1293
        src9 = *s++;\
1294
        tmp[6] = ((src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
1295
        src10 = *s++;\
1296
        tmp[7] = ((src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
1297
       if (w>8) { \
1298
        int src11,src12,src13,src14,src15,src16,src17,src18; \
1299
        src11 = *s++;\
1300
        tmp[8] = ((src8 +src9 )*20 - (src7 +src10)*5 + (src6 +src11));\
1301
        src12 = *s++;\
1302
        tmp[9] = ((src9 +src10)*20 - (src8 +src11)*5 + (src7 +src12));\
1303
        src13 = *s++;\
1304
        tmp[10] = ((src10+src11)*20 - (src9 +src12)*5 + (src8 +src13));\
1305
        src14 = *s++;\
1306
        tmp[11] = ((src11+src12)*20 - (src10+src13)*5 + (src9 +src14));\
1307
        src15 = *s++;\
1308
        tmp[12] = ((src12+src13)*20 - (src11+src14)*5 + (src10+src15));\
1309
        src16 = *s++;\
1310
        tmp[13] = ((src13+src14)*20 - (src12+src15)*5 + (src11+src16));\
1311
        src17 = *s++;\
1312
        tmp[14] = ((src14+src15)*20 - (src13+src16)*5 + (src12+src17));\
1313
        src18 = *s++;\
1314
        tmp[15] = ((src15+src16)*20 - (src14+src17)*5 + (src13+src18));\
1315
       } \
1316
      } \
1317
        tmp+=tmpStride;\
1318
        src+=srcStride;\
1319
    }while(--i);\
1320
    tmp -= tmpStride*(h+5-2);\
1321
    i = w; \
1322
    do {\
1323
        int tmpB,tmpA,tmp0,tmp1,tmp2,tmp3,tmp4,tmp5,tmp6;\
1324
        int16_t *s = tmp-2*tmpStride; \
1325
        uint8_t *d=dst;\
1326
        tmpB = *s; s+=tmpStride;\
1327
        tmpA = *s; s+=tmpStride;\
1328
        tmp0 = *s; s+=tmpStride;\
1329
        tmp1 = *s; s+=tmpStride;\
1330
        tmp2 = *s; s+=tmpStride;\
1331
        tmp3 = *s; s+=tmpStride;\
1332
        OP2(*d, (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));d+=dstStride;\
1333
        tmp4 = *s; s+=tmpStride;\
1334
        OP2(*d, (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));d+=dstStride;\
1335
        tmp5 = *s; s+=tmpStride;\
1336
        OP2(*d, (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));d+=dstStride;\
1337
        tmp6 = *s; s+=tmpStride;\
1338
        OP2(*d, (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));d+=dstStride;\
1339
      if (h>4) { \
1340
        int tmp7,tmp8,tmp9,tmp10; \
1341
        tmp7 = *s; s+=tmpStride;\
1342
        OP2(*d, (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));d+=dstStride;\
1343
        tmp8 = *s; s+=tmpStride;\
1344
        OP2(*d, (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));d+=dstStride;\
1345
        tmp9 = *s; s+=tmpStride;\
1346
        OP2(*d, (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));d+=dstStride;\
1347
        tmp10 = *s; s+=tmpStride;\
1348
        OP2(*d, (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));d+=dstStride;\
1349
       if (h>8) { \
1350
        int tmp11,tmp12,tmp13,tmp14,tmp15,tmp16,tmp17,tmp18; \
1351
        tmp11 = *s; s+=tmpStride;\
1352
        OP2(*d , (tmp8 +tmp9 )*20 - (tmp7 +tmp10)*5 + (tmp6 +tmp11));d+=dstStride;\
1353
        tmp12 = *s; s+=tmpStride;\
1354
        OP2(*d , (tmp9 +tmp10)*20 - (tmp8 +tmp11)*5 + (tmp7 +tmp12));d+=dstStride;\
1355
        tmp13 = *s; s+=tmpStride;\
1356
        OP2(*d, (tmp10+tmp11)*20 - (tmp9 +tmp12)*5 + (tmp8 +tmp13));d+=dstStride;\
1357
        tmp14 = *s; s+=tmpStride;\
1358
        OP2(*d, (tmp11+tmp12)*20 - (tmp10+tmp13)*5 + (tmp9 +tmp14));d+=dstStride;\
1359
        tmp15 = *s; s+=tmpStride;\
1360
        OP2(*d, (tmp12+tmp13)*20 - (tmp11+tmp14)*5 + (tmp10+tmp15));d+=dstStride;\
1361
        tmp16 = *s; s+=tmpStride;\
1362
        OP2(*d, (tmp13+tmp14)*20 - (tmp12+tmp15)*5 + (tmp11+tmp16));d+=dstStride;\
1363
        tmp17 = *s; s+=tmpStride;\
1364
        OP2(*d, (tmp14+tmp15)*20 - (tmp13+tmp16)*5 + (tmp12+tmp17));d+=dstStride;\
1365
        tmp18 = *s; s+=tmpStride;\
1366
        OP2(*d, (tmp15+tmp16)*20 - (tmp14+tmp17)*5 + (tmp13+tmp18));d+=dstStride;\
1367
       } \
1368
      } \
1369
        dst++;\
1370
        tmp++;\
1371
    }while(--i);\
1372
}\
1373
\
1374
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1375
    OPNAME ## h264_qpel_h_lowpass(dst,src,dstStride,srcStride,4,4); \
1376
}\
1377
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1378
   OPNAME ## h264_qpel_h_lowpass(dst,src,dstStride,srcStride,8,8); \
1379
}\
1380
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1381
   OPNAME ## h264_qpel_h_lowpass(dst,src,dstStride,srcStride,16,16); \
1382
}\
1383
\
1384
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1385
   OPNAME ## h264_qpel_v_lowpass(dst,src,dstStride,srcStride,4,4); \
1386
}\
1387
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1388
   OPNAME ## h264_qpel_v_lowpass(dst,src,dstStride,srcStride,8,8); \
1389
}\
1390
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1391
   OPNAME ## h264_qpel_v_lowpass(dst,src,dstStride,srcStride,16,16); \
1392
}\
1393
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1394
   OPNAME ## h264_qpel_hv_lowpass(dst,tmp,src,dstStride,tmpStride,srcStride,4,4); \
1395
}\
1396
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1397
   OPNAME ## h264_qpel_hv_lowpass(dst,tmp,src,dstStride,tmpStride,srcStride,8,8); \
1398
}\
1399
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1400
   OPNAME ## h264_qpel_hv_lowpass(dst,tmp,src,dstStride,tmpStride,srcStride,16,16); \
1401
}\
1402

    
1403
/* Generate the 16 H.264 quarter-pel motion-compensation functions
 * (_mc00_c .. _mc33_c) for one block size SIZE, with destination
 * operator prefix OPNAME (put_ or avg_).  Each mcXY variant combines
 * the h/v/hv lowpass half-pel planes and, where needed, a pairwise
 * average (pixels ## SIZE ## _l2_*) to reach the quarter-pel position. */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}
1539

    
1540
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1541
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
1542
#define op_put(a, b)  a = cm[((b) + 16)>>5]
1543
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
1544
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
1545

    
1546
H264_LOWPASS(put_       , op_put, op2_put)
1547
H264_LOWPASS(avg_       , op_avg, op2_avg)
1548
H264_MC(put_, 4)
1549
H264_MC(put_, 8)
1550
H264_MC(put_, 16)
1551
H264_MC(avg_, 4)
1552
H264_MC(avg_, 8)
1553
H264_MC(avg_, 16)
1554

    
1555
#undef op_avg
1556
#undef op_put
1557
#undef op2_avg
1558
#undef op2_put
1559
#endif
1560

    
1561
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1562
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1563

    
1564
    do{
1565
        int src_1,src0,src1,src2,src3,src4,src5,src6,src7,src8,src9;
1566
        uint8_t *s = src;
1567
        src_1 = s[-1];
1568
        src0 = *s++;
1569
        src1 = *s++;
1570
        src2 = *s++;
1571
        dst[0]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1572
        src3 = *s++;
1573
        dst[1]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1574
        src4 = *s++;
1575
        dst[2]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1576
        src5 = *s++;
1577
        dst[3]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1578
        src6 = *s++;
1579
        dst[4]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1580
        src7 = *s++;
1581
        dst[5]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1582
        src8 = *s++;
1583
        dst[6]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1584
        src9 = *s++;
1585
        dst[7]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
1586
        dst+=dstStride;
1587
        src+=srcStride;
1588
    }while(--h);
1589
}
1590

    
1591
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1592
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1593

    
1594
    do{
1595
        int src_1,src0,src1,src2,src3,src4,src5,src6,src7,src8,src9;
1596
        uint8_t *s = src,*d = dst;
1597
        src_1 = *(s-srcStride);
1598
        src0 = *s; s+=srcStride;
1599
        src1 = *s; s+=srcStride;
1600
        src2 = *s; s+=srcStride;
1601
        *d= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4]; d+=dstStride;
1602
        src3 = *s; s+=srcStride;
1603
        *d= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4]; d+=dstStride;
1604
        src4 = *s; s+=srcStride;
1605
        *d= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4]; d+=dstStride;
1606
        src5 = *s; s+=srcStride;
1607
        *d= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4]; d+=dstStride;
1608
        src6 = *s; s+=srcStride;
1609
        *d= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4]; d+=dstStride;
1610
        src7 = *s; s+=srcStride;
1611
        *d= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4]; d+=dstStride;
1612
        src8 = *s; s+=srcStride;
1613
        *d= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4]; d+=dstStride;
1614
        src9 = *s;
1615
        *d= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4]; d+=dstStride;
1616
        src++;
1617
        dst++;
1618
    }while(--w);
1619
}
1620

    
1621
/* mspel 8x8, full-pel position: plain copy of the source block. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
1624

    
1625
/* mspel 8x8, quarter-pel left: average of src and the horizontally
 * half-pel filtered plane. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_aligned2(dst, src, half, stride, stride, 8, 8);
}
1630

    
1631
/* mspel 8x8, horizontal half-pel: filter straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
1634

    
1635
/* mspel 8x8, quarter-pel right: average of src+1 and the horizontally
 * half-pel filtered plane. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_aligned2(dst, src+1, half, stride, stride, 8, 8);
}
1640

    
1641
/* mspel 8x8, vertical half-pel: filter straight into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
1644

    
1645
/* mspel 8x8, (1/4, 1/2) position: average the vertically filtered
 * plane with the h-then-v filtered plane.  halfH holds 11 filtered
 * rows (one above, two below) so the vertical pass has its margins. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2_aligned(dst, halfV, halfHV, stride, 8, 8, 8);
}
1654
/* mspel 8x8, (3/4, 1/2) position: like mc12 but the vertical-only
 * plane is taken one pixel to the right (src+1). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2_aligned(dst, halfV, halfHV, stride, 8, 8, 8);
}
1663
/* mspel 8x8, (1/2, 1/2) position: horizontal half-pel filter over 11
 * rows, then vertical half-pel filter of that plane into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}