ffmpeg / libavcodec / x86 / h264_qpel_mmx.c @ 2912e87a
/*
 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "dsputil_mmx.h"

/***********************************/
/* motion compensation */
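
/* The code below implements the H.264 luma half-pel 6-tap filter with
 * coefficients (1, -5, 20, 20, -5, 1) and rounding:
 *     out = clip8((A - 5*B + 20*C + 20*D - 5*E + F + 16) >> 5)
 * As an illustrative scalar sketch of the horizontal case (our own naming,
 * kept out of the build on purpose):
 */
#if 0
static void put_h264_qpel_h_lowpass_ref(uint8_t *dst, const uint8_t *src,
                                        int dstStride, int srcStride,
                                        int width, int height)
{
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            int v = src[x-2] - 5*src[x-1] + 20*src[x]
                  + 20*src[x+1] - 5*src[x+2] + src[x+3];
            v = (v + 16) >> 5;                      /* round, scale by 1/32 */
            dst[x] = v < 0 ? 0 : v > 255 ? 255 : v; /* saturate, like packuswb */
        }
        src += srcStride;
        dst += dstStride;
    }
}
#endif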

#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\
        "mov"#q" "#C", "#T"         \n\t"\
        "mov"#d" (%0), "#F"         \n\t"\
        "paddw "#D", "#T"           \n\t"\
        "psllw $2, "#T"             \n\t"\
        "psubw "#B", "#T"           \n\t"\
        "psubw "#E", "#T"           \n\t"\
        "punpcklbw "#Z", "#F"       \n\t"\
        "pmullw "MANGLE(ff_pw_5)", "#T"\n\t"\
        "paddw "MANGLE(ff_pw_16)", "#A"\n\t"\
        "add %2, %0                 \n\t"\
        "paddw "#F", "#A"           \n\t"\
        "paddw "#A", "#T"           \n\t"\
        "psraw $5, "#T"             \n\t"\
        "packuswb "#T", "#T"        \n\t"\
        OP(T, (%1), A, d)\
        "add %3, %1                 \n\t"

#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\
        "mov"#q" "#C", "#T"         \n\t"\
        "mov"#d" (%0), "#F"         \n\t"\
        "paddw "#D", "#T"           \n\t"\
        "psllw $2, "#T"             \n\t"\
        "paddw "MANGLE(ff_pw_16)", "#A"\n\t"\
        "psubw "#B", "#T"           \n\t"\
        "psubw "#E", "#T"           \n\t"\
        "punpcklbw "#Z", "#F"       \n\t"\
        "pmullw "MANGLE(ff_pw_5)", "#T"\n\t"\
        "paddw "#F", "#A"           \n\t"\
        "add %2, %0                 \n\t"\
        "paddw "#A", "#T"           \n\t"\
        "mov"#q" "#T", "#OF"(%1)    \n\t"

#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q)
#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q)
#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa)
#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa)
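
/* In the macros above, A..E hold five successive unpacked source rows and F
 * receives the next one; the call sites rotate the register arguments so each
 * invocation overwrites the oldest row. The V variant finishes the filter
 * (+16, >>5, pack, OP); the HV variant stores the unshifted 16-bit
 * intermediate at offset OF in the tmp buffer for a second pass. */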


#define QPEL_H264(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h=4;\
\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "movq "MANGLE(ff_pw_5) ", %%mm4\n\t"\
        "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
        "1:                         \n\t"\
        "movd  -1(%0), %%mm1        \n\t"\
        "movd    (%0), %%mm2        \n\t"\
        "movd   1(%0), %%mm3        \n\t"\
        "movd   2(%0), %%mm0        \n\t"\
        "punpcklbw %%mm7, %%mm1     \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpcklbw %%mm7, %%mm3     \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "paddw %%mm0, %%mm1         \n\t"\
        "paddw %%mm3, %%mm2         \n\t"\
        "movd  -2(%0), %%mm0        \n\t"\
        "movd   3(%0), %%mm3        \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpcklbw %%mm7, %%mm3     \n\t"\
        "paddw %%mm3, %%mm0         \n\t"\
        "psllw $2, %%mm2            \n\t"\
        "psubw %%mm1, %%mm2         \n\t"\
        "pmullw %%mm4, %%mm2        \n\t"\
        "paddw %%mm5, %%mm0         \n\t"\
        "paddw %%mm2, %%mm0         \n\t"\
        "psraw $5, %%mm0            \n\t"\
        "packuswb %%mm0, %%mm0      \n\t"\
        OP(%%mm0, (%1),%%mm6, d)\
        "add %3, %0                 \n\t"\
        "add %4, %1                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        : "+a"(src), "+c"(dst), "+g"(h)\
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
        : "memory"\
    );\
}\
static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    int h=4;\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "movq %0, %%mm4             \n\t"\
        "movq %1, %%mm5             \n\t"\
        :: "m"(ff_pw_5), "m"(ff_pw_16)\
    );\
    do{\
    __asm__ volatile(\
        "movd  -1(%0), %%mm1        \n\t"\
        "movd    (%0), %%mm2        \n\t"\
        "movd   1(%0), %%mm3        \n\t"\
        "movd   2(%0), %%mm0        \n\t"\
        "punpcklbw %%mm7, %%mm1     \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpcklbw %%mm7, %%mm3     \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "paddw %%mm0, %%mm1         \n\t"\
        "paddw %%mm3, %%mm2         \n\t"\
        "movd  -2(%0), %%mm0        \n\t"\
        "movd   3(%0), %%mm3        \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpcklbw %%mm7, %%mm3     \n\t"\
        "paddw %%mm3, %%mm0         \n\t"\
        "psllw $2, %%mm2            \n\t"\
        "psubw %%mm1, %%mm2         \n\t"\
        "pmullw %%mm4, %%mm2        \n\t"\
        "paddw %%mm5, %%mm0         \n\t"\
        "paddw %%mm2, %%mm0         \n\t"\
        "movd   (%2), %%mm3         \n\t"\
        "psraw $5, %%mm0            \n\t"\
        "packuswb %%mm0, %%mm0      \n\t"\
        PAVGB" %%mm3, %%mm0         \n\t"\
        OP(%%mm0, (%1),%%mm6, d)\
        "add %4, %0                 \n\t"\
        "add %4, %1                 \n\t"\
        "add %3, %2                 \n\t"\
        : "+a"(src), "+c"(dst), "+d"(src2)\
        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
        : "memory"\
    );\
    }while(--h);\
}\
static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    src -= 2*srcStride;\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "movd (%0), %%mm0           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm1           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm2           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm3           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm4           \n\t"\
        "add %2, %0                 \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpcklbw %%mm7, %%mm1     \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpcklbw %%mm7, %%mm3     \n\t"\
        "punpcklbw %%mm7, %%mm4     \n\t"\
        QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
        QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
        QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
        QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
         \
        : "+a"(src), "+c"(dst)\
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
        : "memory"\
    );\
}\
static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    int h=4;\
    int w=3;\
    src -= 2*srcStride+2;\
    while(w--){\
        __asm__ volatile(\
            "pxor %%mm7, %%mm7      \n\t"\
            "movd (%0), %%mm0       \n\t"\
            "add %2, %0             \n\t"\
            "movd (%0), %%mm1       \n\t"\
            "add %2, %0             \n\t"\
            "movd (%0), %%mm2       \n\t"\
            "add %2, %0             \n\t"\
            "movd (%0), %%mm3       \n\t"\
            "add %2, %0             \n\t"\
            "movd (%0), %%mm4       \n\t"\
            "add %2, %0             \n\t"\
            "punpcklbw %%mm7, %%mm0 \n\t"\
            "punpcklbw %%mm7, %%mm1 \n\t"\
            "punpcklbw %%mm7, %%mm2 \n\t"\
            "punpcklbw %%mm7, %%mm3 \n\t"\
            "punpcklbw %%mm7, %%mm4 \n\t"\
            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
            QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
            QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
             \
            : "+a"(src)\
            : "c"(tmp), "S"((x86_reg)srcStride)\
            : "memory"\
        );\
        tmp += 4;\
        src += 4 - 9*srcStride;\
    }\
    tmp -= 3*4;\
    __asm__ volatile(\
        "1:                         \n\t"\
        "movq     (%0), %%mm0       \n\t"\
        "paddw  10(%0), %%mm0       \n\t"\
        "movq    2(%0), %%mm1       \n\t"\
        "paddw   8(%0), %%mm1       \n\t"\
        "movq    4(%0), %%mm2       \n\t"\
        "paddw   6(%0), %%mm2       \n\t"\
        "psubw %%mm1, %%mm0         \n\t"/*a-b   (abccba)*/\
        "psraw $2, %%mm0            \n\t"/*(a-b)/4 */\
        "psubw %%mm1, %%mm0         \n\t"/*(a-b)/4-b */\
        "paddsw %%mm2, %%mm0        \n\t"\
        "psraw $2, %%mm0            \n\t"/*((a-b)/4-b+c)/4 */\
        "paddw %%mm2, %%mm0         \n\t"/*(a-5*b+20*c)/16 */\
        "psraw $6, %%mm0            \n\t"\
        "packuswb %%mm0, %%mm0      \n\t"\
        OP(%%mm0, (%1),%%mm7, d)\
        "add $24, %0                \n\t"\
        "add %3, %1                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        : "+a"(tmp), "+c"(dst), "+g"(h)\
        : "S"((x86_reg)dstStride)\
        : "memory"\
    );\
}\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
        "1:                         \n\t"\
        "movq    (%0), %%mm0        \n\t"\
        "movq   1(%0), %%mm2        \n\t"\
        "movq %%mm0, %%mm1          \n\t"\
        "movq %%mm2, %%mm3          \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpckhbw %%mm7, %%mm1     \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpckhbw %%mm7, %%mm3     \n\t"\
        "paddw %%mm2, %%mm0         \n\t"\
        "paddw %%mm3, %%mm1         \n\t"\
        "psllw $2, %%mm0            \n\t"\
        "psllw $2, %%mm1            \n\t"\
        "movq   -1(%0), %%mm2       \n\t"\
        "movq    2(%0), %%mm4       \n\t"\
        "movq %%mm2, %%mm3          \n\t"\
        "movq %%mm4, %%mm5          \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpckhbw %%mm7, %%mm3     \n\t"\
        "punpcklbw %%mm7, %%mm4     \n\t"\
        "punpckhbw %%mm7, %%mm5     \n\t"\
        "paddw %%mm4, %%mm2         \n\t"\
        "paddw %%mm3, %%mm5         \n\t"\
        "psubw %%mm2, %%mm0         \n\t"\
        "psubw %%mm5, %%mm1         \n\t"\
        "pmullw %%mm6, %%mm0        \n\t"\
        "pmullw %%mm6, %%mm1        \n\t"\
        "movd   -2(%0), %%mm2       \n\t"\
        "movd    7(%0), %%mm5       \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpcklbw %%mm7, %%mm5     \n\t"\
        "paddw %%mm3, %%mm2         \n\t"\
        "paddw %%mm5, %%mm4         \n\t"\
        "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
        "paddw %%mm5, %%mm2         \n\t"\
        "paddw %%mm5, %%mm4         \n\t"\
        "paddw %%mm2, %%mm0         \n\t"\
        "paddw %%mm4, %%mm1         \n\t"\
        "psraw $5, %%mm0            \n\t"\
        "psraw $5, %%mm1            \n\t"\
        "packuswb %%mm1, %%mm0      \n\t"\
        OP(%%mm0, (%1),%%mm5, q)\
        "add %3, %0                 \n\t"\
        "add %4, %1                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        : "+a"(src), "+c"(dst), "+g"(h)\
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
        : "memory"\
    );\
}\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
        "1:                         \n\t"\
        "movq    (%0), %%mm0        \n\t"\
        "movq   1(%0), %%mm2        \n\t"\
        "movq %%mm0, %%mm1          \n\t"\
        "movq %%mm2, %%mm3          \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpckhbw %%mm7, %%mm1     \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpckhbw %%mm7, %%mm3     \n\t"\
        "paddw %%mm2, %%mm0         \n\t"\
        "paddw %%mm3, %%mm1         \n\t"\
        "psllw $2, %%mm0            \n\t"\
        "psllw $2, %%mm1            \n\t"\
        "movq   -1(%0), %%mm2       \n\t"\
        "movq    2(%0), %%mm4       \n\t"\
        "movq %%mm2, %%mm3          \n\t"\
        "movq %%mm4, %%mm5          \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpckhbw %%mm7, %%mm3     \n\t"\
        "punpcklbw %%mm7, %%mm4     \n\t"\
        "punpckhbw %%mm7, %%mm5     \n\t"\
        "paddw %%mm4, %%mm2         \n\t"\
        "paddw %%mm3, %%mm5         \n\t"\
        "psubw %%mm2, %%mm0         \n\t"\
        "psubw %%mm5, %%mm1         \n\t"\
        "pmullw %%mm6, %%mm0        \n\t"\
        "pmullw %%mm6, %%mm1        \n\t"\
        "movd   -2(%0), %%mm2       \n\t"\
        "movd    7(%0), %%mm5       \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpcklbw %%mm7, %%mm5     \n\t"\
        "paddw %%mm3, %%mm2         \n\t"\
        "paddw %%mm5, %%mm4         \n\t"\
        "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
        "paddw %%mm5, %%mm2         \n\t"\
        "paddw %%mm5, %%mm4         \n\t"\
        "paddw %%mm2, %%mm0         \n\t"\
        "paddw %%mm4, %%mm1         \n\t"\
        "psraw $5, %%mm0            \n\t"\
        "psraw $5, %%mm1            \n\t"\
        "movq (%2), %%mm4           \n\t"\
        "packuswb %%mm1, %%mm0      \n\t"\
        PAVGB" %%mm4, %%mm0         \n\t"\
        OP(%%mm0, (%1),%%mm5, q)\
        "add %5, %0                 \n\t"\
        "add %5, %1                 \n\t"\
        "add %4, %2                 \n\t"\
        "decl %3                    \n\t"\
        "jg 1b                      \n\t"\
        : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
        : "memory"\
    );\
}\
\
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int w= 2;\
    src -= 2*srcStride;\
    \
    while(w--){\
        __asm__ volatile(\
            "pxor %%mm7, %%mm7          \n\t"\
            "movd (%0), %%mm0           \n\t"\
            "add %2, %0                 \n\t"\
            "movd (%0), %%mm1           \n\t"\
            "add %2, %0                 \n\t"\
            "movd (%0), %%mm2           \n\t"\
            "add %2, %0                 \n\t"\
            "movd (%0), %%mm3           \n\t"\
            "add %2, %0                 \n\t"\
            "movd (%0), %%mm4           \n\t"\
            "add %2, %0                 \n\t"\
            "punpcklbw %%mm7, %%mm0     \n\t"\
            "punpcklbw %%mm7, %%mm1     \n\t"\
            "punpcklbw %%mm7, %%mm2     \n\t"\
            "punpcklbw %%mm7, %%mm3     \n\t"\
            "punpcklbw %%mm7, %%mm4     \n\t"\
            QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
            QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
            QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
            QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
            QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
            QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
            QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
            QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
            "cmpl $16, %4               \n\t"\
            "jne 2f                     \n\t"\
            QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
            QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
            QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
            QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
            QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
            QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
            QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
            QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
            "2:                         \n\t"\
            \
            : "+a"(src), "+c"(dst)\
            : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "g"(h)\
            : "memory"\
        );\
        src += 4-(h+5)*srcStride;\
        dst += 4-h*dstStride;\
    }\
}\
static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
    int w = (size+8)>>2;\
    src -= 2*srcStride+2;\
    while(w--){\
        __asm__ volatile(\
            "pxor %%mm7, %%mm7      \n\t"\
            "movd (%0), %%mm0       \n\t"\
            "add %2, %0             \n\t"\
            "movd (%0), %%mm1       \n\t"\
            "add %2, %0             \n\t"\
            "movd (%0), %%mm2       \n\t"\
            "add %2, %0             \n\t"\
            "movd (%0), %%mm3       \n\t"\
            "add %2, %0             \n\t"\
            "movd (%0), %%mm4       \n\t"\
            "add %2, %0             \n\t"\
            "punpcklbw %%mm7, %%mm0 \n\t"\
            "punpcklbw %%mm7, %%mm1 \n\t"\
            "punpcklbw %%mm7, %%mm2 \n\t"\
            "punpcklbw %%mm7, %%mm3 \n\t"\
            "punpcklbw %%mm7, %%mm4 \n\t"\
            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
            QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
            QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
            QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
            QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
            "cmpl $16, %3           \n\t"\
            "jne 2f                 \n\t"\
            QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1,  8*48)\
            QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2,  9*48)\
            QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
            QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
            QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
            QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
            "2:                     \n\t"\
            : "+a"(src)\
            : "c"(tmp), "S"((x86_reg)srcStride), "g"(size)\
            : "memory"\
            );\
        tmp += 4;\
        src += 4 - (size+5)*srcStride;\
    }\
}\
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
    int w = size>>4;\
    do{\
    int h = size;\
    __asm__ volatile(\
        "1:                         \n\t"\
        "movq     (%0), %%mm0       \n\t"\
        "movq    8(%0), %%mm3       \n\t"\
        "movq    2(%0), %%mm1       \n\t"\
        "movq   10(%0), %%mm4       \n\t"\
        "paddw   %%mm4, %%mm0       \n\t"\
        "paddw   %%mm3, %%mm1       \n\t"\
        "paddw  18(%0), %%mm3       \n\t"\
        "paddw  16(%0), %%mm4       \n\t"\
        "movq    4(%0), %%mm2       \n\t"\
        "movq   12(%0), %%mm5       \n\t"\
        "paddw   6(%0), %%mm2       \n\t"\
        "paddw  14(%0), %%mm5       \n\t"\
        "psubw %%mm1, %%mm0         \n\t"\
        "psubw %%mm4, %%mm3         \n\t"\
        "psraw $2, %%mm0            \n\t"\
        "psraw $2, %%mm3            \n\t"\
        "psubw %%mm1, %%mm0         \n\t"\
        "psubw %%mm4, %%mm3         \n\t"\
        "paddsw %%mm2, %%mm0        \n\t"\
        "paddsw %%mm5, %%mm3        \n\t"\
        "psraw $2, %%mm0            \n\t"\
        "psraw $2, %%mm3            \n\t"\
        "paddw %%mm2, %%mm0         \n\t"\
        "paddw %%mm5, %%mm3         \n\t"\
        "psraw $6, %%mm0            \n\t"\
        "psraw $6, %%mm3            \n\t"\
        "packuswb %%mm3, %%mm0      \n\t"\
        OP(%%mm0, (%1),%%mm7, q)\
        "add $48, %0                \n\t"\
        "add %3, %1                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        : "+a"(tmp), "+c"(dst), "+g"(h)\
        : "S"((x86_reg)dstStride)\
        : "memory"\
    );\
    tmp += 8 - size*24;\
    dst += 8 - size*dstStride;\
    }while(w--);\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
}\
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}\
\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
          put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride, 8);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride, 16);\
}\
\
static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
    __asm__ volatile(\
        "movq      (%1), %%mm0          \n\t"\
        "movq    24(%1), %%mm1          \n\t"\
        "psraw      $5,  %%mm0          \n\t"\
        "psraw      $5,  %%mm1          \n\t"\
        "packuswb %%mm0, %%mm0          \n\t"\
        "packuswb %%mm1, %%mm1          \n\t"\
        PAVGB"     (%0), %%mm0          \n\t"\
        PAVGB"  (%0,%3), %%mm1          \n\t"\
        OP(%%mm0, (%2),    %%mm4, d)\
        OP(%%mm1, (%2,%4), %%mm5, d)\
        "lea  (%0,%3,2), %0             \n\t"\
        "lea  (%2,%4,2), %2             \n\t"\
        "movq    48(%1), %%mm0          \n\t"\
        "movq    72(%1), %%mm1          \n\t"\
        "psraw      $5,  %%mm0          \n\t"\
        "psraw      $5,  %%mm1          \n\t"\
        "packuswb %%mm0, %%mm0          \n\t"\
        "packuswb %%mm1, %%mm1          \n\t"\
        PAVGB"     (%0), %%mm0          \n\t"\
        PAVGB"  (%0,%3), %%mm1          \n\t"\
        OP(%%mm0, (%2),    %%mm4, d)\
        OP(%%mm1, (%2,%4), %%mm5, d)\
        :"+a"(src8), "+c"(src16), "+d"(dst)\
        :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\
        :"memory");\
}\
static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
    do{\
    __asm__ volatile(\
        "movq      (%1), %%mm0          \n\t"\
        "movq     8(%1), %%mm1          \n\t"\
        "movq    48(%1), %%mm2          \n\t"\
        "movq  8+48(%1), %%mm3          \n\t"\
        "psraw      $5,  %%mm0          \n\t"\
        "psraw      $5,  %%mm1          \n\t"\
        "psraw      $5,  %%mm2          \n\t"\
        "psraw      $5,  %%mm3          \n\t"\
        "packuswb %%mm1, %%mm0          \n\t"\
        "packuswb %%mm3, %%mm2          \n\t"\
        PAVGB"     (%0), %%mm0          \n\t"\
        PAVGB"  (%0,%3), %%mm2          \n\t"\
        OP(%%mm0, (%2), %%mm5, q)\
        OP(%%mm2, (%2,%4), %%mm5, q)\
        ::"a"(src8), "c"(src16), "d"(dst),\
          "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
        :"memory");\
        src8 += 2L*src8Stride;\
        src16 += 48;\
        dst += 2L*dstStride;\
    }while(h-=2);\
}\
static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
    OPNAME ## pixels8_l2_shift5_ ## MMX(dst  , src16  , src8  , dstStride, src8Stride, h);\
    OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
}\


#if ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    int h=16;\
    __asm__ volatile(\
        "pxor %%xmm15, %%xmm15      \n\t"\
        "movdqa %6, %%xmm14         \n\t"\
        "movdqa %7, %%xmm13         \n\t"\
        "1:                         \n\t"\
        "lddqu    6(%0), %%xmm1     \n\t"\
        "lddqu   -2(%0), %%xmm7     \n\t"\
        "movdqa  %%xmm1, %%xmm0     \n\t"\
        "punpckhbw %%xmm15, %%xmm1  \n\t"\
        "punpcklbw %%xmm15, %%xmm0  \n\t"\
        "punpcklbw %%xmm15, %%xmm7  \n\t"\
        "movdqa  %%xmm1, %%xmm2     \n\t"\
        "movdqa  %%xmm0, %%xmm6     \n\t"\
        "movdqa  %%xmm1, %%xmm3     \n\t"\
        "movdqa  %%xmm0, %%xmm8     \n\t"\
        "movdqa  %%xmm1, %%xmm4     \n\t"\
        "movdqa  %%xmm0, %%xmm9     \n\t"\
        "movdqa  %%xmm0, %%xmm12    \n\t"\
        "movdqa  %%xmm1, %%xmm11    \n\t"\
        "palignr $10,%%xmm0, %%xmm11\n\t"\
        "palignr $10,%%xmm7, %%xmm12\n\t"\
        "palignr $2, %%xmm0, %%xmm4 \n\t"\
        "palignr $2, %%xmm7, %%xmm9 \n\t"\
        "palignr $4, %%xmm0, %%xmm3 \n\t"\
        "palignr $4, %%xmm7, %%xmm8 \n\t"\
        "palignr $6, %%xmm0, %%xmm2 \n\t"\
        "palignr $6, %%xmm7, %%xmm6 \n\t"\
        "paddw   %%xmm0 ,%%xmm11    \n\t"\
        "palignr $8, %%xmm0, %%xmm1 \n\t"\
        "palignr $8, %%xmm7, %%xmm0 \n\t"\
        "paddw   %%xmm12,%%xmm7     \n\t"\
        "paddw   %%xmm3, %%xmm2     \n\t"\
        "paddw   %%xmm8, %%xmm6     \n\t"\
        "paddw   %%xmm4, %%xmm1     \n\t"\
        "paddw   %%xmm9, %%xmm0     \n\t"\
        "psllw   $2,     %%xmm2     \n\t"\
        "psllw   $2,     %%xmm6     \n\t"\
        "psubw   %%xmm1, %%xmm2     \n\t"\
        "psubw   %%xmm0, %%xmm6     \n\t"\
        "paddw   %%xmm13,%%xmm11    \n\t"\
        "paddw   %%xmm13,%%xmm7     \n\t"\
        "pmullw  %%xmm14,%%xmm2     \n\t"\
        "pmullw  %%xmm14,%%xmm6     \n\t"\
        "lddqu   (%2),   %%xmm3     \n\t"\
        "paddw   %%xmm11,%%xmm2     \n\t"\
        "paddw   %%xmm7, %%xmm6     \n\t"\
        "psraw   $5,     %%xmm2     \n\t"\
        "psraw   $5,     %%xmm6     \n\t"\
        "packuswb %%xmm2,%%xmm6     \n\t"\
        "pavgb   %%xmm3, %%xmm6     \n\t"\
        OP(%%xmm6, (%1), %%xmm4, dqa)\
        "add %5, %0                 \n\t"\
        "add %5, %1                 \n\t"\
        "add %4, %2                 \n\t"\
        "decl %3                    \n\t"\
        "jg 1b                      \n\t"\
        : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
          "m"(ff_pw_5), "m"(ff_pw_16)\
        : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , \
                       "%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" , \
                       "%xmm8" , "%xmm9" , "%xmm10", "%xmm11", \
                       "%xmm12", "%xmm13", "%xmm14", "%xmm15",)\
          "memory"\
    );\
}
#else // ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}
#endif // ARCH_X86_64
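
/* On x86_64 the extra xmm8-xmm15 registers let the 16-pixel-wide horizontal
 * filter run in a single pass per row; 32-bit builds fall back to four calls
 * of the 8-wide version, as above. */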

#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%xmm7, %%xmm7        \n\t"\
        "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\
        "1:                         \n\t"\
        "lddqu   -2(%0), %%xmm1     \n\t"\
        "movdqa  %%xmm1, %%xmm0     \n\t"\
        "punpckhbw %%xmm7, %%xmm1   \n\t"\
        "punpcklbw %%xmm7, %%xmm0   \n\t"\
        "movdqa  %%xmm1, %%xmm2     \n\t"\
        "movdqa  %%xmm1, %%xmm3     \n\t"\
        "movdqa  %%xmm1, %%xmm4     \n\t"\
        "movdqa  %%xmm1, %%xmm5     \n\t"\
        "palignr $2, %%xmm0, %%xmm4 \n\t"\
        "palignr $4, %%xmm0, %%xmm3 \n\t"\
        "palignr $6, %%xmm0, %%xmm2 \n\t"\
        "palignr $8, %%xmm0, %%xmm1 \n\t"\
        "palignr $10,%%xmm0, %%xmm5 \n\t"\
        "paddw   %%xmm5, %%xmm0     \n\t"\
        "paddw   %%xmm3, %%xmm2     \n\t"\
        "paddw   %%xmm4, %%xmm1     \n\t"\
        "psllw   $2,     %%xmm2     \n\t"\
        "movq    (%2),   %%xmm3     \n\t"\
        "psubw   %%xmm1, %%xmm2     \n\t"\
        "paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\
        "pmullw  %%xmm6, %%xmm2     \n\t"\
        "paddw   %%xmm0, %%xmm2     \n\t"\
        "psraw   $5,     %%xmm2     \n\t"\
        "packuswb %%xmm2, %%xmm2    \n\t"\
        "pavgb   %%xmm3, %%xmm2     \n\t"\
        OP(%%xmm2, (%1), %%xmm4, q)\
        "add %5, %0                 \n\t"\
        "add %5, %1                 \n\t"\
        "add %4, %2                 \n\t"\
        "decl %3                    \n\t"\
        "jg 1b                      \n\t"\
        : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
                       "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
          "memory"\
    );\
}\
QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%xmm7, %%xmm7        \n\t"\
        "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\
        "1:                         \n\t"\
        "lddqu   -2(%0), %%xmm1     \n\t"\
        "movdqa  %%xmm1, %%xmm0     \n\t"\
        "punpckhbw %%xmm7, %%xmm1   \n\t"\
        "punpcklbw %%xmm7, %%xmm0   \n\t"\
        "movdqa  %%xmm1, %%xmm2     \n\t"\
        "movdqa  %%xmm1, %%xmm3     \n\t"\
        "movdqa  %%xmm1, %%xmm4     \n\t"\
        "movdqa  %%xmm1, %%xmm5     \n\t"\
        "palignr $2, %%xmm0, %%xmm4 \n\t"\
        "palignr $4, %%xmm0, %%xmm3 \n\t"\
        "palignr $6, %%xmm0, %%xmm2 \n\t"\
        "palignr $8, %%xmm0, %%xmm1 \n\t"\
        "palignr $10,%%xmm0, %%xmm5 \n\t"\
        "paddw   %%xmm5, %%xmm0     \n\t"\
        "paddw   %%xmm3, %%xmm2     \n\t"\
        "paddw   %%xmm4, %%xmm1     \n\t"\
        "psllw   $2,     %%xmm2     \n\t"\
        "psubw   %%xmm1, %%xmm2     \n\t"\
        "paddw   "MANGLE(ff_pw_16)", %%xmm0\n\t"\
        "pmullw  %%xmm6, %%xmm2     \n\t"\
        "paddw   %%xmm0, %%xmm2     \n\t"\
        "psraw   $5,     %%xmm2     \n\t"\
        "packuswb %%xmm2, %%xmm2    \n\t"\
        OP(%%xmm2, (%1), %%xmm4, q)\
        "add %3, %0                 \n\t"\
        "add %4, %1                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        : "+a"(src), "+c"(dst), "+g"(h)\
        : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
                       "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
          "memory"\
    );\
}\
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\

#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    src -= 2*srcStride;\
    \
    __asm__ volatile(\
        "pxor %%xmm7, %%xmm7        \n\t"\
        "movq (%0), %%xmm0          \n\t"\
        "add %2, %0                 \n\t"\
        "movq (%0), %%xmm1          \n\t"\
        "add %2, %0                 \n\t"\
        "movq (%0), %%xmm2          \n\t"\
        "add %2, %0                 \n\t"\
        "movq (%0), %%xmm3          \n\t"\
        "add %2, %0                 \n\t"\
        "movq (%0), %%xmm4          \n\t"\
        "add %2, %0                 \n\t"\
        "punpcklbw %%xmm7, %%xmm0   \n\t"\
        "punpcklbw %%xmm7, %%xmm1   \n\t"\
        "punpcklbw %%xmm7, %%xmm2   \n\t"\
        "punpcklbw %%xmm7, %%xmm3   \n\t"\
        "punpcklbw %%xmm7, %%xmm4   \n\t"\
        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
        QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
        QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
        QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
        QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
        "cmpl $16, %4               \n\t"\
        "jne 2f                     \n\t"\
        QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
        QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
        QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
        QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
        QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
        QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
        "2:                          \n\t"\
        \
        : "+a"(src), "+c"(dst)\
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "g"(h)\
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
                       "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
          "memory"\
    );\
}\
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
}\
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}

static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
    int w = (size+8)>>3;
    src -= 2*srcStride+2;
    while(w--){
        __asm__ volatile(
            "pxor %%xmm7, %%xmm7        \n\t"
            "movq (%0), %%xmm0          \n\t"
            "add %2, %0                 \n\t"
            "movq (%0), %%xmm1          \n\t"
            "add %2, %0                 \n\t"
            "movq (%0), %%xmm2          \n\t"
            "add %2, %0                 \n\t"
            "movq (%0), %%xmm3          \n\t"
            "add %2, %0                 \n\t"
            "movq (%0), %%xmm4          \n\t"
            "add %2, %0                 \n\t"
            "punpcklbw %%xmm7, %%xmm0   \n\t"
            "punpcklbw %%xmm7, %%xmm1   \n\t"
            "punpcklbw %%xmm7, %%xmm2   \n\t"
            "punpcklbw %%xmm7, %%xmm3   \n\t"
            "punpcklbw %%xmm7, %%xmm4   \n\t"
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48)
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
            QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
            QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
            QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
            QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
            "cmpl $16, %3               \n\t"
            "jne 2f                     \n\t"
            QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1,  8*48)
            QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2,  9*48)
            QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
            QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
            QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
            QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
            "2:                         \n\t"
            : "+a"(src)
            : "c"(tmp), "S"((x86_reg)srcStride), "g"(size)
            : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",
                           "%xmm4", "%xmm5", "%xmm6", "%xmm7",)
              "memory"
        );
        tmp += 8;
        src += 8 - (size+5)*srcStride;
    }
}
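
/* The hv cases run in two passes: hv1 stores the vertical 6-tap result
 * (still scaled by 32, rounding term included) into rows of 24 int16_t
 * (48 bytes), then hv2 filters that horizontally. With a, b, c denoting the
 * outer, middle and inner tap-pair sums, hv2 avoids a multiply by computing
 * (((a-b)>>2 - b + c) >> 2) + c, which equals (a - 5*b + 20*c)/16 up to
 * shift truncation, then shifts right by 6 for the overall /1024. */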

#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
    int h = size;\
    if(size == 16){\
        __asm__ volatile(\
            "1:                         \n\t"\
            "movdqa 32(%0), %%xmm4      \n\t"\
            "movdqa 16(%0), %%xmm5      \n\t"\
            "movdqa   (%0), %%xmm7      \n\t"\
            "movdqa %%xmm4, %%xmm3      \n\t"\
            "movdqa %%xmm4, %%xmm2      \n\t"\
            "movdqa %%xmm4, %%xmm1      \n\t"\
            "movdqa %%xmm4, %%xmm0      \n\t"\
            "palignr $10, %%xmm5, %%xmm0 \n\t"\
            "palignr  $8, %%xmm5, %%xmm1 \n\t"\
            "palignr  $6, %%xmm5, %%xmm2 \n\t"\
            "palignr  $4, %%xmm5, %%xmm3 \n\t"\
            "palignr  $2, %%xmm5, %%xmm4 \n\t"\
            "paddw  %%xmm5, %%xmm0      \n\t"\
            "paddw  %%xmm4, %%xmm1      \n\t"\
            "paddw  %%xmm3, %%xmm2      \n\t"\
            "movdqa %%xmm5, %%xmm6      \n\t"\
            "movdqa %%xmm5, %%xmm4      \n\t"\
            "movdqa %%xmm5, %%xmm3      \n\t"\
            "palignr  $8, %%xmm7, %%xmm4 \n\t"\
            "palignr  $2, %%xmm7, %%xmm6 \n\t"\
            "palignr $10, %%xmm7, %%xmm3 \n\t"\
            "paddw  %%xmm6, %%xmm4      \n\t"\
            "movdqa %%xmm5, %%xmm6      \n\t"\
            "palignr  $6, %%xmm7, %%xmm5 \n\t"\
            "palignr  $4, %%xmm7, %%xmm6 \n\t"\
            "paddw  %%xmm7, %%xmm3      \n\t"\
            "paddw  %%xmm6, %%xmm5      \n\t"\
            \
            "psubw  %%xmm1, %%xmm0      \n\t"\
            "psubw  %%xmm4, %%xmm3      \n\t"\
            "psraw      $2, %%xmm0      \n\t"\
            "psraw      $2, %%xmm3      \n\t"\
            "psubw  %%xmm1, %%xmm0      \n\t"\
            "psubw  %%xmm4, %%xmm3      \n\t"\
            "paddw  %%xmm2, %%xmm0      \n\t"\
            "paddw  %%xmm5, %%xmm3      \n\t"\
            "psraw      $2, %%xmm0      \n\t"\
            "psraw      $2, %%xmm3      \n\t"\
            "paddw  %%xmm2, %%xmm0      \n\t"\
            "paddw  %%xmm5, %%xmm3      \n\t"\
            "psraw      $6, %%xmm0      \n\t"\
            "psraw      $6, %%xmm3      \n\t"\
            "packuswb %%xmm0, %%xmm3    \n\t"\
            OP(%%xmm3, (%1), %%xmm7, dqa)\
            "add $48, %0                \n\t"\
            "add %3, %1                 \n\t"\
            "decl %2                    \n\t"\
            " jnz 1b                    \n\t"\
            : "+a"(tmp), "+c"(dst), "+g"(h)\
            : "S"((x86_reg)dstStride)\
            : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
                           "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
              "memory"\
        );\
    }else{\
        __asm__ volatile(\
            "1:                         \n\t"\
            "movdqa 16(%0), %%xmm1      \n\t"\
            "movdqa   (%0), %%xmm0      \n\t"\
            "movdqa %%xmm1, %%xmm2      \n\t"\
            "movdqa %%xmm1, %%xmm3      \n\t"\
            "movdqa %%xmm1, %%xmm4      \n\t"\
            "movdqa %%xmm1, %%xmm5      \n\t"\
            "palignr $10, %%xmm0, %%xmm5 \n\t"\
            "palignr  $8, %%xmm0, %%xmm4 \n\t"\
            "palignr  $6, %%xmm0, %%xmm3 \n\t"\
            "palignr  $4, %%xmm0, %%xmm2 \n\t"\
            "palignr  $2, %%xmm0, %%xmm1 \n\t"\
            "paddw  %%xmm5, %%xmm0      \n\t"\
            "paddw  %%xmm4, %%xmm1      \n\t"\
            "paddw  %%xmm3, %%xmm2      \n\t"\
            "psubw  %%xmm1, %%xmm0      \n\t"\
            "psraw      $2, %%xmm0      \n\t"\
            "psubw  %%xmm1, %%xmm0      \n\t"\
            "paddw  %%xmm2, %%xmm0      \n\t"\
            "psraw      $2, %%xmm0      \n\t"\
            "paddw  %%xmm2, %%xmm0      \n\t"\
            "psraw      $6, %%xmm0      \n\t"\
            "packuswb %%xmm0, %%xmm0    \n\t"\
            OP(%%xmm0, (%1), %%xmm7, q)\
            "add $48, %0                \n\t"\
            "add %3, %1                 \n\t"\
            "decl %2                    \n\t"\
            " jnz 1b                    \n\t"\
            : "+a"(tmp), "+c"(dst), "+g"(h)\
            : "S"((x86_reg)dstStride)\
            : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
                           "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
              "memory"\
        );\
    }\
}

#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
          put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}\

#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2
#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2
#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2
#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2
#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2

#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2
#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2
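
/* The alias blocks here and below avoid duplicating code: helpers that gain
 * nothing from wider registers reuse the MMX2 versions, and the SSSE3 build
 * reuses the SSE2 vertical filter unchanged. */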

#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2
#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2
#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2
#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2

#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2
#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2

#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2
#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2

#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\

static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_sse2(dst, src, stride, 16);
}
static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_sse2(dst, src, stride, 16);
}
#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2
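
/* The H264_MC_* macros below emit one function per quarter-pel position:
 * mcXY is a motion vector fraction of (X,Y) in quarter-pel units. mc00 is
 * the plain copy, mc20/mc02 the horizontal/vertical half-pel filters, and
 * the other positions average two intermediates (mc12/mc32 combine the
 * vertical intermediate with the hv result via pixels_l2_shift5). */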

#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
}\

#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
}\

#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
}\

#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}\

#define H264_MC_4816(MMX)\
H264_MC(put_, 4, MMX, 8)\
H264_MC(put_, 8, MMX, 8)\
H264_MC(put_, 16,MMX, 8)\
H264_MC(avg_, 4, MMX, 8)\
H264_MC(avg_, 8, MMX, 8)\
H264_MC(avg_, 16,MMX, 8)\

#define H264_MC_816(QPEL, XMM)\
QPEL(put_, 8, XMM, 16)\
QPEL(put_, 16,XMM, 16)\
QPEL(avg_, 8, XMM, 16)\
QPEL(avg_, 16,XMM, 16)\


#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgusb " #temp ", " #a "        \n\t"\
"mov" #size " " #a ", " #b "      \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgb " #temp ", " #a "          \n\t"\
"mov" #size " " #a ", " #b "      \n\t"

#define PAVGB "pavgusb"
QPEL_H264(put_,       PUT_OP, 3dnow)
QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
#undef PAVGB
#define PAVGB "pavgb"
QPEL_H264(put_,       PUT_OP, mmx2)
QPEL_H264(avg_,  AVG_MMX2_OP, mmx2)
QPEL_H264_V_XMM(put_,       PUT_OP, sse2)
QPEL_H264_V_XMM(avg_,  AVG_MMX2_OP, sse2)
QPEL_H264_HV_XMM(put_,       PUT_OP, sse2)
QPEL_H264_HV_XMM(avg_,  AVG_MMX2_OP, sse2)
#if HAVE_SSSE3
QPEL_H264_H_XMM(put_,       PUT_OP, ssse3)
QPEL_H264_H_XMM(avg_,  AVG_MMX2_OP, ssse3)
QPEL_H264_HV2_XMM(put_,       PUT_OP, ssse3)
QPEL_H264_HV2_XMM(avg_,  AVG_MMX2_OP, ssse3)
QPEL_H264_HV_XMM(put_,       PUT_OP, ssse3)
QPEL_H264_HV_XMM(avg_,  AVG_MMX2_OP, ssse3)
#endif
#undef PAVGB

H264_MC_4816(3dnow)
H264_MC_4816(mmx2)
H264_MC_816(H264_MC_V, sse2)
H264_MC_816(H264_MC_HV, sse2)
#if HAVE_SSSE3
H264_MC_816(H264_MC_H, ssse3)
H264_MC_816(H264_MC_HV, ssse3)
#endif