/*
 * ffmpeg / libavcodec / x86 / h264_qpel_mmx.c @ b32c9ca9
 */

1
/*
2
 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2.1 of the License, or (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 */
20

    
21
#include "dsputil_mmx.h"
22

    
23
/***********************************/
24
/* motion compensation */
25

    
26
#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\
27
        "mov"#q" "#C", "#T"         \n\t"\
28
        "mov"#d" (%0), "#F"         \n\t"\
29
        "paddw "#D", "#T"           \n\t"\
30
        "psllw $2, "#T"             \n\t"\
31
        "psubw "#B", "#T"           \n\t"\
32
        "psubw "#E", "#T"           \n\t"\
33
        "punpcklbw "#Z", "#F"       \n\t"\
34
        "pmullw %4, "#T"            \n\t"\
35
        "paddw %5, "#A"             \n\t"\
36
        "add %2, %0                 \n\t"\
37
        "paddw "#F", "#A"           \n\t"\
38
        "paddw "#A", "#T"           \n\t"\
39
        "psraw $5, "#T"             \n\t"\
40
        "packuswb "#T", "#T"        \n\t"\
41
        OP(T, (%1), A, d)\
42
        "add %3, %1                 \n\t"
43

    
44
#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\
45
        "mov"#q" "#C", "#T"         \n\t"\
46
        "mov"#d" (%0), "#F"         \n\t"\
47
        "paddw "#D", "#T"           \n\t"\
48
        "psllw $2, "#T"             \n\t"\
49
        "paddw %4, "#A"             \n\t"\
50
        "psubw "#B", "#T"           \n\t"\
51
        "psubw "#E", "#T"           \n\t"\
52
        "punpcklbw "#Z", "#F"       \n\t"\
53
        "pmullw %3, "#T"            \n\t"\
54
        "paddw "#F", "#A"           \n\t"\
55
        "add %2, %0                 \n\t"\
56
        "paddw "#A", "#T"           \n\t"\
57
        "mov"#q" "#T", "#OF"(%1)    \n\t"
58

    
59
#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q)
60
#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q)
61
#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa)
62
#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa)
63

    
64

    
65
#define QPEL_H264(OPNAME, OP, MMX)\
66
static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
67
    int h=4;\
68
\
69
    __asm__ volatile(\
70
        "pxor %%mm7, %%mm7          \n\t"\
71
        "movq "MANGLE(ff_pw_5) ", %%mm4\n\t"\
72
        "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
73
        "1:                         \n\t"\
74
        "movd  -1(%0), %%mm1        \n\t"\
75
        "movd    (%0), %%mm2        \n\t"\
76
        "movd   1(%0), %%mm3        \n\t"\
77
        "movd   2(%0), %%mm0        \n\t"\
78
        "punpcklbw %%mm7, %%mm1     \n\t"\
79
        "punpcklbw %%mm7, %%mm2     \n\t"\
80
        "punpcklbw %%mm7, %%mm3     \n\t"\
81
        "punpcklbw %%mm7, %%mm0     \n\t"\
82
        "paddw %%mm0, %%mm1         \n\t"\
83
        "paddw %%mm3, %%mm2         \n\t"\
84
        "movd  -2(%0), %%mm0        \n\t"\
85
        "movd   3(%0), %%mm3        \n\t"\
86
        "punpcklbw %%mm7, %%mm0     \n\t"\
87
        "punpcklbw %%mm7, %%mm3     \n\t"\
88
        "paddw %%mm3, %%mm0         \n\t"\
89
        "psllw $2, %%mm2            \n\t"\
90
        "psubw %%mm1, %%mm2         \n\t"\
91
        "pmullw %%mm4, %%mm2        \n\t"\
92
        "paddw %%mm5, %%mm0         \n\t"\
93
        "paddw %%mm2, %%mm0         \n\t"\
94
        "psraw $5, %%mm0            \n\t"\
95
        "packuswb %%mm0, %%mm0      \n\t"\
96
        OP(%%mm0, (%1),%%mm6, d)\
97
        "add %3, %0                 \n\t"\
98
        "add %4, %1                 \n\t"\
99
        "decl %2                    \n\t"\
100
        " jnz 1b                    \n\t"\
101
        : "+a"(src), "+c"(dst), "+g"(h)\
102
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
103
        : "memory"\
104
    );\
105
}\
106
static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
107
    int h=4;\
108
    __asm__ volatile(\
109
        "pxor %%mm7, %%mm7          \n\t"\
110
        "movq %0, %%mm4             \n\t"\
111
        "movq %1, %%mm5             \n\t"\
112
        :: "m"(ff_pw_5), "m"(ff_pw_16)\
113
    );\
114
    do{\
115
    __asm__ volatile(\
116
        "movd  -1(%0), %%mm1        \n\t"\
117
        "movd    (%0), %%mm2        \n\t"\
118
        "movd   1(%0), %%mm3        \n\t"\
119
        "movd   2(%0), %%mm0        \n\t"\
120
        "punpcklbw %%mm7, %%mm1     \n\t"\
121
        "punpcklbw %%mm7, %%mm2     \n\t"\
122
        "punpcklbw %%mm7, %%mm3     \n\t"\
123
        "punpcklbw %%mm7, %%mm0     \n\t"\
124
        "paddw %%mm0, %%mm1         \n\t"\
125
        "paddw %%mm3, %%mm2         \n\t"\
126
        "movd  -2(%0), %%mm0        \n\t"\
127
        "movd   3(%0), %%mm3        \n\t"\
128
        "punpcklbw %%mm7, %%mm0     \n\t"\
129
        "punpcklbw %%mm7, %%mm3     \n\t"\
130
        "paddw %%mm3, %%mm0         \n\t"\
131
        "psllw $2, %%mm2            \n\t"\
132
        "psubw %%mm1, %%mm2         \n\t"\
133
        "pmullw %%mm4, %%mm2        \n\t"\
134
        "paddw %%mm5, %%mm0         \n\t"\
135
        "paddw %%mm2, %%mm0         \n\t"\
136
        "movd   (%2), %%mm3         \n\t"\
137
        "psraw $5, %%mm0            \n\t"\
138
        "packuswb %%mm0, %%mm0      \n\t"\
139
        PAVGB" %%mm3, %%mm0         \n\t"\
140
        OP(%%mm0, (%1),%%mm6, d)\
141
        "add %4, %0                 \n\t"\
142
        "add %4, %1                 \n\t"\
143
        "add %3, %2                 \n\t"\
144
        : "+a"(src), "+c"(dst), "+d"(src2)\
145
        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
146
        : "memory"\
147
    );\
148
    }while(--h);\
149
}\
150
static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
151
    src -= 2*srcStride;\
152
    __asm__ volatile(\
153
        "pxor %%mm7, %%mm7          \n\t"\
154
        "movd (%0), %%mm0           \n\t"\
155
        "add %2, %0                 \n\t"\
156
        "movd (%0), %%mm1           \n\t"\
157
        "add %2, %0                 \n\t"\
158
        "movd (%0), %%mm2           \n\t"\
159
        "add %2, %0                 \n\t"\
160
        "movd (%0), %%mm3           \n\t"\
161
        "add %2, %0                 \n\t"\
162
        "movd (%0), %%mm4           \n\t"\
163
        "add %2, %0                 \n\t"\
164
        "punpcklbw %%mm7, %%mm0     \n\t"\
165
        "punpcklbw %%mm7, %%mm1     \n\t"\
166
        "punpcklbw %%mm7, %%mm2     \n\t"\
167
        "punpcklbw %%mm7, %%mm3     \n\t"\
168
        "punpcklbw %%mm7, %%mm4     \n\t"\
169
        QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
170
        QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
171
        QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
172
        QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
173
         \
174
        : "+a"(src), "+c"(dst)\
175
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
176
        : "memory"\
177
    );\
178
}\
179
static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
180
    int h=4;\
181
    int w=3;\
182
    src -= 2*srcStride+2;\
183
    while(w--){\
184
        __asm__ volatile(\
185
            "pxor %%mm7, %%mm7      \n\t"\
186
            "movd (%0), %%mm0       \n\t"\
187
            "add %2, %0             \n\t"\
188
            "movd (%0), %%mm1       \n\t"\
189
            "add %2, %0             \n\t"\
190
            "movd (%0), %%mm2       \n\t"\
191
            "add %2, %0             \n\t"\
192
            "movd (%0), %%mm3       \n\t"\
193
            "add %2, %0             \n\t"\
194
            "movd (%0), %%mm4       \n\t"\
195
            "add %2, %0             \n\t"\
196
            "punpcklbw %%mm7, %%mm0 \n\t"\
197
            "punpcklbw %%mm7, %%mm1 \n\t"\
198
            "punpcklbw %%mm7, %%mm2 \n\t"\
199
            "punpcklbw %%mm7, %%mm3 \n\t"\
200
            "punpcklbw %%mm7, %%mm4 \n\t"\
201
            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
202
            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
203
            QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
204
            QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
205
             \
206
            : "+a"(src)\
207
            : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
208
            : "memory"\
209
        );\
210
        tmp += 4;\
211
        src += 4 - 9*srcStride;\
212
    }\
213
    tmp -= 3*4;\
214
    __asm__ volatile(\
215
        "1:                         \n\t"\
216
        "movq     (%0), %%mm0       \n\t"\
217
        "paddw  10(%0), %%mm0       \n\t"\
218
        "movq    2(%0), %%mm1       \n\t"\
219
        "paddw   8(%0), %%mm1       \n\t"\
220
        "movq    4(%0), %%mm2       \n\t"\
221
        "paddw   6(%0), %%mm2       \n\t"\
222
        "psubw %%mm1, %%mm0         \n\t"/*a-b   (abccba)*/\
223
        "psraw $2, %%mm0            \n\t"/*(a-b)/4 */\
224
        "psubw %%mm1, %%mm0         \n\t"/*(a-b)/4-b */\
225
        "paddsw %%mm2, %%mm0        \n\t"\
226
        "psraw $2, %%mm0            \n\t"/*((a-b)/4-b+c)/4 */\
227
        "paddw %%mm2, %%mm0         \n\t"/*(a-5*b+20*c)/16 */\
228
        "psraw $6, %%mm0            \n\t"\
229
        "packuswb %%mm0, %%mm0      \n\t"\
230
        OP(%%mm0, (%1),%%mm7, d)\
231
        "add $24, %0                \n\t"\
232
        "add %3, %1                 \n\t"\
233
        "decl %2                    \n\t"\
234
        " jnz 1b                    \n\t"\
235
        : "+a"(tmp), "+c"(dst), "+g"(h)\
236
        : "S"((x86_reg)dstStride)\
237
        : "memory"\
238
    );\
239
}\
240
\
241
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
242
    int h=8;\
243
    __asm__ volatile(\
244
        "pxor %%mm7, %%mm7          \n\t"\
245
        "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
246
        "1:                         \n\t"\
247
        "movq    (%0), %%mm0        \n\t"\
248
        "movq   1(%0), %%mm2        \n\t"\
249
        "movq %%mm0, %%mm1          \n\t"\
250
        "movq %%mm2, %%mm3          \n\t"\
251
        "punpcklbw %%mm7, %%mm0     \n\t"\
252
        "punpckhbw %%mm7, %%mm1     \n\t"\
253
        "punpcklbw %%mm7, %%mm2     \n\t"\
254
        "punpckhbw %%mm7, %%mm3     \n\t"\
255
        "paddw %%mm2, %%mm0         \n\t"\
256
        "paddw %%mm3, %%mm1         \n\t"\
257
        "psllw $2, %%mm0            \n\t"\
258
        "psllw $2, %%mm1            \n\t"\
259
        "movq   -1(%0), %%mm2       \n\t"\
260
        "movq    2(%0), %%mm4       \n\t"\
261
        "movq %%mm2, %%mm3          \n\t"\
262
        "movq %%mm4, %%mm5          \n\t"\
263
        "punpcklbw %%mm7, %%mm2     \n\t"\
264
        "punpckhbw %%mm7, %%mm3     \n\t"\
265
        "punpcklbw %%mm7, %%mm4     \n\t"\
266
        "punpckhbw %%mm7, %%mm5     \n\t"\
267
        "paddw %%mm4, %%mm2         \n\t"\
268
        "paddw %%mm3, %%mm5         \n\t"\
269
        "psubw %%mm2, %%mm0         \n\t"\
270
        "psubw %%mm5, %%mm1         \n\t"\
271
        "pmullw %%mm6, %%mm0        \n\t"\
272
        "pmullw %%mm6, %%mm1        \n\t"\
273
        "movd   -2(%0), %%mm2       \n\t"\
274
        "movd    7(%0), %%mm5       \n\t"\
275
        "punpcklbw %%mm7, %%mm2     \n\t"\
276
        "punpcklbw %%mm7, %%mm5     \n\t"\
277
        "paddw %%mm3, %%mm2         \n\t"\
278
        "paddw %%mm5, %%mm4         \n\t"\
279
        "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
280
        "paddw %%mm5, %%mm2         \n\t"\
281
        "paddw %%mm5, %%mm4         \n\t"\
282
        "paddw %%mm2, %%mm0         \n\t"\
283
        "paddw %%mm4, %%mm1         \n\t"\
284
        "psraw $5, %%mm0            \n\t"\
285
        "psraw $5, %%mm1            \n\t"\
286
        "packuswb %%mm1, %%mm0      \n\t"\
287
        OP(%%mm0, (%1),%%mm5, q)\
288
        "add %3, %0                 \n\t"\
289
        "add %4, %1                 \n\t"\
290
        "decl %2                    \n\t"\
291
        " jnz 1b                    \n\t"\
292
        : "+a"(src), "+c"(dst), "+g"(h)\
293
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
294
        : "memory"\
295
    );\
296
}\
297
\
298
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
299
    int h=8;\
300
    __asm__ volatile(\
301
        "pxor %%mm7, %%mm7          \n\t"\
302
        "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
303
        "1:                         \n\t"\
304
        "movq    (%0), %%mm0        \n\t"\
305
        "movq   1(%0), %%mm2        \n\t"\
306
        "movq %%mm0, %%mm1          \n\t"\
307
        "movq %%mm2, %%mm3          \n\t"\
308
        "punpcklbw %%mm7, %%mm0     \n\t"\
309
        "punpckhbw %%mm7, %%mm1     \n\t"\
310
        "punpcklbw %%mm7, %%mm2     \n\t"\
311
        "punpckhbw %%mm7, %%mm3     \n\t"\
312
        "paddw %%mm2, %%mm0         \n\t"\
313
        "paddw %%mm3, %%mm1         \n\t"\
314
        "psllw $2, %%mm0            \n\t"\
315
        "psllw $2, %%mm1            \n\t"\
316
        "movq   -1(%0), %%mm2       \n\t"\
317
        "movq    2(%0), %%mm4       \n\t"\
318
        "movq %%mm2, %%mm3          \n\t"\
319
        "movq %%mm4, %%mm5          \n\t"\
320
        "punpcklbw %%mm7, %%mm2     \n\t"\
321
        "punpckhbw %%mm7, %%mm3     \n\t"\
322
        "punpcklbw %%mm7, %%mm4     \n\t"\
323
        "punpckhbw %%mm7, %%mm5     \n\t"\
324
        "paddw %%mm4, %%mm2         \n\t"\
325
        "paddw %%mm3, %%mm5         \n\t"\
326
        "psubw %%mm2, %%mm0         \n\t"\
327
        "psubw %%mm5, %%mm1         \n\t"\
328
        "pmullw %%mm6, %%mm0        \n\t"\
329
        "pmullw %%mm6, %%mm1        \n\t"\
330
        "movd   -2(%0), %%mm2       \n\t"\
331
        "movd    7(%0), %%mm5       \n\t"\
332
        "punpcklbw %%mm7, %%mm2     \n\t"\
333
        "punpcklbw %%mm7, %%mm5     \n\t"\
334
        "paddw %%mm3, %%mm2         \n\t"\
335
        "paddw %%mm5, %%mm4         \n\t"\
336
        "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
337
        "paddw %%mm5, %%mm2         \n\t"\
338
        "paddw %%mm5, %%mm4         \n\t"\
339
        "paddw %%mm2, %%mm0         \n\t"\
340
        "paddw %%mm4, %%mm1         \n\t"\
341
        "psraw $5, %%mm0            \n\t"\
342
        "psraw $5, %%mm1            \n\t"\
343
        "movq (%2), %%mm4           \n\t"\
344
        "packuswb %%mm1, %%mm0      \n\t"\
345
        PAVGB" %%mm4, %%mm0         \n\t"\
346
        OP(%%mm0, (%1),%%mm5, q)\
347
        "add %5, %0                 \n\t"\
348
        "add %5, %1                 \n\t"\
349
        "add %4, %2                 \n\t"\
350
        "decl %3                    \n\t"\
351
        "jg 1b                      \n\t"\
352
        : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
353
        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
354
        : "memory"\
355
    );\
356
}\
357
\
358
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
359
    int w= 2;\
360
    src -= 2*srcStride;\
361
    \
362
    while(w--){\
363
      __asm__ volatile(\
364
        "pxor %%mm7, %%mm7          \n\t"\
365
        "movd (%0), %%mm0           \n\t"\
366
        "add %2, %0                 \n\t"\
367
        "movd (%0), %%mm1           \n\t"\
368
        "add %2, %0                 \n\t"\
369
        "movd (%0), %%mm2           \n\t"\
370
        "add %2, %0                 \n\t"\
371
        "movd (%0), %%mm3           \n\t"\
372
        "add %2, %0                 \n\t"\
373
        "movd (%0), %%mm4           \n\t"\
374
        "add %2, %0                 \n\t"\
375
        "punpcklbw %%mm7, %%mm0     \n\t"\
376
        "punpcklbw %%mm7, %%mm1     \n\t"\
377
        "punpcklbw %%mm7, %%mm2     \n\t"\
378
        "punpcklbw %%mm7, %%mm3     \n\t"\
379
        "punpcklbw %%mm7, %%mm4     \n\t"\
380
        QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
381
        QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
382
        QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
383
        QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
384
        QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
385
        QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
386
        QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
387
        QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
388
         \
389
        : "+a"(src), "+c"(dst)\
390
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
391
        : "memory"\
392
     );\
393
     if(h==16){\
394
        __asm__ volatile(\
395
            QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
396
            QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
397
            QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
398
            QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
399
            QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
400
            QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
401
            QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
402
            QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
403
            \
404
           : "+a"(src), "+c"(dst)\
405
           : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
406
           : "memory"\
407
        );\
408
     }\
409
     src += 4-(h+5)*srcStride;\
410
     dst += 4-h*dstStride;\
411
   }\
412
}\
413
static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
414
    int w = (size+8)>>2;\
415
    src -= 2*srcStride+2;\
416
    while(w--){\
417
        __asm__ volatile(\
418
            "pxor %%mm7, %%mm7      \n\t"\
419
            "movd (%0), %%mm0       \n\t"\
420
            "add %2, %0             \n\t"\
421
            "movd (%0), %%mm1       \n\t"\
422
            "add %2, %0             \n\t"\
423
            "movd (%0), %%mm2       \n\t"\
424
            "add %2, %0             \n\t"\
425
            "movd (%0), %%mm3       \n\t"\
426
            "add %2, %0             \n\t"\
427
            "movd (%0), %%mm4       \n\t"\
428
            "add %2, %0             \n\t"\
429
            "punpcklbw %%mm7, %%mm0 \n\t"\
430
            "punpcklbw %%mm7, %%mm1 \n\t"\
431
            "punpcklbw %%mm7, %%mm2 \n\t"\
432
            "punpcklbw %%mm7, %%mm3 \n\t"\
433
            "punpcklbw %%mm7, %%mm4 \n\t"\
434
            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
435
            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
436
            QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
437
            QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
438
            QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
439
            QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
440
            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
441
            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
442
            : "+a"(src)\
443
            : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
444
            : "memory"\
445
        );\
446
        if(size==16){\
447
            __asm__ volatile(\
448
                QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1,  8*48)\
449
                QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2,  9*48)\
450
                QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
451
                QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
452
                QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
453
                QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
454
                QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
455
                QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
456
                : "+a"(src)\
457
                : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
458
                : "memory"\
459
            );\
460
        }\
461
        tmp += 4;\
462
        src += 4 - (size+5)*srcStride;\
463
    }\
464
}\
465
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
466
    int w = size>>4;\
467
    do{\
468
    int h = size;\
469
    __asm__ volatile(\
470
        "1:                         \n\t"\
471
        "movq     (%0), %%mm0       \n\t"\
472
        "movq    8(%0), %%mm3       \n\t"\
473
        "movq    2(%0), %%mm1       \n\t"\
474
        "movq   10(%0), %%mm4       \n\t"\
475
        "paddw   %%mm4, %%mm0       \n\t"\
476
        "paddw   %%mm3, %%mm1       \n\t"\
477
        "paddw  18(%0), %%mm3       \n\t"\
478
        "paddw  16(%0), %%mm4       \n\t"\
479
        "movq    4(%0), %%mm2       \n\t"\
480
        "movq   12(%0), %%mm5       \n\t"\
481
        "paddw   6(%0), %%mm2       \n\t"\
482
        "paddw  14(%0), %%mm5       \n\t"\
483
        "psubw %%mm1, %%mm0         \n\t"\
484
        "psubw %%mm4, %%mm3         \n\t"\
485
        "psraw $2, %%mm0            \n\t"\
486
        "psraw $2, %%mm3            \n\t"\
487
        "psubw %%mm1, %%mm0         \n\t"\
488
        "psubw %%mm4, %%mm3         \n\t"\
489
        "paddsw %%mm2, %%mm0        \n\t"\
490
        "paddsw %%mm5, %%mm3        \n\t"\
491
        "psraw $2, %%mm0            \n\t"\
492
        "psraw $2, %%mm3            \n\t"\
493
        "paddw %%mm2, %%mm0         \n\t"\
494
        "paddw %%mm5, %%mm3         \n\t"\
495
        "psraw $6, %%mm0            \n\t"\
496
        "psraw $6, %%mm3            \n\t"\
497
        "packuswb %%mm3, %%mm0      \n\t"\
498
        OP(%%mm0, (%1),%%mm7, q)\
499
        "add $48, %0                \n\t"\
500
        "add %3, %1                 \n\t"\
501
        "decl %2                    \n\t"\
502
        " jnz 1b                    \n\t"\
503
        : "+a"(tmp), "+c"(dst), "+g"(h)\
504
        : "S"((x86_reg)dstStride)\
505
        : "memory"\
506
    );\
507
    tmp += 8 - size*24;\
508
    dst += 8 - size*dstStride;\
509
    }while(w--);\
510
}\
511
\
512
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
513
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
514
}\
515
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
516
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
517
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
518
}\
519
\
520
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
521
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
522
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
523
    src += 8*srcStride;\
524
    dst += 8*dstStride;\
525
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
526
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
527
}\
528
\
529
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
530
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
531
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
532
    src += 8*dstStride;\
533
    dst += 8*dstStride;\
534
    src2 += 8*src2Stride;\
535
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
536
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
537
}\
538
\
539
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
540
          put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
541
    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
542
}\
543
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
544
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride, 8);\
545
}\
546
\
547
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
548
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride, 16);\
549
}\
550
\
551
static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
552
{\
553
    __asm__ volatile(\
554
        "movq      (%1), %%mm0          \n\t"\
555
        "movq    24(%1), %%mm1          \n\t"\
556
        "psraw      $5,  %%mm0          \n\t"\
557
        "psraw      $5,  %%mm1          \n\t"\
558
        "packuswb %%mm0, %%mm0          \n\t"\
559
        "packuswb %%mm1, %%mm1          \n\t"\
560
        PAVGB"     (%0), %%mm0          \n\t"\
561
        PAVGB"  (%0,%3), %%mm1          \n\t"\
562
        OP(%%mm0, (%2),    %%mm4, d)\
563
        OP(%%mm1, (%2,%4), %%mm5, d)\
564
        "lea  (%0,%3,2), %0             \n\t"\
565
        "lea  (%2,%4,2), %2             \n\t"\
566
        "movq    48(%1), %%mm0          \n\t"\
567
        "movq    72(%1), %%mm1          \n\t"\
568
        "psraw      $5,  %%mm0          \n\t"\
569
        "psraw      $5,  %%mm1          \n\t"\
570
        "packuswb %%mm0, %%mm0          \n\t"\
571
        "packuswb %%mm1, %%mm1          \n\t"\
572
        PAVGB"     (%0), %%mm0          \n\t"\
573
        PAVGB"  (%0,%3), %%mm1          \n\t"\
574
        OP(%%mm0, (%2),    %%mm4, d)\
575
        OP(%%mm1, (%2,%4), %%mm5, d)\
576
        :"+a"(src8), "+c"(src16), "+d"(dst)\
577
        :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\
578
        :"memory");\
579
}\
580
static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
581
{\
582
    do{\
583
    __asm__ volatile(\
584
        "movq      (%1), %%mm0          \n\t"\
585
        "movq     8(%1), %%mm1          \n\t"\
586
        "movq    48(%1), %%mm2          \n\t"\
587
        "movq  8+48(%1), %%mm3          \n\t"\
588
        "psraw      $5,  %%mm0          \n\t"\
589
        "psraw      $5,  %%mm1          \n\t"\
590
        "psraw      $5,  %%mm2          \n\t"\
591
        "psraw      $5,  %%mm3          \n\t"\
592
        "packuswb %%mm1, %%mm0          \n\t"\
593
        "packuswb %%mm3, %%mm2          \n\t"\
594
        PAVGB"     (%0), %%mm0          \n\t"\
595
        PAVGB"  (%0,%3), %%mm2          \n\t"\
596
        OP(%%mm0, (%2), %%mm5, q)\
597
        OP(%%mm2, (%2,%4), %%mm5, q)\
598
        ::"a"(src8), "c"(src16), "d"(dst),\
599
          "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
600
        :"memory");\
601
        src8 += 2L*src8Stride;\
602
        src16 += 48;\
603
        dst += 2L*dstStride;\
604
    }while(h-=2);\
605
}\
606
static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
607
{\
608
    OPNAME ## pixels8_l2_shift5_ ## MMX(dst  , src16  , src8  , dstStride, src8Stride, h);\
609
    OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
610
}\
611

    
612

    
613
#if ARCH_X86_64
614
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
615
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
616
    int h=16;\
617
    __asm__ volatile(\
618
        "pxor %%xmm15, %%xmm15      \n\t"\
619
        "movdqa %6, %%xmm14         \n\t"\
620
        "movdqa %7, %%xmm13         \n\t"\
621
        "1:                         \n\t"\
622
        "lddqu    6(%0), %%xmm1     \n\t"\
623
        "lddqu   -2(%0), %%xmm7     \n\t"\
624
        "movdqa  %%xmm1, %%xmm0     \n\t"\
625
        "punpckhbw %%xmm15, %%xmm1  \n\t"\
626
        "punpcklbw %%xmm15, %%xmm0  \n\t"\
627
        "punpcklbw %%xmm15, %%xmm7  \n\t"\
628
        "movdqa  %%xmm1, %%xmm2     \n\t"\
629
        "movdqa  %%xmm0, %%xmm6     \n\t"\
630
        "movdqa  %%xmm1, %%xmm3     \n\t"\
631
        "movdqa  %%xmm0, %%xmm8     \n\t"\
632
        "movdqa  %%xmm1, %%xmm4     \n\t"\
633
        "movdqa  %%xmm0, %%xmm9     \n\t"\
634
        "movdqa  %%xmm0, %%xmm12    \n\t"\
635
        "movdqa  %%xmm1, %%xmm11    \n\t"\
636
        "palignr $10,%%xmm0, %%xmm11\n\t"\
637
        "palignr $10,%%xmm7, %%xmm12\n\t"\
638
        "palignr $2, %%xmm0, %%xmm4 \n\t"\
639
        "palignr $2, %%xmm7, %%xmm9 \n\t"\
640
        "palignr $4, %%xmm0, %%xmm3 \n\t"\
641
        "palignr $4, %%xmm7, %%xmm8 \n\t"\
642
        "palignr $6, %%xmm0, %%xmm2 \n\t"\
643
        "palignr $6, %%xmm7, %%xmm6 \n\t"\
644
        "paddw   %%xmm0 ,%%xmm11    \n\t"\
645
        "palignr $8, %%xmm0, %%xmm1 \n\t"\
646
        "palignr $8, %%xmm7, %%xmm0 \n\t"\
647
        "paddw   %%xmm12,%%xmm7     \n\t"\
648
        "paddw   %%xmm3, %%xmm2     \n\t"\
649
        "paddw   %%xmm8, %%xmm6     \n\t"\
650
        "paddw   %%xmm4, %%xmm1     \n\t"\
651
        "paddw   %%xmm9, %%xmm0     \n\t"\
652
        "psllw   $2,     %%xmm2     \n\t"\
653
        "psllw   $2,     %%xmm6     \n\t"\
654
        "psubw   %%xmm1, %%xmm2     \n\t"\
655
        "psubw   %%xmm0, %%xmm6     \n\t"\
656
        "paddw   %%xmm13,%%xmm11    \n\t"\
657
        "paddw   %%xmm13,%%xmm7     \n\t"\
658
        "pmullw  %%xmm14,%%xmm2     \n\t"\
659
        "pmullw  %%xmm14,%%xmm6     \n\t"\
660
        "lddqu   (%2),   %%xmm3     \n\t"\
661
        "paddw   %%xmm11,%%xmm2     \n\t"\
662
        "paddw   %%xmm7, %%xmm6     \n\t"\
663
        "psraw   $5,     %%xmm2     \n\t"\
664
        "psraw   $5,     %%xmm6     \n\t"\
665
        "packuswb %%xmm2,%%xmm6     \n\t"\
666
        "pavgb   %%xmm3, %%xmm6     \n\t"\
667
        OP(%%xmm6, (%1), %%xmm4, dqa)\
668
        "add %5, %0                 \n\t"\
669
        "add %5, %1                 \n\t"\
670
        "add %4, %2                 \n\t"\
671
        "decl %3                    \n\t"\
672
        "jg 1b                      \n\t"\
673
        : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
674
        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
675
          "m"(ff_pw_5), "m"(ff_pw_16)\
676
        : "memory"\
677
    );\
678
}
679
#else // ARCH_X86_64
/* 16-wide horizontal lowpass + l2 average, built from four 8-wide calls
 * (fallback when the 16-register x86_64 version above is unavailable). */
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}
#endif // ARCH_X86_64
/* Horizontal H.264 qpel lowpass, SSSE3 (palignr) version.
 * The asm computes 20*(c+d) - 5*(b+e) + (a+f) + 16 >> 5 per pixel, i.e. the
 * H.264 6-tap (1,-5,20,20,-5,1) half-pel filter, on 8 pixels per row.
 * The _l2_ variant additionally pavgb's the result with src2 (for quarter-pel
 * positions). OP is PUT_OP or an AVG op; MMX is the cpu-flavor name suffix. */
#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%xmm7, %%xmm7        \n\t"\
        "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\
        "1:                         \n\t"\
        "lddqu   -2(%0), %%xmm1     \n\t"\
        "movdqa  %%xmm1, %%xmm0     \n\t"\
        "punpckhbw %%xmm7, %%xmm1   \n\t"\
        "punpcklbw %%xmm7, %%xmm0   \n\t"\
        "movdqa  %%xmm1, %%xmm2     \n\t"\
        "movdqa  %%xmm1, %%xmm3     \n\t"\
        "movdqa  %%xmm1, %%xmm4     \n\t"\
        "movdqa  %%xmm1, %%xmm5     \n\t"\
        "palignr $2, %%xmm0, %%xmm4 \n\t"\
        "palignr $4, %%xmm0, %%xmm3 \n\t"\
        "palignr $6, %%xmm0, %%xmm2 \n\t"\
        "palignr $8, %%xmm0, %%xmm1 \n\t"\
        "palignr $10,%%xmm0, %%xmm5 \n\t"\
        "paddw   %%xmm5, %%xmm0     \n\t"\
        "paddw   %%xmm3, %%xmm2     \n\t"\
        "paddw   %%xmm4, %%xmm1     \n\t"\
        "psllw   $2,     %%xmm2     \n\t"\
        "movq    (%2),   %%xmm3     \n\t"\
        "psubw   %%xmm1, %%xmm2     \n\t"\
        "paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\
        "pmullw  %%xmm6, %%xmm2     \n\t"\
        "paddw   %%xmm0, %%xmm2     \n\t"\
        "psraw   $5,     %%xmm2     \n\t"\
        "packuswb %%xmm2, %%xmm2    \n\t"\
        "pavgb   %%xmm3, %%xmm2     \n\t"\
        OP(%%xmm2, (%1), %%xmm4, q)\
        "add %5, %0                 \n\t"\
        "add %5, %1                 \n\t"\
        "add %4, %2                 \n\t"\
        "decl %3                    \n\t"\
        "jg 1b                      \n\t"\
        : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
        : "memory"\
    );\
}\
QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%xmm7, %%xmm7        \n\t"\
        "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\
        "1:                         \n\t"\
        "lddqu   -2(%0), %%xmm1     \n\t"\
        "movdqa  %%xmm1, %%xmm0     \n\t"\
        "punpckhbw %%xmm7, %%xmm1   \n\t"\
        "punpcklbw %%xmm7, %%xmm0   \n\t"\
        "movdqa  %%xmm1, %%xmm2     \n\t"\
        "movdqa  %%xmm1, %%xmm3     \n\t"\
        "movdqa  %%xmm1, %%xmm4     \n\t"\
        "movdqa  %%xmm1, %%xmm5     \n\t"\
        "palignr $2, %%xmm0, %%xmm4 \n\t"\
        "palignr $4, %%xmm0, %%xmm3 \n\t"\
        "palignr $6, %%xmm0, %%xmm2 \n\t"\
        "palignr $8, %%xmm0, %%xmm1 \n\t"\
        "palignr $10,%%xmm0, %%xmm5 \n\t"\
        "paddw   %%xmm5, %%xmm0     \n\t"\
        "paddw   %%xmm3, %%xmm2     \n\t"\
        "paddw   %%xmm4, %%xmm1     \n\t"\
        "psllw   $2,     %%xmm2     \n\t"\
        "psubw   %%xmm1, %%xmm2     \n\t"\
        "paddw   "MANGLE(ff_pw_16)", %%xmm0\n\t"\
        "pmullw  %%xmm6, %%xmm2     \n\t"\
        "paddw   %%xmm0, %%xmm2     \n\t"\
        "psraw   $5,     %%xmm2     \n\t"\
        "packuswb %%xmm2, %%xmm2    \n\t"\
        OP(%%xmm2, (%1), %%xmm4, q)\
        "add %3, %0                 \n\t"\
        "add %4, %1                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        : "+a"(src), "+c"(dst), "+g"(h)\
        : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
        : "memory"\
    );\
}\
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}
/* Vertical H.264 qpel lowpass (SSE2). Loads 5 rows, zero-extends bytes to
 * words, then runs the 6-tap filter down the column via the QPEL_H264V_XMM
 * step macro, rotating the xmm0-xmm5 ring each row. h is 8 or 16; for 16 a
 * second asm block continues with the register state left by the first
 * (registers are not call-clobbered between the two volatile asms here —
 * NOTE(review): this relies on no intervening code touching xmm regs). */
#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    src -= 2*srcStride;\
    \
    __asm__ volatile(\
        "pxor %%xmm7, %%xmm7        \n\t"\
        "movq (%0), %%xmm0          \n\t"\
        "add %2, %0                 \n\t"\
        "movq (%0), %%xmm1          \n\t"\
        "add %2, %0                 \n\t"\
        "movq (%0), %%xmm2          \n\t"\
        "add %2, %0                 \n\t"\
        "movq (%0), %%xmm3          \n\t"\
        "add %2, %0                 \n\t"\
        "movq (%0), %%xmm4          \n\t"\
        "add %2, %0                 \n\t"\
        "punpcklbw %%xmm7, %%xmm0   \n\t"\
        "punpcklbw %%xmm7, %%xmm1   \n\t"\
        "punpcklbw %%xmm7, %%xmm2   \n\t"\
        "punpcklbw %%xmm7, %%xmm3   \n\t"\
        "punpcklbw %%xmm7, %%xmm4   \n\t"\
        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
        QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
        QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
        QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
        QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
         \
        : "+a"(src), "+c"(dst)\
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
        : "memory"\
    );\
    if(h==16){\
        __asm__ volatile(\
            QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
            QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
            QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
            QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
            QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
            QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
            QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
            QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
            \
            : "+a"(src), "+c"(dst)\
            : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
            : "memory"\
        );\
    }\
}\
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
}\
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}
/**
 * First (vertical) pass of the HV lowpass: filters 8-pixel-wide column
 * strips vertically with the 6-tap filter and stores intermediate 16-bit
 * words into tmp (row pitch 48 bytes = 24 int16, see the N*48 offsets).
 * size is 8 or 16; for size 16 the strip is two passes of 8 rows and the
 * loop runs twice (w = (size+8)>>3) to cover size+5 input columns.
 * NOTE(review): tmpStride is unused here — the 48-byte pitch is hard-coded
 * in QPEL_H264HV_XMM; confirm against the hv2 pass before changing it.
 */
static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
    int w = (size+8)>>3;
    src -= 2*srcStride+2;
    while(w--){
        __asm__ volatile(
            "pxor %%xmm7, %%xmm7        \n\t"
            "movq (%0), %%xmm0          \n\t"
            "add %2, %0                 \n\t"
            "movq (%0), %%xmm1          \n\t"
            "add %2, %0                 \n\t"
            "movq (%0), %%xmm2          \n\t"
            "add %2, %0                 \n\t"
            "movq (%0), %%xmm3          \n\t"
            "add %2, %0                 \n\t"
            "movq (%0), %%xmm4          \n\t"
            "add %2, %0                 \n\t"
            "punpcklbw %%xmm7, %%xmm0   \n\t"
            "punpcklbw %%xmm7, %%xmm1   \n\t"
            "punpcklbw %%xmm7, %%xmm2   \n\t"
            "punpcklbw %%xmm7, %%xmm3   \n\t"
            "punpcklbw %%xmm7, %%xmm4   \n\t"
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48)
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
            QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
            QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
            QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
            QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
            : "+a"(src)
            : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
            : "memory"
        );
        if(size==16){
            __asm__ volatile(
                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1,  8*48)
                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2,  9*48)
                QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
                QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
                QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
                QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
                : "+a"(src)
                : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
                : "memory"
            );
        }
        tmp += 8;
        src += 8 - (size+5)*srcStride;
    }
}
/* Second (horizontal) pass of the HV lowpass (SSSE3): reads the 16-bit
 * intermediate rows produced by hv1 (48-byte pitch, hence "add $48"),
 * applies the horizontal 6-tap filter with the factored arithmetic
 * ((a - b) >> 2 - b + c) >> 2 + c, rounds with >>6 and packs to bytes.
 * size==16 processes two 8-word halves per row; otherwise one. */
#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
    int h = size;\
    if(size == 16){\
        __asm__ volatile(\
            "1:                         \n\t"\
            "movdqa 32(%0), %%xmm4      \n\t"\
            "movdqa 16(%0), %%xmm5      \n\t"\
            "movdqa   (%0), %%xmm7      \n\t"\
            "movdqa %%xmm4, %%xmm3      \n\t"\
            "movdqa %%xmm4, %%xmm2      \n\t"\
            "movdqa %%xmm4, %%xmm1      \n\t"\
            "movdqa %%xmm4, %%xmm0      \n\t"\
            "palignr $10, %%xmm5, %%xmm0 \n\t"\
            "palignr  $8, %%xmm5, %%xmm1 \n\t"\
            "palignr  $6, %%xmm5, %%xmm2 \n\t"\
            "palignr  $4, %%xmm5, %%xmm3 \n\t"\
            "palignr  $2, %%xmm5, %%xmm4 \n\t"\
            "paddw  %%xmm5, %%xmm0      \n\t"\
            "paddw  %%xmm4, %%xmm1      \n\t"\
            "paddw  %%xmm3, %%xmm2      \n\t"\
            "movdqa %%xmm5, %%xmm6      \n\t"\
            "movdqa %%xmm5, %%xmm4      \n\t"\
            "movdqa %%xmm5, %%xmm3      \n\t"\
            "palignr  $8, %%xmm7, %%xmm4 \n\t"\
            "palignr  $2, %%xmm7, %%xmm6 \n\t"\
            "palignr $10, %%xmm7, %%xmm3 \n\t"\
            "paddw  %%xmm6, %%xmm4      \n\t"\
            "movdqa %%xmm5, %%xmm6      \n\t"\
            "palignr  $6, %%xmm7, %%xmm5 \n\t"\
            "palignr  $4, %%xmm7, %%xmm6 \n\t"\
            "paddw  %%xmm7, %%xmm3      \n\t"\
            "paddw  %%xmm6, %%xmm5      \n\t"\
            \
            "psubw  %%xmm1, %%xmm0      \n\t"\
            "psubw  %%xmm4, %%xmm3      \n\t"\
            "psraw      $2, %%xmm0      \n\t"\
            "psraw      $2, %%xmm3      \n\t"\
            "psubw  %%xmm1, %%xmm0      \n\t"\
            "psubw  %%xmm4, %%xmm3      \n\t"\
            "paddw  %%xmm2, %%xmm0      \n\t"\
            "paddw  %%xmm5, %%xmm3      \n\t"\
            "psraw      $2, %%xmm0      \n\t"\
            "psraw      $2, %%xmm3      \n\t"\
            "paddw  %%xmm2, %%xmm0      \n\t"\
            "paddw  %%xmm5, %%xmm3      \n\t"\
            "psraw      $6, %%xmm0      \n\t"\
            "psraw      $6, %%xmm3      \n\t"\
            "packuswb %%xmm0, %%xmm3    \n\t"\
            OP(%%xmm3, (%1), %%xmm7, dqa)\
            "add $48, %0                \n\t"\
            "add %3, %1                 \n\t"\
            "decl %2                    \n\t"\
            " jnz 1b                    \n\t"\
            : "+a"(tmp), "+c"(dst), "+g"(h)\
            : "S"((x86_reg)dstStride)\
            : "memory"\
        );\
    }else{\
        __asm__ volatile(\
            "1:                         \n\t"\
            "movdqa 16(%0), %%xmm1      \n\t"\
            "movdqa   (%0), %%xmm0      \n\t"\
            "movdqa %%xmm1, %%xmm2      \n\t"\
            "movdqa %%xmm1, %%xmm3      \n\t"\
            "movdqa %%xmm1, %%xmm4      \n\t"\
            "movdqa %%xmm1, %%xmm5      \n\t"\
            "palignr $10, %%xmm0, %%xmm5 \n\t"\
            "palignr  $8, %%xmm0, %%xmm4 \n\t"\
            "palignr  $6, %%xmm0, %%xmm3 \n\t"\
            "palignr  $4, %%xmm0, %%xmm2 \n\t"\
            "palignr  $2, %%xmm0, %%xmm1 \n\t"\
            "paddw  %%xmm5, %%xmm0      \n\t"\
            "paddw  %%xmm4, %%xmm1      \n\t"\
            "paddw  %%xmm3, %%xmm2      \n\t"\
            "psubw  %%xmm1, %%xmm0      \n\t"\
            "psraw      $2, %%xmm0      \n\t"\
            "psubw  %%xmm1, %%xmm0      \n\t"\
            "paddw  %%xmm2, %%xmm0      \n\t"\
            "psraw      $2, %%xmm0      \n\t"\
            "paddw  %%xmm2, %%xmm0      \n\t"\
            "psraw      $6, %%xmm0      \n\t"\
            "packuswb %%xmm0, %%xmm0    \n\t"\
            OP(%%xmm0, (%1), %%xmm7, q)\
            "add $48, %0                \n\t"\
            "add %3, %1                 \n\t"\
            "decl %2                    \n\t"\
            " jnz 1b                    \n\t"\
            : "+a"(tmp), "+c"(dst), "+g"(h)\
            : "S"((x86_reg)dstStride)\
            : "memory"\
        );\
    }\
}
/* Combined HV lowpass: vertical pass into the 16-bit tmp buffer, then
 * horizontal pass from tmp into dst. Plus the 8- and 16-pixel wrappers. */
#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
    put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}
/* The SSE2/SSSE3 builds reuse the MMX2 l2-averaging helpers. */
#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2
#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2
#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2
#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2
#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2
/* Likewise, the l2_shift5 helpers (used by mc12/mc32) come from MMX2. */
#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2
#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2
/* No SSE2 horizontal lowpass_l2 (palignr needs SSSE3); fall back to MMX2. */
#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2
#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2
#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2
#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2
/* The vertical lowpass gains nothing from SSSE3; reuse the SSE2 version. */
#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2
#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2
/* SSE2 has no palignr, so the hv2 pass falls back to the MMX2 version. */
#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2
#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2
/* Instantiate the whole set of 16 qpel MC functions (mc00..mc33) for one
 * OPNAME (put_/avg_), block SIZE and cpu flavor MMX. */
#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)
/* mc00 (full-pel) for 16x16: plain 16-pixel copy/average. */
static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_sse2(dst, src, stride, 16);
}
static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_sse2(dst, src, stride, 16);
}
/* 8x8 full-pel copy has no SSE2 win; reuse the MMX2 functions. */
#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2
/* mc00: integer-pel position, straight pixel copy/average. */
#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
}
/* Horizontal quarter/half-pel positions:
 * mc10 = avg(src, H(src)), mc20 = H(src), mc30 = avg(src+1, H(src)). */
#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
}
/* Vertical quarter/half-pel positions:
 * mc01 = avg(src, V(src)), mc02 = V(src), mc03 = avg(src+stride, V(src)).
 * The quarter-pel cases filter into a stack temp, then average with src. */
#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
}
/* Diagonal / center positions. mc11/mc31/mc13/mc33 average a vertical
 * lowpass temp with the horizontal lowpass; mc22 is the pure 2D filter;
 * mc21/mc23 average H with HV; mc12/mc32 average V (from the 16-bit hv1
 * intermediate, hence _l2_shift5) with HV. temp holds halfHV bytes first,
 * then the int16 halfV plane at offset SIZE*SIZE. */
#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}
/* Instantiate put/avg MC sets for 4x4, 8x8 and 16x16 (8-byte alignment). */
#define H264_MC_4816(MMX)\
H264_MC(put_, 4, MMX, 8)\
H264_MC(put_, 8, MMX, 8)\
H264_MC(put_, 16,MMX, 8)\
H264_MC(avg_, 4, MMX, 8)\
H264_MC(avg_, 8, MMX, 8)\
H264_MC(avg_, 16,MMX, 8)
/* Instantiate one MC family (QPEL = H264_MC_V/_H/_HV) for 8 and 16 pixels
 * with 16-byte alignment, as required by the XMM code paths. */
#define H264_MC_816(QPEL, XMM)\
QPEL(put_, 8, XMM, 16)\
QPEL(put_, 16,XMM, 16)\
QPEL(avg_, 8, XMM, 16)\
QPEL(avg_, 16,XMM, 16)
/* Store ops used as the OP parameter: average register a with memory b and
 * write back (3DNow! pavgusb vs MMX2/SSE pavgb). */
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgusb " #temp ", " #a "        \n\t"\
"mov" #size " " #a ", " #b "      \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgb " #temp ", " #a "          \n\t"\
"mov" #size " " #a ", " #b "      \n\t"
#define PAVGB "pavgusb"
1176
QPEL_H264(put_,       PUT_OP, 3dnow)
1177
QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
1178
#undef PAVGB
1179
#define PAVGB "pavgb"
1180
QPEL_H264(put_,       PUT_OP, mmx2)
1181
QPEL_H264(avg_,  AVG_MMX2_OP, mmx2)
1182
QPEL_H264_V_XMM(put_,       PUT_OP, sse2)
1183
QPEL_H264_V_XMM(avg_,  AVG_MMX2_OP, sse2)
1184
QPEL_H264_HV_XMM(put_,       PUT_OP, sse2)
1185
QPEL_H264_HV_XMM(avg_,  AVG_MMX2_OP, sse2)
1186
#if HAVE_SSSE3
1187
QPEL_H264_H_XMM(put_,       PUT_OP, ssse3)
1188
QPEL_H264_H_XMM(avg_,  AVG_MMX2_OP, ssse3)
1189
QPEL_H264_HV2_XMM(put_,       PUT_OP, ssse3)
1190
QPEL_H264_HV2_XMM(avg_,  AVG_MMX2_OP, ssse3)
1191
QPEL_H264_HV_XMM(put_,       PUT_OP, ssse3)
1192
QPEL_H264_HV_XMM(avg_,  AVG_MMX2_OP, ssse3)
1193
#endif
1194
#undef PAVGB
1195

    
1196
/* Finally emit the public mcXY entry points for every cpu flavor. */
H264_MC_4816(3dnow)
H264_MC_4816(mmx2)
H264_MC_816(H264_MC_V, sse2)
H264_MC_816(H264_MC_HV, sse2)
#if HAVE_SSSE3
H264_MC_816(H264_MC_H, ssse3)
H264_MC_816(H264_MC_HV, ssse3)
#endif