/*
 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "dsputil_mmx.h"

/***********************************/
/* motion compensation */
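
/* Note (editorial; not in the upstream file): in QPEL_H264V_MM and
 * QPEL_H264HV_MM below, A..F are the six vertical filter taps, T is a
 * scratch register, Z is a zeroed register used to unpack bytes to words,
 * OP is the store operation (put or avg), OF is the byte offset into the
 * 16-bit temporary buffer, and d/q (q/dqa for the XMM variants) select the
 * move width for pixel loads/stores and full-register copies. */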

#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\
        "mov"#q" "#C", "#T"         \n\t"\
        "mov"#d" (%0), "#F"         \n\t"\
        "paddw "#D", "#T"           \n\t"\
        "psllw $2, "#T"             \n\t"\
        "psubw "#B", "#T"           \n\t"\
        "psubw "#E", "#T"           \n\t"\
        "punpcklbw "#Z", "#F"       \n\t"\
        "pmullw "MANGLE(ff_pw_5)", "#T"\n\t"\
        "paddw "MANGLE(ff_pw_16)", "#A"\n\t"\
        "add %2, %0                 \n\t"\
        "paddw "#F", "#A"           \n\t"\
        "paddw "#A", "#T"           \n\t"\
        "psraw $5, "#T"             \n\t"\
        "packuswb "#T", "#T"        \n\t"\
        OP(T, (%1), A, d)\
        "add %3, %1                 \n\t"

#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\
        "mov"#q" "#C", "#T"         \n\t"\
        "mov"#d" (%0), "#F"         \n\t"\
        "paddw "#D", "#T"           \n\t"\
        "psllw $2, "#T"             \n\t"\
        "paddw "MANGLE(ff_pw_16)", "#A"\n\t"\
        "psubw "#B", "#T"           \n\t"\
        "psubw "#E", "#T"           \n\t"\
        "punpcklbw "#Z", "#F"       \n\t"\
        "pmullw "MANGLE(ff_pw_5)", "#T"\n\t"\
        "paddw "#F", "#A"           \n\t"\
        "add %2, %0                 \n\t"\
        "paddw "#A", "#T"           \n\t"\
        "mov"#q" "#T", "#OF"(%1)    \n\t"

#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q)
#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q)
#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa)
#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa)
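
/* Editorial sketch (not in the upstream file): QPEL_H264V_MM computes the
 * H.264 6-tap luma lowpass filter
 *     out = clip8((A - 5*B + 20*C + 20*D - 5*E + F + 16) >> 5)
 * as 5*(((C+D)<<2) - B - E) + (A + F + 16), then shifts and saturates;
 * QPEL_H264HV_MM stores the same sum unshifted as a 16-bit intermediate.
 * A hypothetical scalar equivalent for checking the assembly against: */
#if 0
static inline uint8_t h264_qpel_lowpass_ref(int A, int B, int C, int D,
                                            int E, int F)
{
    /* 6-tap filter (1,-5,20,20,-5,1) with rounding, as in the macros above */
    int v = (A - 5*B + 20*C + 20*D - 5*E + F + 16) >> 5;
    return v < 0 ? 0 : v > 255 ? 255 : v;   /* packuswb-style saturation */
}
#endif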


#define QPEL_H264(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h=4;\
\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "movq "MANGLE(ff_pw_5) ", %%mm4\n\t"\
        "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
        "1:                         \n\t"\
        "movd  -1(%0), %%mm1        \n\t"\
        "movd    (%0), %%mm2        \n\t"\
        "movd   1(%0), %%mm3        \n\t"\
        "movd   2(%0), %%mm0        \n\t"\
        "punpcklbw %%mm7, %%mm1     \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpcklbw %%mm7, %%mm3     \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "paddw %%mm0, %%mm1         \n\t"\
        "paddw %%mm3, %%mm2         \n\t"\
        "movd  -2(%0), %%mm0        \n\t"\
        "movd   3(%0), %%mm3        \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpcklbw %%mm7, %%mm3     \n\t"\
        "paddw %%mm3, %%mm0         \n\t"\
        "psllw $2, %%mm2            \n\t"\
        "psubw %%mm1, %%mm2         \n\t"\
        "pmullw %%mm4, %%mm2        \n\t"\
        "paddw %%mm5, %%mm0         \n\t"\
        "paddw %%mm2, %%mm0         \n\t"\
        "psraw $5, %%mm0            \n\t"\
        "packuswb %%mm0, %%mm0      \n\t"\
        OP(%%mm0, (%1),%%mm6, d)\
        "add %3, %0                 \n\t"\
        "add %4, %1                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        : "+a"(src), "+c"(dst), "+g"(h)\
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
        : "memory"\
    );\
}\
static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    int h=4;\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "movq %0, %%mm4             \n\t"\
        "movq %1, %%mm5             \n\t"\
        :: "m"(ff_pw_5), "m"(ff_pw_16)\
    );\
    do{\
    __asm__ volatile(\
        "movd  -1(%0), %%mm1        \n\t"\
        "movd    (%0), %%mm2        \n\t"\
        "movd   1(%0), %%mm3        \n\t"\
        "movd   2(%0), %%mm0        \n\t"\
        "punpcklbw %%mm7, %%mm1     \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpcklbw %%mm7, %%mm3     \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "paddw %%mm0, %%mm1         \n\t"\
        "paddw %%mm3, %%mm2         \n\t"\
        "movd  -2(%0), %%mm0        \n\t"\
        "movd   3(%0), %%mm3        \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpcklbw %%mm7, %%mm3     \n\t"\
        "paddw %%mm3, %%mm0         \n\t"\
        "psllw $2, %%mm2            \n\t"\
        "psubw %%mm1, %%mm2         \n\t"\
        "pmullw %%mm4, %%mm2        \n\t"\
        "paddw %%mm5, %%mm0         \n\t"\
        "paddw %%mm2, %%mm0         \n\t"\
        "movd   (%2), %%mm3         \n\t"\
        "psraw $5, %%mm0            \n\t"\
        "packuswb %%mm0, %%mm0      \n\t"\
        PAVGB" %%mm3, %%mm0         \n\t"\
        OP(%%mm0, (%1),%%mm6, d)\
        "add %4, %0                 \n\t"\
        "add %4, %1                 \n\t"\
        "add %3, %2                 \n\t"\
        : "+a"(src), "+c"(dst), "+d"(src2)\
        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
        : "memory"\
    );\
    }while(--h);\
}\
static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    src -= 2*srcStride;\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "movd (%0), %%mm0           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm1           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm2           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm3           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm4           \n\t"\
        "add %2, %0                 \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpcklbw %%mm7, %%mm1     \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpcklbw %%mm7, %%mm3     \n\t"\
        "punpcklbw %%mm7, %%mm4     \n\t"\
        QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
        QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
        QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
        QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
         \
        : "+a"(src), "+c"(dst)\
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
        : "memory"\
    );\
}\
static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    int h=4;\
    int w=3;\
    src -= 2*srcStride+2;\
    while(w--){\
        __asm__ volatile(\
            "pxor %%mm7, %%mm7      \n\t"\
            "movd (%0), %%mm0       \n\t"\
            "add %2, %0             \n\t"\
            "movd (%0), %%mm1       \n\t"\
            "add %2, %0             \n\t"\
            "movd (%0), %%mm2       \n\t"\
            "add %2, %0             \n\t"\
            "movd (%0), %%mm3       \n\t"\
            "add %2, %0             \n\t"\
            "movd (%0), %%mm4       \n\t"\
            "add %2, %0             \n\t"\
            "punpcklbw %%mm7, %%mm0 \n\t"\
            "punpcklbw %%mm7, %%mm1 \n\t"\
            "punpcklbw %%mm7, %%mm2 \n\t"\
            "punpcklbw %%mm7, %%mm3 \n\t"\
            "punpcklbw %%mm7, %%mm4 \n\t"\
            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
            QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
            QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
             \
            : "+a"(src)\
            : "c"(tmp), "S"((x86_reg)srcStride)\
            : "memory"\
        );\
        tmp += 4;\
        src += 4 - 9*srcStride;\
    }\
    tmp -= 3*4;\
    __asm__ volatile(\
        "1:                         \n\t"\
        "movq     (%0), %%mm0       \n\t"\
        "paddw  10(%0), %%mm0       \n\t"\
        "movq    2(%0), %%mm1       \n\t"\
        "paddw   8(%0), %%mm1       \n\t"\
        "movq    4(%0), %%mm2       \n\t"\
        "paddw   6(%0), %%mm2       \n\t"\
        "psubw %%mm1, %%mm0         \n\t"/*a-b   (abccba)*/\
        "psraw $2, %%mm0            \n\t"/*(a-b)/4 */\
        "psubw %%mm1, %%mm0         \n\t"/*(a-b)/4-b */\
        "paddsw %%mm2, %%mm0        \n\t"\
        "psraw $2, %%mm0            \n\t"/*((a-b)/4-b+c)/4 */\
        "paddw %%mm2, %%mm0         \n\t"/*(a-5*b+20*c)/16 */\
        "psraw $6, %%mm0            \n\t"\
        "packuswb %%mm0, %%mm0      \n\t"\
        OP(%%mm0, (%1),%%mm7, d)\
        "add $24, %0                \n\t"\
        "add %3, %1                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        : "+a"(tmp), "+c"(dst), "+g"(h)\
        : "S"((x86_reg)dstStride)\
        : "memory"\
    );\
}\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
        "1:                         \n\t"\
        "movq    (%0), %%mm0        \n\t"\
        "movq   1(%0), %%mm2        \n\t"\
        "movq %%mm0, %%mm1          \n\t"\
        "movq %%mm2, %%mm3          \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpckhbw %%mm7, %%mm1     \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpckhbw %%mm7, %%mm3     \n\t"\
        "paddw %%mm2, %%mm0         \n\t"\
        "paddw %%mm3, %%mm1         \n\t"\
        "psllw $2, %%mm0            \n\t"\
        "psllw $2, %%mm1            \n\t"\
        "movq   -1(%0), %%mm2       \n\t"\
        "movq    2(%0), %%mm4       \n\t"\
        "movq %%mm2, %%mm3          \n\t"\
        "movq %%mm4, %%mm5          \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpckhbw %%mm7, %%mm3     \n\t"\
        "punpcklbw %%mm7, %%mm4     \n\t"\
        "punpckhbw %%mm7, %%mm5     \n\t"\
        "paddw %%mm4, %%mm2         \n\t"\
        "paddw %%mm3, %%mm5         \n\t"\
        "psubw %%mm2, %%mm0         \n\t"\
        "psubw %%mm5, %%mm1         \n\t"\
        "pmullw %%mm6, %%mm0        \n\t"\
        "pmullw %%mm6, %%mm1        \n\t"\
        "movd   -2(%0), %%mm2       \n\t"\
        "movd    7(%0), %%mm5       \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpcklbw %%mm7, %%mm5     \n\t"\
        "paddw %%mm3, %%mm2         \n\t"\
        "paddw %%mm5, %%mm4         \n\t"\
        "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
        "paddw %%mm5, %%mm2         \n\t"\
        "paddw %%mm5, %%mm4         \n\t"\
        "paddw %%mm2, %%mm0         \n\t"\
        "paddw %%mm4, %%mm1         \n\t"\
        "psraw $5, %%mm0            \n\t"\
        "psraw $5, %%mm1            \n\t"\
        "packuswb %%mm1, %%mm0      \n\t"\
        OP(%%mm0, (%1),%%mm5, q)\
        "add %3, %0                 \n\t"\
        "add %4, %1                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        : "+a"(src), "+c"(dst), "+g"(h)\
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
        : "memory"\
    );\
}\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
        "1:                         \n\t"\
        "movq    (%0), %%mm0        \n\t"\
        "movq   1(%0), %%mm2        \n\t"\
        "movq %%mm0, %%mm1          \n\t"\
        "movq %%mm2, %%mm3          \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpckhbw %%mm7, %%mm1     \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpckhbw %%mm7, %%mm3     \n\t"\
        "paddw %%mm2, %%mm0         \n\t"\
        "paddw %%mm3, %%mm1         \n\t"\
        "psllw $2, %%mm0            \n\t"\
        "psllw $2, %%mm1            \n\t"\
        "movq   -1(%0), %%mm2       \n\t"\
        "movq    2(%0), %%mm4       \n\t"\
        "movq %%mm2, %%mm3          \n\t"\
        "movq %%mm4, %%mm5          \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpckhbw %%mm7, %%mm3     \n\t"\
        "punpcklbw %%mm7, %%mm4     \n\t"\
        "punpckhbw %%mm7, %%mm5     \n\t"\
        "paddw %%mm4, %%mm2         \n\t"\
        "paddw %%mm3, %%mm5         \n\t"\
        "psubw %%mm2, %%mm0         \n\t"\
        "psubw %%mm5, %%mm1         \n\t"\
        "pmullw %%mm6, %%mm0        \n\t"\
        "pmullw %%mm6, %%mm1        \n\t"\
        "movd   -2(%0), %%mm2       \n\t"\
        "movd    7(%0), %%mm5       \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpcklbw %%mm7, %%mm5     \n\t"\
        "paddw %%mm3, %%mm2         \n\t"\
        "paddw %%mm5, %%mm4         \n\t"\
        "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
        "paddw %%mm5, %%mm2         \n\t"\
        "paddw %%mm5, %%mm4         \n\t"\
        "paddw %%mm2, %%mm0         \n\t"\
        "paddw %%mm4, %%mm1         \n\t"\
        "psraw $5, %%mm0            \n\t"\
        "psraw $5, %%mm1            \n\t"\
        "movq (%2), %%mm4           \n\t"\
        "packuswb %%mm1, %%mm0      \n\t"\
        PAVGB" %%mm4, %%mm0         \n\t"\
        OP(%%mm0, (%1),%%mm5, q)\
        "add %5, %0                 \n\t"\
        "add %5, %1                 \n\t"\
        "add %4, %2                 \n\t"\
        "decl %3                    \n\t"\
        "jg 1b                      \n\t"\
        : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
        : "memory"\
    );\
}\
\
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int w= 2;\
    src -= 2*srcStride;\
    \
    while(w--){\
      __asm__ volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "movd (%0), %%mm0           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm1           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm2           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm3           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm4           \n\t"\
        "add %2, %0                 \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpcklbw %%mm7, %%mm1     \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpcklbw %%mm7, %%mm3     \n\t"\
        "punpcklbw %%mm7, %%mm4     \n\t"\
        QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
        QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
        QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
        QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
        QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
        QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
        QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
        QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
        "cmpl $16, %4               \n\t"\
        "jne 2f                     \n\t"\
            QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
            QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
            QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
            QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
            QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
            QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
            QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
            QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
        "2:                         \n\t"\
            \
           : "+a"(src), "+c"(dst)\
           : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "g"(h)\
           : "memory"\
        );\
     src += 4-(h+5)*srcStride;\
     dst += 4-h*dstStride;\
   }\
}\
static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
    int w = (size+8)>>2;\
    src -= 2*srcStride+2;\
    while(w--){\
        __asm__ volatile(\
            "pxor %%mm7, %%mm7      \n\t"\
            "movd (%0), %%mm0       \n\t"\
            "add %2, %0             \n\t"\
            "movd (%0), %%mm1       \n\t"\
            "add %2, %0             \n\t"\
            "movd (%0), %%mm2       \n\t"\
            "add %2, %0             \n\t"\
            "movd (%0), %%mm3       \n\t"\
            "add %2, %0             \n\t"\
            "movd (%0), %%mm4       \n\t"\
            "add %2, %0             \n\t"\
            "punpcklbw %%mm7, %%mm0 \n\t"\
            "punpcklbw %%mm7, %%mm1 \n\t"\
            "punpcklbw %%mm7, %%mm2 \n\t"\
            "punpcklbw %%mm7, %%mm3 \n\t"\
            "punpcklbw %%mm7, %%mm4 \n\t"\
            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
            QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
            QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
            QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
            QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
            "cmpl $16, %3           \n\t"\
            "jne 2f                 \n\t"\
                QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1,  8*48)\
                QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2,  9*48)\
                QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
                QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
                QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
                QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
                QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
                QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
            "2:                     \n\t"\
                : "+a"(src)\
                : "c"(tmp), "S"((x86_reg)srcStride), "g"(size)\
                : "memory"\
            );\
        tmp += 4;\
        src += 4 - (size+5)*srcStride;\
    }\
}\
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
    int w = size>>4;\
    do{\
    int h = size;\
    __asm__ volatile(\
        "1:                         \n\t"\
        "movq     (%0), %%mm0       \n\t"\
        "movq    8(%0), %%mm3       \n\t"\
        "movq    2(%0), %%mm1       \n\t"\
        "movq   10(%0), %%mm4       \n\t"\
        "paddw   %%mm4, %%mm0       \n\t"\
        "paddw   %%mm3, %%mm1       \n\t"\
        "paddw  18(%0), %%mm3       \n\t"\
        "paddw  16(%0), %%mm4       \n\t"\
        "movq    4(%0), %%mm2       \n\t"\
        "movq   12(%0), %%mm5       \n\t"\
        "paddw   6(%0), %%mm2       \n\t"\
        "paddw  14(%0), %%mm5       \n\t"\
        "psubw %%mm1, %%mm0         \n\t"\
        "psubw %%mm4, %%mm3         \n\t"\
        "psraw $2, %%mm0            \n\t"\
        "psraw $2, %%mm3            \n\t"\
        "psubw %%mm1, %%mm0         \n\t"\
        "psubw %%mm4, %%mm3         \n\t"\
        "paddsw %%mm2, %%mm0        \n\t"\
        "paddsw %%mm5, %%mm3        \n\t"\
        "psraw $2, %%mm0            \n\t"\
        "psraw $2, %%mm3            \n\t"\
        "paddw %%mm2, %%mm0         \n\t"\
        "paddw %%mm5, %%mm3         \n\t"\
        "psraw $6, %%mm0            \n\t"\
        "psraw $6, %%mm3            \n\t"\
        "packuswb %%mm3, %%mm0      \n\t"\
        OP(%%mm0, (%1),%%mm7, q)\
        "add $48, %0                \n\t"\
        "add %3, %1                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        : "+a"(tmp), "+c"(dst), "+g"(h)\
        : "S"((x86_reg)dstStride)\
        : "memory"\
    );\
    tmp += 8 - size*24;\
    dst += 8 - size*dstStride;\
    }while(w--);\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
}\
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}\
\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
          put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride, 8);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride, 16);\
}\
\
static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
    __asm__ volatile(\
        "movq      (%1), %%mm0          \n\t"\
        "movq    24(%1), %%mm1          \n\t"\
        "psraw      $5,  %%mm0          \n\t"\
        "psraw      $5,  %%mm1          \n\t"\
        "packuswb %%mm0, %%mm0          \n\t"\
        "packuswb %%mm1, %%mm1          \n\t"\
        PAVGB"     (%0), %%mm0          \n\t"\
        PAVGB"  (%0,%3), %%mm1          \n\t"\
        OP(%%mm0, (%2),    %%mm4, d)\
        OP(%%mm1, (%2,%4), %%mm5, d)\
        "lea  (%0,%3,2), %0             \n\t"\
        "lea  (%2,%4,2), %2             \n\t"\
        "movq    48(%1), %%mm0          \n\t"\
        "movq    72(%1), %%mm1          \n\t"\
        "psraw      $5,  %%mm0          \n\t"\
        "psraw      $5,  %%mm1          \n\t"\
        "packuswb %%mm0, %%mm0          \n\t"\
        "packuswb %%mm1, %%mm1          \n\t"\
        PAVGB"     (%0), %%mm0          \n\t"\
        PAVGB"  (%0,%3), %%mm1          \n\t"\
        OP(%%mm0, (%2),    %%mm4, d)\
        OP(%%mm1, (%2,%4), %%mm5, d)\
        :"+a"(src8), "+c"(src16), "+d"(dst)\
        :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\
        :"memory");\
}\
static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
    do{\
    __asm__ volatile(\
        "movq      (%1), %%mm0          \n\t"\
        "movq     8(%1), %%mm1          \n\t"\
        "movq    48(%1), %%mm2          \n\t"\
        "movq  8+48(%1), %%mm3          \n\t"\
        "psraw      $5,  %%mm0          \n\t"\
        "psraw      $5,  %%mm1          \n\t"\
        "psraw      $5,  %%mm2          \n\t"\
        "psraw      $5,  %%mm3          \n\t"\
        "packuswb %%mm1, %%mm0          \n\t"\
        "packuswb %%mm3, %%mm2          \n\t"\
        PAVGB"     (%0), %%mm0          \n\t"\
        PAVGB"  (%0,%3), %%mm2          \n\t"\
        OP(%%mm0, (%2), %%mm5, q)\
        OP(%%mm2, (%2,%4), %%mm5, q)\
        ::"a"(src8), "c"(src16), "d"(dst),\
          "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
        :"memory");\
        src8 += 2L*src8Stride;\
        src16 += 48;\
        dst += 2L*dstStride;\
    }while(h-=2);\
}\
static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
    OPNAME ## pixels8_l2_shift5_ ## MMX(dst  , src16  , src8  , dstStride, src8Stride, h);\
    OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
}\

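/* Editorial sketch (not in the upstream file): the second hv pass above
 * (and the SSE2/SSSE3 variants below) avoids pmullw by reducing the six
 * horizontal taps to a = A+F, b = B+E, c = C+D and evaluating
 *     ((((a - b) >> 2) - b + c) >> 2) + c  ==  (a - 5*b + 20*c) >> 4
 * (up to truncation in the intermediate shifts), followed by psraw $6.
 * A hypothetical scalar model of one output sample: */
#if 0
static inline int h264_qpel_hv2_shift_trick(int a, int b, int c)
{
    /* multiplier-free (a - 5*b + 20*c) / 16, as in the asm above */
    return ((((a - b) >> 2) - b + c) >> 2) + c;
}
#endif
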
#if ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    int h=16;\
    __asm__ volatile(\
        "pxor %%xmm15, %%xmm15      \n\t"\
        "movdqa %6, %%xmm14         \n\t"\
        "movdqa %7, %%xmm13         \n\t"\
        "1:                         \n\t"\
        "lddqu    6(%0), %%xmm1     \n\t"\
        "lddqu   -2(%0), %%xmm7     \n\t"\
        "movdqa  %%xmm1, %%xmm0     \n\t"\
        "punpckhbw %%xmm15, %%xmm1  \n\t"\
        "punpcklbw %%xmm15, %%xmm0  \n\t"\
        "punpcklbw %%xmm15, %%xmm7  \n\t"\
        "movdqa  %%xmm1, %%xmm2     \n\t"\
        "movdqa  %%xmm0, %%xmm6     \n\t"\
        "movdqa  %%xmm1, %%xmm3     \n\t"\
        "movdqa  %%xmm0, %%xmm8     \n\t"\
        "movdqa  %%xmm1, %%xmm4     \n\t"\
        "movdqa  %%xmm0, %%xmm9     \n\t"\
        "movdqa  %%xmm0, %%xmm12    \n\t"\
        "movdqa  %%xmm1, %%xmm11    \n\t"\
        "palignr $10,%%xmm0, %%xmm11\n\t"\
        "palignr $10,%%xmm7, %%xmm12\n\t"\
        "palignr $2, %%xmm0, %%xmm4 \n\t"\
        "palignr $2, %%xmm7, %%xmm9 \n\t"\
        "palignr $4, %%xmm0, %%xmm3 \n\t"\
        "palignr $4, %%xmm7, %%xmm8 \n\t"\
        "palignr $6, %%xmm0, %%xmm2 \n\t"\
        "palignr $6, %%xmm7, %%xmm6 \n\t"\
        "paddw   %%xmm0 ,%%xmm11    \n\t"\
        "palignr $8, %%xmm0, %%xmm1 \n\t"\
        "palignr $8, %%xmm7, %%xmm0 \n\t"\
        "paddw   %%xmm12,%%xmm7     \n\t"\
        "paddw   %%xmm3, %%xmm2     \n\t"\
        "paddw   %%xmm8, %%xmm6     \n\t"\
        "paddw   %%xmm4, %%xmm1     \n\t"\
        "paddw   %%xmm9, %%xmm0     \n\t"\
        "psllw   $2,     %%xmm2     \n\t"\
        "psllw   $2,     %%xmm6     \n\t"\
        "psubw   %%xmm1, %%xmm2     \n\t"\
        "psubw   %%xmm0, %%xmm6     \n\t"\
        "paddw   %%xmm13,%%xmm11    \n\t"\
        "paddw   %%xmm13,%%xmm7     \n\t"\
        "pmullw  %%xmm14,%%xmm2     \n\t"\
        "pmullw  %%xmm14,%%xmm6     \n\t"\
        "lddqu   (%2),   %%xmm3     \n\t"\
        "paddw   %%xmm11,%%xmm2     \n\t"\
        "paddw   %%xmm7, %%xmm6     \n\t"\
        "psraw   $5,     %%xmm2     \n\t"\
        "psraw   $5,     %%xmm6     \n\t"\
        "packuswb %%xmm2,%%xmm6     \n\t"\
        "pavgb   %%xmm3, %%xmm6     \n\t"\
        OP(%%xmm6, (%1), %%xmm4, dqa)\
        "add %5, %0                 \n\t"\
        "add %5, %1                 \n\t"\
        "add %4, %2                 \n\t"\
        "decl %3                    \n\t"\
        "jg 1b                      \n\t"\
        : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
          "m"(ff_pw_5), "m"(ff_pw_16)\
        : "memory"\
    );\
}
#else // ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}
#endif // ARCH_X86_64

#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%xmm7, %%xmm7        \n\t"\
        "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\
        "1:                         \n\t"\
        "lddqu   -2(%0), %%xmm1     \n\t"\
        "movdqa  %%xmm1, %%xmm0     \n\t"\
        "punpckhbw %%xmm7, %%xmm1   \n\t"\
        "punpcklbw %%xmm7, %%xmm0   \n\t"\
        "movdqa  %%xmm1, %%xmm2     \n\t"\
        "movdqa  %%xmm1, %%xmm3     \n\t"\
        "movdqa  %%xmm1, %%xmm4     \n\t"\
        "movdqa  %%xmm1, %%xmm5     \n\t"\
        "palignr $2, %%xmm0, %%xmm4 \n\t"\
        "palignr $4, %%xmm0, %%xmm3 \n\t"\
        "palignr $6, %%xmm0, %%xmm2 \n\t"\
        "palignr $8, %%xmm0, %%xmm1 \n\t"\
        "palignr $10,%%xmm0, %%xmm5 \n\t"\
        "paddw   %%xmm5, %%xmm0     \n\t"\
        "paddw   %%xmm3, %%xmm2     \n\t"\
        "paddw   %%xmm4, %%xmm1     \n\t"\
        "psllw   $2,     %%xmm2     \n\t"\
        "movq    (%2),   %%xmm3     \n\t"\
        "psubw   %%xmm1, %%xmm2     \n\t"\
        "paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\
        "pmullw  %%xmm6, %%xmm2     \n\t"\
        "paddw   %%xmm0, %%xmm2     \n\t"\
        "psraw   $5,     %%xmm2     \n\t"\
        "packuswb %%xmm2, %%xmm2    \n\t"\
        "pavgb   %%xmm3, %%xmm2     \n\t"\
        OP(%%xmm2, (%1), %%xmm4, q)\
        "add %5, %0                 \n\t"\
        "add %5, %1                 \n\t"\
        "add %4, %2                 \n\t"\
        "decl %3                    \n\t"\
        "jg 1b                      \n\t"\
        : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
        : "memory"\
    );\
}\
QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%xmm7, %%xmm7        \n\t"\
        "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\
        "1:                         \n\t"\
        "lddqu   -2(%0), %%xmm1     \n\t"\
        "movdqa  %%xmm1, %%xmm0     \n\t"\
        "punpckhbw %%xmm7, %%xmm1   \n\t"\
        "punpcklbw %%xmm7, %%xmm0   \n\t"\
        "movdqa  %%xmm1, %%xmm2     \n\t"\
        "movdqa  %%xmm1, %%xmm3     \n\t"\
        "movdqa  %%xmm1, %%xmm4     \n\t"\
        "movdqa  %%xmm1, %%xmm5     \n\t"\
        "palignr $2, %%xmm0, %%xmm4 \n\t"\
        "palignr $4, %%xmm0, %%xmm3 \n\t"\
        "palignr $6, %%xmm0, %%xmm2 \n\t"\
        "palignr $8, %%xmm0, %%xmm1 \n\t"\
        "palignr $10,%%xmm0, %%xmm5 \n\t"\
        "paddw   %%xmm5, %%xmm0     \n\t"\
        "paddw   %%xmm3, %%xmm2     \n\t"\
        "paddw   %%xmm4, %%xmm1     \n\t"\
        "psllw   $2,     %%xmm2     \n\t"\
        "psubw   %%xmm1, %%xmm2     \n\t"\
        "paddw   "MANGLE(ff_pw_16)", %%xmm0\n\t"\
        "pmullw  %%xmm6, %%xmm2     \n\t"\
        "paddw   %%xmm0, %%xmm2     \n\t"\
        "psraw   $5,     %%xmm2     \n\t"\
        "packuswb %%xmm2, %%xmm2    \n\t"\
        OP(%%xmm2, (%1), %%xmm4, q)\
        "add %3, %0                 \n\t"\
        "add %4, %1                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        : "+a"(src), "+c"(dst), "+g"(h)\
        : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
        : "memory"\
    );\
}\
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\

#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    src -= 2*srcStride;\
    \
    __asm__ volatile(\
        "pxor %%xmm7, %%xmm7        \n\t"\
        "movq (%0), %%xmm0          \n\t"\
        "add %2, %0                 \n\t"\
        "movq (%0), %%xmm1          \n\t"\
        "add %2, %0                 \n\t"\
        "movq (%0), %%xmm2          \n\t"\
        "add %2, %0                 \n\t"\
        "movq (%0), %%xmm3          \n\t"\
        "add %2, %0                 \n\t"\
        "movq (%0), %%xmm4          \n\t"\
        "add %2, %0                 \n\t"\
        "punpcklbw %%xmm7, %%xmm0   \n\t"\
        "punpcklbw %%xmm7, %%xmm1   \n\t"\
        "punpcklbw %%xmm7, %%xmm2   \n\t"\
        "punpcklbw %%xmm7, %%xmm3   \n\t"\
        "punpcklbw %%xmm7, %%xmm4   \n\t"\
        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
        QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
        QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
        QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
        QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
        "cmpl $16, %4               \n\t"\
        "jne 2f                     \n\t"\
            QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
            QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
            QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
            QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
            QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
            QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
            QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
            QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
        "2:                          \n\t"\
            \
            : "+a"(src), "+c"(dst)\
            : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "g"(h)\
            : "memory"\
        );\
}\
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
}\
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}

static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
    int w = (size+8)>>3;
    src -= 2*srcStride+2;
    while(w--){
        __asm__ volatile(
            "pxor %%xmm7, %%xmm7        \n\t"
            "movq (%0), %%xmm0          \n\t"
            "add %2, %0                 \n\t"
            "movq (%0), %%xmm1          \n\t"
            "add %2, %0                 \n\t"
            "movq (%0), %%xmm2          \n\t"
            "add %2, %0                 \n\t"
            "movq (%0), %%xmm3          \n\t"
            "add %2, %0                 \n\t"
            "movq (%0), %%xmm4          \n\t"
            "add %2, %0                 \n\t"
            "punpcklbw %%xmm7, %%xmm0   \n\t"
            "punpcklbw %%xmm7, %%xmm1   \n\t"
            "punpcklbw %%xmm7, %%xmm2   \n\t"
            "punpcklbw %%xmm7, %%xmm3   \n\t"
            "punpcklbw %%xmm7, %%xmm4   \n\t"
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48)
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
            QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
            QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
            QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
            QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
            "cmpl $16, %3               \n\t"
            "jne 2f                     \n\t"
                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1,  8*48)
                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2,  9*48)
                QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
                QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
                QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
                QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
            "2:                         \n\t"
                : "+a"(src)
                : "c"(tmp), "S"((x86_reg)srcStride), "g"(size)
                : "memory"
            );
        tmp += 8;
        src += 8 - (size+5)*srcStride;
    }
}
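
/* Note (editorial; not in the upstream file): the hv interpolation runs in
 * two passes.  Pass 1 (*_hv1_lowpass_*) filters vertically and stores
 * unclipped 16-bit sums, offset by +16, into tmp; the row pitch is
 * hardcoded (24 bytes for the 4x4 case, 48 bytes here), so tmpStride is
 * effectively unused.  Pass 2 (*_hv2_lowpass_*) filters tmp horizontally
 * and its final "psraw $6", together with the folded offsets, realizes the
 * spec's (x + 512) >> 10 normalization. */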

#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
    int h = size;\
    if(size == 16){\
        __asm__ volatile(\
            "1:                         \n\t"\
            "movdqa 32(%0), %%xmm4      \n\t"\
            "movdqa 16(%0), %%xmm5      \n\t"\
            "movdqa   (%0), %%xmm7      \n\t"\
            "movdqa %%xmm4, %%xmm3      \n\t"\
            "movdqa %%xmm4, %%xmm2      \n\t"\
            "movdqa %%xmm4, %%xmm1      \n\t"\
            "movdqa %%xmm4, %%xmm0      \n\t"\
            "palignr $10, %%xmm5, %%xmm0 \n\t"\
            "palignr  $8, %%xmm5, %%xmm1 \n\t"\
            "palignr  $6, %%xmm5, %%xmm2 \n\t"\
            "palignr  $4, %%xmm5, %%xmm3 \n\t"\
            "palignr  $2, %%xmm5, %%xmm4 \n\t"\
            "paddw  %%xmm5, %%xmm0      \n\t"\
            "paddw  %%xmm4, %%xmm1      \n\t"\
            "paddw  %%xmm3, %%xmm2      \n\t"\
            "movdqa %%xmm5, %%xmm6      \n\t"\
            "movdqa %%xmm5, %%xmm4      \n\t"\
            "movdqa %%xmm5, %%xmm3      \n\t"\
            "palignr  $8, %%xmm7, %%xmm4 \n\t"\
            "palignr  $2, %%xmm7, %%xmm6 \n\t"\
            "palignr $10, %%xmm7, %%xmm3 \n\t"\
            "paddw  %%xmm6, %%xmm4      \n\t"\
            "movdqa %%xmm5, %%xmm6      \n\t"\
            "palignr  $6, %%xmm7, %%xmm5 \n\t"\
            "palignr  $4, %%xmm7, %%xmm6 \n\t"\
            "paddw  %%xmm7, %%xmm3      \n\t"\
            "paddw  %%xmm6, %%xmm5      \n\t"\
            \
            "psubw  %%xmm1, %%xmm0      \n\t"\
            "psubw  %%xmm4, %%xmm3      \n\t"\
            "psraw      $2, %%xmm0      \n\t"\
            "psraw      $2, %%xmm3      \n\t"\
            "psubw  %%xmm1, %%xmm0      \n\t"\
            "psubw  %%xmm4, %%xmm3      \n\t"\
            "paddw  %%xmm2, %%xmm0      \n\t"\
            "paddw  %%xmm5, %%xmm3      \n\t"\
            "psraw      $2, %%xmm0      \n\t"\
            "psraw      $2, %%xmm3      \n\t"\
            "paddw  %%xmm2, %%xmm0      \n\t"\
            "paddw  %%xmm5, %%xmm3      \n\t"\
            "psraw      $6, %%xmm0      \n\t"\
            "psraw      $6, %%xmm3      \n\t"\
            "packuswb %%xmm0, %%xmm3    \n\t"\
            OP(%%xmm3, (%1), %%xmm7, dqa)\
            "add $48, %0                \n\t"\
            "add %3, %1                 \n\t"\
            "decl %2                    \n\t"\
            " jnz 1b                    \n\t"\
            : "+a"(tmp), "+c"(dst), "+g"(h)\
            : "S"((x86_reg)dstStride)\
            : "memory"\
        );\
    }else{\
        __asm__ volatile(\
            "1:                         \n\t"\
            "movdqa 16(%0), %%xmm1      \n\t"\
            "movdqa   (%0), %%xmm0      \n\t"\
            "movdqa %%xmm1, %%xmm2      \n\t"\
            "movdqa %%xmm1, %%xmm3      \n\t"\
            "movdqa %%xmm1, %%xmm4      \n\t"\
            "movdqa %%xmm1, %%xmm5      \n\t"\
            "palignr $10, %%xmm0, %%xmm5 \n\t"\
            "palignr  $8, %%xmm0, %%xmm4 \n\t"\
            "palignr  $6, %%xmm0, %%xmm3 \n\t"\
            "palignr  $4, %%xmm0, %%xmm2 \n\t"\
            "palignr  $2, %%xmm0, %%xmm1 \n\t"\
            "paddw  %%xmm5, %%xmm0      \n\t"\
            "paddw  %%xmm4, %%xmm1      \n\t"\
            "paddw  %%xmm3, %%xmm2      \n\t"\
            "psubw  %%xmm1, %%xmm0      \n\t"\
            "psraw      $2, %%xmm0      \n\t"\
            "psubw  %%xmm1, %%xmm0      \n\t"\
            "paddw  %%xmm2, %%xmm0      \n\t"\
            "psraw      $2, %%xmm0      \n\t"\
            "paddw  %%xmm2, %%xmm0      \n\t"\
            "psraw      $6, %%xmm0      \n\t"\
            "packuswb %%xmm0, %%xmm0    \n\t"\
            OP(%%xmm0, (%1), %%xmm7, q)\
            "add $48, %0                \n\t"\
            "add %3, %1                 \n\t"\
            "decl %2                    \n\t"\
            " jnz 1b                    \n\t"\
            : "+a"(tmp), "+c"(dst), "+g"(h)\
            : "S"((x86_reg)dstStride)\
            : "memory"\
        );\
    }\
}

#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
          put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}\

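/* Note (editorial; not in the upstream file): pass 1 of the hv lowpass is
 * always the put_ variant regardless of OPNAME -- the 16-bit intermediate
 * in tmp is freshly written each time, so put/avg only differs in the
 * final store of pass 2. */
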
#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2
#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2
#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2
#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2
#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2

#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2
#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2

#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2
#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2
#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2
#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2

#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2
#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2

#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2
#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2
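
/* Note (editorial; not in the upstream file): the aliases above route
 * cases without a dedicated SSE2/SSSE3 implementation to existing code,
 * e.g. the l2/shift5 averaging helpers reuse the MMX2 versions and the
 * SSSE3 vertical lowpass reuses the SSE2 one. */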

#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\

static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_sse2(dst, src, stride, 16);
}
static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_sse2(dst, src, stride, 16);
}
#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2

#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
}\

#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
}\

#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
}\

#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}\

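/* Note (editorial; not in the upstream file): in the mcXY functions the
 * H264_MC_* macros generate, X and Y are the horizontal and vertical
 * quarter-pel offsets: mc00 is the full-pel copy, mc20/mc02 the horizontal/
 * vertical half-pel positions, mc22 the 2D half-pel position, and the
 * remaining positions are built by averaging the appropriate planes. */
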
#define H264_MC_4816(MMX)\
H264_MC(put_, 4, MMX, 8)\
H264_MC(put_, 8, MMX, 8)\
H264_MC(put_, 16,MMX, 8)\
H264_MC(avg_, 4, MMX, 8)\
H264_MC(avg_, 8, MMX, 8)\
H264_MC(avg_, 16,MMX, 8)\

#define H264_MC_816(QPEL, XMM)\
QPEL(put_, 8, XMM, 16)\
QPEL(put_, 16,XMM, 16)\
QPEL(avg_, 8, XMM, 16)\
QPEL(avg_, 16,XMM, 16)\


#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgusb " #temp ", " #a "        \n\t"\
"mov" #size " " #a ", " #b "      \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgb " #temp ", " #a "          \n\t"\
"mov" #size " " #a ", " #b "      \n\t"

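/* Note (editorial; not in the upstream file): PAVGB is redefined around
 * each instantiation below so the same QPEL_H264 body expands once with
 * pavgusb (3DNow!) and once with pavgb (MMX2). */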
#define PAVGB "pavgusb"
QPEL_H264(put_,       PUT_OP, 3dnow)
QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
#undef PAVGB
#define PAVGB "pavgb"
QPEL_H264(put_,       PUT_OP, mmx2)
QPEL_H264(avg_,  AVG_MMX2_OP, mmx2)
QPEL_H264_V_XMM(put_,       PUT_OP, sse2)
QPEL_H264_V_XMM(avg_,  AVG_MMX2_OP, sse2)
QPEL_H264_HV_XMM(put_,       PUT_OP, sse2)
QPEL_H264_HV_XMM(avg_,  AVG_MMX2_OP, sse2)
#if HAVE_SSSE3
QPEL_H264_H_XMM(put_,       PUT_OP, ssse3)
QPEL_H264_H_XMM(avg_,  AVG_MMX2_OP, ssse3)
QPEL_H264_HV2_XMM(put_,       PUT_OP, ssse3)
QPEL_H264_HV2_XMM(avg_,  AVG_MMX2_OP, ssse3)
QPEL_H264_HV_XMM(put_,       PUT_OP, ssse3)
QPEL_H264_HV_XMM(avg_,  AVG_MMX2_OP, ssse3)
#endif
#undef PAVGB

H264_MC_4816(3dnow)
H264_MC_4816(mmx2)
H264_MC_816(H264_MC_V, sse2)
H264_MC_816(H264_MC_HV, sse2)
#if HAVE_SSSE3
H264_MC_816(H264_MC_H, ssse3)
H264_MC_816(H264_MC_HV, ssse3)
#endif