ffmpeg / libavcodec / i386 / cavsdsp_mmx.c @ b550bfaa
/*
 * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
 * Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de>
 *
 * MMX optimised DSP functions, based on H.264 optimisations by
 * Michael Niedermayer and Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "dsputil.h"
#include "common.h"

DECLARE_ALIGNED_8(static const uint64_t,ff_pw_4 ) = 0x0004000400040004ULL;
DECLARE_ALIGNED_8(static const uint64_t,ff_pw_5 ) = 0x0005000500050005ULL;
DECLARE_ALIGNED_8(static const uint64_t,ff_pw_7 ) = 0x0007000700070007ULL;
DECLARE_ALIGNED_8(static const uint64_t,ff_pw_42) = 0x002A002A002A002AULL;
DECLARE_ALIGNED_8(static const uint64_t,ff_pw_64) = 0x0040004000400040ULL;
DECLARE_ALIGNED_8(static const uint64_t,ff_pw_96) = 0x0060006000600060ULL;

/*****************************************************************************
 *
 * inverse transform
 *
 ****************************************************************************/

#define SUMSUB_BA( a, b ) \
    "paddw "#b", "#a" \n\t"\
    "paddw "#b", "#b" \n\t"\
    "psubw "#a", "#b" \n\t"

#define SBUTTERFLY(a,b,t,n)\
    "movq " #a ", " #t "              \n\t" /* abcd */\
    "punpckl" #n " " #b ", " #a "     \n\t" /* aebf */\
    "punpckh" #n " " #b ", " #t "     \n\t" /* cgdh */

#define TRANSPOSE4(a,b,c,d,t)\
    SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
    SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
    SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
    SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */
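
/* TRANSPOSE4 transposes a 4x4 tile of 16-bit words held in a, b, c, d.
 * The rows of the result come back permuted across a, d, t and c (in
 * that order), which is why the callers below store from the registers
 * in a non-obvious sequence. */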

static inline void cavs_idct8_1d(int16_t *block, uint64_t bias)
{
    asm volatile(
        "movq 112(%0), %%mm4  \n\t" /* mm4 = src7 */
        "movq  16(%0), %%mm5  \n\t" /* mm5 = src1 */
        "movq  80(%0), %%mm2  \n\t" /* mm2 = src5 */
        "movq  48(%0), %%mm7  \n\t" /* mm7 = src3 */
        "movq  %%mm4, %%mm0   \n\t"
        "movq  %%mm5, %%mm3   \n\t"
        "movq  %%mm2, %%mm6   \n\t"
        "movq  %%mm7, %%mm1   \n\t"

        "paddw %%mm4, %%mm4   \n\t" /* mm4 = 2*src7 */
        "paddw %%mm3, %%mm3   \n\t" /* mm3 = 2*src1 */
        "paddw %%mm6, %%mm6   \n\t" /* mm6 = 2*src5 */
        "paddw %%mm1, %%mm1   \n\t" /* mm1 = 2*src3 */
        "paddw %%mm4, %%mm0   \n\t" /* mm0 = 3*src7 */
        "paddw %%mm3, %%mm5   \n\t" /* mm5 = 3*src1 */
        "paddw %%mm6, %%mm2   \n\t" /* mm2 = 3*src5 */
        "paddw %%mm1, %%mm7   \n\t" /* mm7 = 3*src3 */
        "psubw %%mm4, %%mm5   \n\t" /* mm5 = 3*src1 - 2*src7 = a0 */
        "paddw %%mm6, %%mm7   \n\t" /* mm7 = 3*src3 + 2*src5 = a1 */
        "psubw %%mm2, %%mm1   \n\t" /* mm1 = 2*src3 - 3*src5 = a2 */
        "paddw %%mm0, %%mm3   \n\t" /* mm3 = 2*src1 + 3*src7 = a3 */

        "movq  %%mm5, %%mm4   \n\t"
        "movq  %%mm7, %%mm6   \n\t"
        "movq  %%mm3, %%mm0   \n\t"
        "movq  %%mm1, %%mm2   \n\t"
        SUMSUB_BA( %%mm7, %%mm5 )   /* mm7 = a0 + a1  mm5 = a0 - a1 */
        "paddw %%mm3, %%mm7   \n\t" /* mm7 = a0 + a1 + a3 */
        "paddw %%mm1, %%mm5   \n\t" /* mm5 = a0 - a1 + a2 */
        "paddw %%mm7, %%mm7   \n\t"
        "paddw %%mm5, %%mm5   \n\t"
        "paddw %%mm6, %%mm7   \n\t" /* mm7 = b4 */
        "paddw %%mm4, %%mm5   \n\t" /* mm5 = b5 */

        SUMSUB_BA( %%mm1, %%mm3 )   /* mm1 = a3 + a2  mm3 = a3 - a2 */
        "psubw %%mm1, %%mm4   \n\t" /* mm4 = a0 - a2 - a3 */
        "movq  %%mm4, %%mm1   \n\t" /* mm1 = a0 - a2 - a3 */
        "psubw %%mm6, %%mm3   \n\t" /* mm3 = a3 - a2 - a1 */
        "paddw %%mm1, %%mm1   \n\t"
        "paddw %%mm3, %%mm3   \n\t"
        "psubw %%mm2, %%mm1   \n\t" /* mm1 = b7 */
        "paddw %%mm0, %%mm3   \n\t" /* mm3 = b6 */

        "movq  32(%0), %%mm2  \n\t" /* mm2 = src2 */
        "movq  96(%0), %%mm6  \n\t" /* mm6 = src6 */
        "movq  %%mm2, %%mm4   \n\t"
        "movq  %%mm6, %%mm0   \n\t"
        "psllw $2, %%mm4      \n\t" /* mm4 = 4*src2 */
        "psllw $2, %%mm6      \n\t" /* mm6 = 4*src6 */
        "paddw %%mm4, %%mm2   \n\t" /* mm2 = 5*src2 */
        "paddw %%mm6, %%mm0   \n\t" /* mm0 = 5*src6 */
        "paddw %%mm2, %%mm2   \n\t"
        "paddw %%mm0, %%mm0   \n\t"
        "psubw %%mm0, %%mm4   \n\t" /* mm4 = 4*src2 - 10*src6 = a7 */
        "paddw %%mm2, %%mm6   \n\t" /* mm6 = 4*src6 + 10*src2 = a6 */

        "movq    (%0), %%mm2  \n\t" /* mm2 = src0 */
        "movq  64(%0), %%mm0  \n\t" /* mm0 = src4 */
        SUMSUB_BA( %%mm0, %%mm2 )   /* mm0 = src0 + src4  mm2 = src0 - src4 */
        "psllw $3, %%mm0      \n\t"
        "psllw $3, %%mm2      \n\t"
        "paddw %1, %%mm0      \n\t" /* add rounding bias */
        "paddw %1, %%mm2      \n\t" /* add rounding bias */

        SUMSUB_BA( %%mm6, %%mm0 )   /* mm6 = a4 + a6  mm0 = a4 - a6 */
        SUMSUB_BA( %%mm4, %%mm2 )   /* mm4 = a5 + a7  mm2 = a5 - a7 */
        SUMSUB_BA( %%mm7, %%mm6 )   /* mm7 = dst0  mm6 = dst7 */
        SUMSUB_BA( %%mm5, %%mm4 )   /* mm5 = dst1  mm4 = dst6 */
        SUMSUB_BA( %%mm3, %%mm2 )   /* mm3 = dst2  mm2 = dst5 */
        SUMSUB_BA( %%mm1, %%mm0 )   /* mm1 = dst3  mm0 = dst4 */
        :: "r"(block), "m"(bias)
    );
}
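
/* For reference, a scalar sketch of the 1-D pass above, reconstructed
 * from the register comments. It is a hypothetical helper, not part of
 * the original file, and is kept out of the build: b[0..7] stand for
 * the eight inputs of one transform, "bias" for the packed rounding
 * bias, and the final right shift is applied by the caller. */
#if 0
static void cavs_idct8_1d_ref(int16_t *b, int bias)
{
    /* odd part: taps 2 and 3 on inputs 1, 3, 5, 7 */
    const int a0 = 3*b[1] - 2*b[7];
    const int a1 = 3*b[3] + 2*b[5];
    const int a2 = 2*b[3] - 3*b[5];
    const int a3 = 2*b[1] + 3*b[7];

    const int b4 = 2*(a0 + a1 + a3) + a1;
    const int b5 = 2*(a0 - a1 + a2) + a0;
    const int b6 = 2*(a3 - a2 - a1) + a3;
    const int b7 = 2*(a0 - a2 - a3) - a2;

    /* even part: inputs 0, 2, 4, 6, with the rounding bias folded in */
    const int a7 = 4*b[2] - 10*b[6];
    const int a6 = 4*b[6] + 10*b[2];
    const int a5 = 8*(b[0] - b[4]) + bias;
    const int a4 = 8*(b[0] + b[4]) + bias;

    /* final butterflies; the caller shifts the results right by 3 or 7 */
    b[0] = (a4 + a6) + b4;    b[7] = (a4 + a6) - b4;
    b[1] = (a5 + a7) + b5;    b[6] = (a5 + a7) - b5;
    b[2] = (a5 - a7) + b6;    b[5] = (a5 - a7) - b6;
    b[3] = (a4 - a6) + b7;    b[4] = (a4 - a6) - b7;
}
#endif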

static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
{
    int i;
    DECLARE_ALIGNED_8(int16_t, b2[64]);

    for(i=0; i<2; i++){
        DECLARE_ALIGNED_8(uint64_t, tmp);

        cavs_idct8_1d(block+4*i, ff_pw_4);

        asm volatile(
            "psraw $3, %%mm7  \n\t"
            "psraw $3, %%mm6  \n\t"
            "psraw $3, %%mm5  \n\t"
            "psraw $3, %%mm4  \n\t"
            "psraw $3, %%mm3  \n\t"
            "psraw $3, %%mm2  \n\t"
            "psraw $3, %%mm1  \n\t"
            "psraw $3, %%mm0  \n\t"
            "movq  %%mm7, %0  \n\t"
            TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
            "movq  %%mm0,  8(%1)  \n\t"
            "movq  %%mm6, 24(%1)  \n\t"
            "movq  %%mm7, 40(%1)  \n\t"
            "movq  %%mm4, 56(%1)  \n\t"
            "movq  %0, %%mm7      \n\t"
            TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
            "movq  %%mm7,   (%1)  \n\t"
            "movq  %%mm1, 16(%1)  \n\t"
            "movq  %%mm0, 32(%1)  \n\t"
            "movq  %%mm3, 48(%1)  \n\t"
            : "=m"(tmp)
            : "r"(b2+32*i)
            : "memory"
        );
    }

    for(i=0; i<2; i++){
        cavs_idct8_1d(b2+4*i, ff_pw_64);

        asm volatile(
            "psraw $7, %%mm7  \n\t"
            "psraw $7, %%mm6  \n\t"
            "psraw $7, %%mm5  \n\t"
            "psraw $7, %%mm4  \n\t"
            "psraw $7, %%mm3  \n\t"
            "psraw $7, %%mm2  \n\t"
            "psraw $7, %%mm1  \n\t"
            "psraw $7, %%mm0  \n\t"
            "movq  %%mm7,    (%0)  \n\t"
            "movq  %%mm5,  16(%0)  \n\t"
            "movq  %%mm3,  32(%0)  \n\t"
            "movq  %%mm1,  48(%0)  \n\t"
            "movq  %%mm0,  64(%0)  \n\t"
            "movq  %%mm2,  80(%0)  \n\t"
            "movq  %%mm4,  96(%0)  \n\t"
            "movq  %%mm6, 112(%0)  \n\t"
            :: "r"(b2+4*i)
            : "memory"
        );
    }

    add_pixels_clamped_mmx(b2, dst, stride);

    /* clear block */
    asm volatile(
        "pxor %%mm7, %%mm7   \n\t"
        "movq %%mm7,    (%0) \n\t"
        "movq %%mm7,   8(%0) \n\t"
        "movq %%mm7,  16(%0) \n\t"
        "movq %%mm7,  24(%0) \n\t"
        "movq %%mm7,  32(%0) \n\t"
        "movq %%mm7,  40(%0) \n\t"
        "movq %%mm7,  48(%0) \n\t"
        "movq %%mm7,  56(%0) \n\t"
        "movq %%mm7,  64(%0) \n\t"
        "movq %%mm7,  72(%0) \n\t"
        "movq %%mm7,  80(%0) \n\t"
        "movq %%mm7,  88(%0) \n\t"
        "movq %%mm7,  96(%0) \n\t"
        "movq %%mm7, 104(%0) \n\t"
        "movq %%mm7, 112(%0) \n\t"
        "movq %%mm7, 120(%0) \n\t"
        :: "r" (block)
    );
}
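
/* Both passes reuse cavs_idct8_1d(): the first runs it with a bias of
 * ff_pw_4 and shifts the results right by 3, the second with ff_pw_64
 * and a shift of 7, i.e. the rounded divisions (x + 4) >> 3 and
 * (x + 64) >> 7. The transpose between the passes goes through the
 * aligned temporary b2, and the coefficient block is cleared at the
 * end, as the /* clear block * / section above shows. */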

/*****************************************************************************
 *
 * motion compensation
 *
 ****************************************************************************/

/* vertical filter [-1 -2 96 42 -7  0] */
#define QPEL_CAVSV1(A,B,C,D,E,F,OP) \
    "movd (%0), "#F"            \n\t"\
    "movq "#C", %%mm6           \n\t"\
    "pmullw %5, %%mm6           \n\t"\
    "movq "#D", %%mm7           \n\t"\
    "pmullw %6, %%mm7           \n\t"\
    "psllw $3, "#E"             \n\t"\
    "psubw "#E", %%mm6          \n\t"\
    "psraw $3, "#E"             \n\t"\
    "paddw %%mm7, %%mm6         \n\t"\
    "paddw "#E", %%mm6          \n\t"\
    "paddw "#B", "#B"           \n\t"\
    "pxor %%mm7, %%mm7          \n\t"\
    "add %2, %0                 \n\t"\
    "punpcklbw %%mm7, "#F"      \n\t"\
    "psubw "#B", %%mm6          \n\t"\
    "psraw $1, "#B"             \n\t"\
    "psubw "#A", %%mm6          \n\t"\
    "paddw %4, %%mm6            \n\t"\
    "psraw $7, %%mm6            \n\t"\
    "packuswb %%mm6, %%mm6      \n\t"\
    OP(%%mm6, (%1), A, d)            \
    "add %3, %1                 \n\t"

/* vertical filter [ 0 -1  5  5 -1  0] */
#define QPEL_CAVSV2(A,B,C,D,E,F,OP) \
    "movd (%0), "#F"            \n\t"\
    "movq "#C", %%mm6           \n\t"\
    "paddw "#D", %%mm6          \n\t"\
    "pmullw %5, %%mm6           \n\t"\
    "add %2, %0                 \n\t"\
    "punpcklbw %%mm7, "#F"      \n\t"\
    "psubw "#B", %%mm6          \n\t"\
    "psubw "#E", %%mm6          \n\t"\
    "paddw %4, %%mm6            \n\t"\
    "psraw $3, %%mm6            \n\t"\
    "packuswb %%mm6, %%mm6      \n\t"\
    OP(%%mm6, (%1), A, d)            \
    "add %3, %1                 \n\t"

/* vertical filter [ 0 -7 42 96 -2 -1] */
#define QPEL_CAVSV3(A,B,C,D,E,F,OP) \
    "movd (%0), "#F"            \n\t"\
    "movq "#C", %%mm6           \n\t"\
    "pmullw %6, %%mm6           \n\t"\
    "movq "#D", %%mm7           \n\t"\
    "pmullw %5, %%mm7           \n\t"\
    "psllw $3, "#B"             \n\t"\
    "psubw "#B", %%mm6          \n\t"\
    "psraw $3, "#B"             \n\t"\
    "paddw %%mm7, %%mm6         \n\t"\
    "paddw "#B", %%mm6          \n\t"\
    "paddw "#E", "#E"           \n\t"\
    "pxor %%mm7, %%mm7          \n\t"\
    "add %2, %0                 \n\t"\
    "punpcklbw %%mm7, "#F"      \n\t"\
    "psubw "#E", %%mm6          \n\t"\
    "psraw $1, "#E"             \n\t"\
    "psubw "#F", %%mm6          \n\t"\
    "paddw %4, %%mm6            \n\t"\
    "psraw $7, %%mm6            \n\t"\
    "packuswb %%mm6, %%mm6      \n\t"\
    OP(%%mm6, (%1), A, d)            \
    "add %3, %1                 \n\t"


#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\
    int w= 2;\
    src -= 2*srcStride;\
    \
    while(w--){\
      asm volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "movd (%0), %%mm0           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm1           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm2           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm3           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm4           \n\t"\
        "add %2, %0                 \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpcklbw %%mm7, %%mm1     \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpcklbw %%mm7, %%mm3     \n\t"\
        "punpcklbw %%mm7, %%mm4     \n\t"\
        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
        VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
        VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
        VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
        VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
        \
        : "+a"(src), "+c"(dst)\
        : "S"((long)srcStride), "D"((long)dstStride), "m"(ADD), "m"(MUL1), "m"(MUL2)\
        : "memory"\
      );\
      if(h==16){\
        asm volatile(\
            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
            VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
            VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
            VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
            VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
            \
            : "+a"(src), "+c"(dst)\
            : "S"((long)srcStride), "D"((long)dstStride), "m"(ADD), "m"(MUL1), "m"(MUL2)\
            : "memory"\
        );\
      }\
      src += 4-(h+5)*srcStride;\
      dst += 4-h*dstStride;\
    }
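
/* The three vertical kernels are the quarter-pel filters named in the
 * comments above: v1 = [-1 -2 96 42 -7 0] and v3 = [0 -7 42 96 -2 -1]
 * round with +64 and shift right by 7, while v2 = [0 -1 5 5 -1 0]
 * rounds with +4 and shifts by 3. QPEL_CAVSVNUM primes mm0-mm4 with
 * five source rows and then rotates the register names through VOP, so
 * each step loads exactly one new row into its F argument as the
 * six-row window slides down. A scalar sketch of the v1 case follows
 * (a hypothetical helper derived from the taps, not part of the
 * original file, kept out of the build; clip_uint8() stands for
 * saturation to 0..255): */
#if 0
static void cavs_qpel_v1_ref(uint8_t *dst, const uint8_t *src,
                             int dstStride, int srcStride, int w, int h)
{
    int x, y;
    for (y = 0; y < h; y++) {
        for (x = 0; x < w; x++) {
            const int d = -    src[x - 2*srcStride]
                          -  2*src[x -   srcStride]
                          + 96*src[x]
                          + 42*src[x +   srcStride]
                          -  7*src[x + 2*srcStride];
            dst[x] = clip_uint8((d + 64) >> 7); /* round, shift, saturate */
        }
        src += srcStride;
        dst += dstStride;
    }
}
#endif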

#define QPEL_CAVS(OPNAME, OP, MMX)\
static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h=8;\
    asm volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "movq %5, %%mm6             \n\t"\
        "1:                         \n\t"\
        "movq    (%0), %%mm0        \n\t"\
        "movq   1(%0), %%mm2        \n\t"\
        "movq %%mm0, %%mm1          \n\t"\
        "movq %%mm2, %%mm3          \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpckhbw %%mm7, %%mm1     \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpckhbw %%mm7, %%mm3     \n\t"\
        "paddw %%mm2, %%mm0         \n\t"\
        "paddw %%mm3, %%mm1         \n\t"\
        "pmullw %%mm6, %%mm0        \n\t"\
        "pmullw %%mm6, %%mm1        \n\t"\
        "movq  -1(%0), %%mm2        \n\t"\
        "movq   2(%0), %%mm4        \n\t"\
        "movq %%mm2, %%mm3          \n\t"\
        "movq %%mm4, %%mm5          \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpckhbw %%mm7, %%mm3     \n\t"\
        "punpcklbw %%mm7, %%mm4     \n\t"\
        "punpckhbw %%mm7, %%mm5     \n\t"\
        "paddw %%mm4, %%mm2         \n\t"\
        "paddw %%mm3, %%mm5         \n\t"\
        "psubw %%mm2, %%mm0         \n\t"\
        "psubw %%mm5, %%mm1         \n\t"\
        "movq %6, %%mm5             \n\t"\
        "paddw %%mm5, %%mm0         \n\t"\
        "paddw %%mm5, %%mm1         \n\t"\
        "psraw $3, %%mm0            \n\t"\
        "psraw $3, %%mm1            \n\t"\
        "packuswb %%mm1, %%mm0      \n\t"\
        OP(%%mm0, (%1),%%mm5, q)         \
        "add %3, %0                 \n\t"\
        "add %4, %1                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\
        : "memory"\
    );\
}\
\
static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42)        \
}\
\
static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5)           \
}\
\
static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42)        \
}\
\
static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\

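/* The horizontal kernel in OPNAME ## cavs_qpel8_h is the half-pel
 * filter [-1 5 5 -1]: it computes 5*(src[0] + src[1]) - (src[-1] +
 * src[2]) on unpacked words, adds the rounding constant 4 (ff_pw_4)
 * and shifts right by 3 before packing back with unsigned saturation. */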

#define CAVS_MC(OPNAME, SIZE, MMX) \
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\
}\
\
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\
}\
\
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\
}\
\
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\
}\

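/* CAVS_MC emits the dispatch wrappers for one block size: mc20 is the
 * horizontal half-pel case, and mc01/mc02/mc03 are the three vertical
 * quarter-pel rows implemented by QPEL_CAVSV1/V2/V3 above. */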
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"

#define AVG_3DNOW_OP(a,b,temp, size) \
    "mov" #size " " #b ", " #temp "   \n\t"\
    "pavgusb " #temp ", " #a "        \n\t"\
    "mov" #size " " #a ", " #b "      \n\t"

#define AVG_MMX2_OP(a,b,temp, size) \
    "mov" #size " " #b ", " #temp "   \n\t"\
    "pavgb " #temp ", " #a "          \n\t"\
    "mov" #size " " #a ", " #b "      \n\t"

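/* PUT_OP is a plain store; the AVG variants first average with the
 * pixels already at the destination. pavgusb (3DNow!) and pavgb (MMX2)
 * both compute a byte-wise rounded average, so the two flavours differ
 * only in the required instruction-set extension. */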
QPEL_CAVS(put_, PUT_OP, 3dnow)
QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow)
QPEL_CAVS(put_, PUT_OP, mmx2)
QPEL_CAVS(avg_, AVG_MMX2_OP, mmx2)

CAVS_MC(put_, 8, 3dnow)
CAVS_MC(put_, 16,3dnow)
CAVS_MC(avg_, 8, 3dnow)
CAVS_MC(avg_, 16,3dnow)
CAVS_MC(put_, 8, mmx2)
CAVS_MC(put_, 16,mmx2)
CAVS_MC(avg_, 8, mmx2)
CAVS_MC(avg_, 16,mmx2)

void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);

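/* The qpel tables are indexed by quarter-pel position dy*4 + dx, so
 * entries [0], [2], [4], [8] and [12] below cover the full-pel case
 * plus the four sub-pel positions this file accelerates; the other
 * entries are left to the generic init code. */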
void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx) {
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_mmx2; \
    c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_mmx2; \

    dspfunc(put_cavs_qpel, 0, 16);
    dspfunc(put_cavs_qpel, 1, 8);
    dspfunc(avg_cavs_qpel, 0, 16);
    dspfunc(avg_cavs_qpel, 1, 8);
#undef dspfunc
    c->cavs_idct8_add = cavs_idct8_add_mmx;
}

void ff_cavsdsp_init_3dnow(DSPContext* c, AVCodecContext *avctx) {
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_3dnow; \
    c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_3dnow; \

    dspfunc(put_cavs_qpel, 0, 16);
    dspfunc(put_cavs_qpel, 1, 8);
    dspfunc(avg_cavs_qpel, 0, 16);
    dspfunc(avg_cavs_qpel, 1, 8);
#undef dspfunc
    c->cavs_idct8_add = cavs_idct8_add_mmx;
}