ffmpeg / libavcodec / i386 / idct_sse2_xvid.c @ bc314472
History  View  Annotate  Download (15.1 KB)
/*
 * XVID MPEG4 VIDEO CODEC
 * SSE2 inverse discrete cosine transform
 *
 * Copyright(C) 2003 Pascal Massimino <skal@planetd.net>
 *
 * Conversion to gcc syntax with modifications
 * by Alexander Strange <astrange@ithinksw.com>
 *
 * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
 *
 * This file is part of FFmpeg.
 *
 * Vertical pass is an implementation of the scheme:
 * Loeffler C., Ligtenberg A., and Moschytz C.S.:
 * Practical Fast 1D DCT Algorithm with Eleven Multiplications,
 * Proc. ICASSP 1989, 988-991.
 *
 * Horizontal pass is a double 4x4 vector/matrix multiplication,
 * (see also Intel's Application Note 922:
 * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
 * Copyright (C) 1999 Intel Corporation)
 *
 * More details at http://skal.planetd.net/coding/dct.html
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with FFmpeg; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/dsputil.h"
#include "libavcodec/i386/idct_xvid.h"

/*!
 * @file idct_sse2_xvid.c
 * @brief SSE2 idct compatible with xvidmmx
 */

/* Expands to its argument repeated eight times, comma-separated; used below
 * to fill an eight-element initializer with one value. */
#define X8(x) x,x,x,x,x,x,x,x

/* Right-shift amounts of the two passes.  The asm macros below hard-code the
 * same values as immediates ("psrad $11" in iMTX_MULT, "psraw $6" in
 * iLLM_PASS / iLLM_PASS_SPARSE) rather than expanding these names. */
#define ROW_SHIFT 11
#define COL_SHIFT 6

54 
DECLARE_ASM_CONST(16, int16_t, tan1[]) = {X8(13036)}; // tan( pi/16) 
55 
DECLARE_ASM_CONST(16, int16_t, tan2[]) = {X8(27146)}; // tan(2pi/16) = sqrt(2)1 
56 
DECLARE_ASM_CONST(16, int16_t, tan3[]) = {X8(43790)}; // tan(3pi/16)1 
57 
DECLARE_ASM_CONST(16, int16_t, sqrt2[])= {X8(23170)}; // 0.5/sqrt(2) 
58 
DECLARE_ASM_CONST(8, uint8_t, m127[]) = {X8(127)}; 
59  
60 
DECLARE_ASM_CONST(16, int16_t, iTab1[]) = {

61 
0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d, 
62 
0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61, 
63 
0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7, 
64 
0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b 
65 
}; 
66  
67 
DECLARE_ASM_CONST(16, int16_t, iTab2[]) = {

68 
0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5, 
69 
0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04, 
70 
0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41, 
71 
0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df 
72 
}; 
73  
74 
DECLARE_ASM_CONST(16, int16_t, iTab3[]) = {

75 
0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf, 
76 
0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf, 
77 
0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d, 
78 
0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04 
79 
}; 
80  
81 
DECLARE_ASM_CONST(16, int16_t, iTab4[]) = {

82 
0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746, 
83 
0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac, 
84 
0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df, 
85 
0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e 
86 
}; 
87  
88 
DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders[]) = {

89 
65536, 65536, 65536, 65536, 
90 
3597, 3597, 3597, 3597, 
91 
2260, 2260, 2260, 2260, 
92 
1203, 1203, 1203, 1203, 
93 
120, 120, 120, 120, 
94 
512, 512, 512, 512 
95 
}; 
96  
// Temporary storage before the column pass
// (odd rows stay in these registers on both 32- and 64-bit builds).
#define ROW1 "%%xmm6"
#define ROW3 "%%xmm4"
#define ROW5 "%%xmm5"
#define ROW7 "%%xmm7"

/* Zero register r; used for an odd row whose transform is skipped. */
#define CLEAR_ODD(r) "pxor "r","r" \n\t"
/* Copy the row result left in %xmm2 by iMTX_MULT into dst, reversing the
 * order of its four high words (pshufhw $0x1B). */
#define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t"

/*
 * Storage for the even rows and scratch registers.
 *
 * x86-64: the even rows live in %xmm8-%xmm11, so ROWn and REGn are the same
 * register and MOV_32_ONLY expands to "#", turning each load/store line
 * that starts with it into an assembler comment.
 *
 * 32-bit: only %xmm0-%xmm7 are used, so even rows are written back to the
 * block (ROWn are memory operands relative to %0) and reloaded into the
 * REGn scratch registers by the MOV_32_ONLY lines of the column pass.
 * CLEAR_EVEN is empty in this branch, so a skipped even row is left in
 * memory untouched.
 */
#ifdef ARCH_X86_64

# define ROW0 "%%xmm8"
# define REG0 ROW0
# define ROW2 "%%xmm9"
# define REG2 ROW2
# define ROW4 "%%xmm10"
# define REG4 ROW4
# define ROW6 "%%xmm11"
# define REG6 ROW6
# define CLEAR_EVEN(r) CLEAR_ODD(r)
# define PUT_EVEN(dst) PUT_ODD(dst)
# define XMMS "%%xmm12"
# define MOV_32_ONLY "#"
# define SREG2 REG2
# define TAN3 "%%xmm13"
# define TAN1 "%%xmm14"

#else

# define ROW0 "(%0)"
# define REG0 "%%xmm4"
# define ROW2 "2*16(%0)"
# define REG2 "%%xmm4"
# define ROW4 "4*16(%0)"
# define REG4 "%%xmm6"
# define ROW6 "6*16(%0)"
# define REG6 "%%xmm6"
# define CLEAR_EVEN(r)
# define PUT_EVEN(dst) \
    "pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \
    "movdqa %%xmm2, "dst" \n\t"
# define XMMS "%%xmm2"
# define MOV_32_ONLY "movdqa "
# define SREG2 "%%xmm7"
# define TAN3 "%%xmm0"
# define TAN1 "%%xmm2"

#endif

/* Start of a "paddd <symbol>" instruction; iMTX_MULT appends the destination
 * operand (", %%xmm0") after it. */
#define ROUND(x) "paddd "MANGLE(x)

/* Jump to label `to` when the 32-bit register `reg` is zero. */
#define JZ(reg, to) \
    "testl "reg","reg" \n\t" \
    "jz "to" \n\t"

/* Jump to label `to` when the 32-bit register `reg` is non-zero. */
#define JNZ(reg, to) \
    "testl "reg","reg" \n\t" \
    "jnz "to" \n\t"

/*
 * Set `reg` to a non-zero value iff the 16-byte row at `src` contains a
 * non-zero byte.  The two 8-byte halves are OR-ed in %mm1, then 127 is added
 * to every byte with unsigned saturation (%mm0 must hold m127) and the top
 * bits are collected with pmovmskb.  `clear` is emitted first, verbatim.
 * Clobbers %mm1.
 */
#define TEST_ONE_ROW(src, reg, clear) \
    clear \
    "movq "src", %%mm1 \n\t" \
    "por 8+"src", %%mm1 \n\t" \
    "paddusb %%mm0, %%mm1 \n\t" \
    "pmovmskb %%mm1, "reg" \n\t"

/*
 * Two-row variant of TEST_ONE_ROW: reg1/reg2 become non-zero iff row1/row2
 * contain a non-zero byte.  clear1 and clear2 are emitted first, verbatim.
 * %mm0 must hold m127; clobbers %mm1 and %mm2.
 */
#define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \
    clear1 \
    clear2 \
    "movq "row1", %%mm1 \n\t" \
    "por 8+"row1", %%mm1 \n\t" \
    "movq "row2", %%mm2 \n\t" \
    "por 8+"row2", %%mm2 \n\t" \
    "paddusb %%mm0, %%mm1 \n\t" \
    "paddusb %%mm0, %%mm2 \n\t" \
    "pmovmskb %%mm1, "reg1" \n\t" \
    "pmovmskb %%mm2, "reg2" \n\t"

///IDCT pass on rows.
/*
 * src     - memory operand of the 16-byte row to transform
 * table   - symbol of a 64-byte coefficient table (iTab1..iTab4)
 * rounder - text of an instruction minus its destination, normally
 *           ROUND(...); the caller passes "#" to turn the line into an
 *           assembler comment, i.e. no rounding
 * put     - instructions that move the packed result out of %xmm2
 *           (PUT_ODD / PUT_EVEN)
 *
 * The shift immediate 11 equals ROW_SHIFT.  Clobbers %xmm0-%xmm3.
 * The macro ends with the local label "1:", so a preceding `JZ(reg, "1f")`
 * skips the whole transform, including `put`.
 */
#define iMTX_MULT(src, table, rounder, put) \
    "movdqa "src", %%xmm3 \n\t" \
    "movdqa %%xmm3, %%xmm0 \n\t" \
    "pshufd $0x11, %%xmm3, %%xmm1 \n\t" /* 4602 */ \
    "punpcklqdq %%xmm0, %%xmm0 \n\t" /* 0246 */ \
    "pmaddwd "table", %%xmm0 \n\t" \
    "pmaddwd 16+"table", %%xmm1 \n\t" \
    "pshufd $0xBB, %%xmm3, %%xmm2 \n\t" /* 5713 */ \
    "punpckhqdq %%xmm3, %%xmm3 \n\t" /* 1357 */ \
    "pmaddwd 32+"table", %%xmm2 \n\t" \
    "pmaddwd 48+"table", %%xmm3 \n\t" \
    "paddd %%xmm1, %%xmm0 \n\t" \
    "paddd %%xmm3, %%xmm2 \n\t" \
    rounder", %%xmm0 \n\t" \
    "movdqa %%xmm2, %%xmm3 \n\t" \
    "paddd %%xmm0, %%xmm2 \n\t" \
    "psubd %%xmm3, %%xmm0 \n\t" \
    "psrad $11, %%xmm2 \n\t" \
    "psrad $11, %%xmm0 \n\t" \
    "packssdw %%xmm0, %%xmm2 \n\t" \
    put \
    "1: \n\t"

/* Load the tan3 and tan1 multipliers into the TAN3 / TAN1 registers used by
 * the column pass. */
#define iLLM_HEAD \
    "movdqa "MANGLE(tan3)", "TAN3" \n\t" \
    "movdqa "MANGLE(tan1)", "TAN1" \n\t"

///IDCT pass on columns.
/*
 * dct - base register of the block; the eight result rows are stored to
 *       0..7*16(dct).
 *
 * Expects TAN3/TAN1 loaded (iLLM_HEAD), the odd rows in ROW1/ROW3/ROW5/ROW7
 * (%xmm6/%xmm4/%xmm5/%xmm7) and the even rows in ROW0/ROW2/ROW4/ROW6.
 * Lines starting with MOV_32_ONLY are real moves on 32-bit builds and
 * assembler comments on x86-64.  The shift immediate 6 equals COL_SHIFT.
 */
#define iLLM_PASS(dct) \
    /* odd rows: 3/5 scaled with tan3, 1/7 with tan1, then butterflies */ \
    "movdqa "TAN3", %%xmm1 \n\t" \
    "movdqa "TAN1", %%xmm3 \n\t" \
    "pmulhw %%xmm4, "TAN3" \n\t" \
    "pmulhw %%xmm5, %%xmm1 \n\t" \
    "paddsw %%xmm4, "TAN3" \n\t" \
    "paddsw %%xmm5, %%xmm1 \n\t" \
    "psubsw %%xmm5, "TAN3" \n\t" \
    "paddsw %%xmm4, %%xmm1 \n\t" \
    "pmulhw %%xmm7, %%xmm3 \n\t" \
    "pmulhw %%xmm6, "TAN1" \n\t" \
    "paddsw %%xmm6, %%xmm3 \n\t" \
    "psubsw %%xmm7, "TAN1" \n\t" \
    "movdqa %%xmm3, %%xmm7 \n\t" \
    "movdqa "TAN1", %%xmm6 \n\t" \
    "psubsw %%xmm1, %%xmm3 \n\t" \
    "psubsw "TAN3", "TAN1" \n\t" \
    "paddsw %%xmm7, %%xmm1 \n\t" \
    "paddsw %%xmm6, "TAN3" \n\t" \
    "movdqa %%xmm3, %%xmm6 \n\t" \
    "psubsw "TAN3", %%xmm3 \n\t" \
    "paddsw %%xmm6, "TAN3" \n\t" \
    "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
    "pmulhw %%xmm4, %%xmm3 \n\t" \
    "pmulhw %%xmm4, "TAN3" \n\t" \
    "paddsw "TAN3", "TAN3" \n\t" \
    "paddsw %%xmm3, %%xmm3 \n\t" \
    /* even rows: 2/6 scaled with tan2, 0/4 butterfly */ \
    "movdqa "MANGLE(tan2)", %%xmm7 \n\t" \
    MOV_32_ONLY ROW2", "REG2" \n\t" \
    MOV_32_ONLY ROW6", "REG6" \n\t" \
    "movdqa %%xmm7, %%xmm5 \n\t" \
    "pmulhw "REG6", %%xmm7 \n\t" \
    "pmulhw "REG2", %%xmm5 \n\t" \
    "paddsw "REG2", %%xmm7 \n\t" \
    "psubsw "REG6", %%xmm5 \n\t" \
    MOV_32_ONLY ROW0", "REG0" \n\t" \
    MOV_32_ONLY ROW4", "REG4" \n\t" \
    /* 32-bit: TAN1 and XMMS are both %xmm2, so TAN1 is parked in (%0), */ \
    /* whose row-0 content has just been loaded into REG0 */ \
    MOV_32_ONLY" "TAN1", (%0) \n\t" \
    "movdqa "REG0", "XMMS" \n\t" \
    "psubsw "REG4", "REG0" \n\t" \
    "paddsw "XMMS", "REG4" \n\t" \
    "movdqa "REG4", "XMMS" \n\t" \
    "psubsw %%xmm7, "REG4" \n\t" \
    "paddsw "XMMS", %%xmm7 \n\t" \
    "movdqa "REG0", "XMMS" \n\t" \
    "psubsw %%xmm5, "REG0" \n\t" \
    "paddsw "XMMS", %%xmm5 \n\t" \
    "movdqa %%xmm5, "XMMS" \n\t" \
    "psubsw "TAN3", %%xmm5 \n\t" \
    "paddsw "XMMS", "TAN3" \n\t" \
    "movdqa "REG0", "XMMS" \n\t" \
    "psubsw %%xmm3, "REG0" \n\t" \
    "paddsw "XMMS", %%xmm3 \n\t" \
    MOV_32_ONLY" (%0), "TAN1" \n\t" \
    /* scale and store rows 1, 2, 5, 6 */ \
    "psraw $6, %%xmm5 \n\t" \
    "psraw $6, "REG0" \n\t" \
    "psraw $6, "TAN3" \n\t" \
    "psraw $6, %%xmm3 \n\t" \
    "movdqa "TAN3", 1*16("dct") \n\t" \
    "movdqa %%xmm3, 2*16("dct") \n\t" \
    "movdqa "REG0", 5*16("dct") \n\t" \
    "movdqa %%xmm5, 6*16("dct") \n\t" \
    /* final butterflies, scale and store rows 0, 3, 4, 7 */ \
    "movdqa %%xmm7, %%xmm0 \n\t" \
    "movdqa "REG4", %%xmm4 \n\t" \
    "psubsw %%xmm1, %%xmm7 \n\t" \
    "psubsw "TAN1", "REG4" \n\t" \
    "paddsw %%xmm0, %%xmm1 \n\t" \
    "paddsw %%xmm4, "TAN1" \n\t" \
    "psraw $6, %%xmm1 \n\t" \
    "psraw $6, %%xmm7 \n\t" \
    "psraw $6, "TAN1" \n\t" \
    "psraw $6, "REG4" \n\t" \
    "movdqa %%xmm1, ("dct") \n\t" \
    "movdqa "TAN1", 3*16("dct") \n\t" \
    "movdqa "REG4", 4*16("dct") \n\t" \
    "movdqa %%xmm7, 7*16("dct") \n\t"

///IDCT pass on columns, assuming rows 4-7 are zero.
/*
 * dct - base register of the block; the eight result rows are stored to
 *       0..7*16(dct).
 *
 * Reached from ff_idct_xvid_sse2() only when the zero tests for rows 4, 5,
 * 6 and 7 all came out zero, so only ROW0..ROW3 are read.  Expects
 * TAN3/TAN1 loaded (iLLM_HEAD), ROW1 in %xmm6 and ROW3 in %xmm4.
 * Lines starting with MOV_32_ONLY are real moves on 32-bit builds and
 * assembler comments on x86-64.  The shift immediate 6 equals COL_SHIFT.
 */
#define iLLM_PASS_SPARSE(dct) \
    /* odd rows 1 and 3 */ \
    "pmulhw %%xmm4, "TAN3" \n\t" \
    "paddsw %%xmm4, "TAN3" \n\t" \
    "movdqa %%xmm6, %%xmm3 \n\t" \
    "pmulhw %%xmm6, "TAN1" \n\t" \
    "movdqa %%xmm4, %%xmm1 \n\t" \
    "psubsw %%xmm1, %%xmm3 \n\t" \
    "paddsw %%xmm6, %%xmm1 \n\t" \
    "movdqa "TAN1", %%xmm6 \n\t" \
    "psubsw "TAN3", "TAN1" \n\t" \
    "paddsw %%xmm6, "TAN3" \n\t" \
    "movdqa %%xmm3, %%xmm6 \n\t" \
    "psubsw "TAN3", %%xmm3 \n\t" \
    "paddsw %%xmm6, "TAN3" \n\t" \
    "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
    "pmulhw %%xmm4, %%xmm3 \n\t" \
    "pmulhw %%xmm4, "TAN3" \n\t" \
    "paddsw "TAN3", "TAN3" \n\t" \
    "paddsw %%xmm3, %%xmm3 \n\t" \
    /* even rows 0 and 2 */ \
    "movdqa "MANGLE(tan2)", %%xmm5 \n\t" \
    MOV_32_ONLY ROW2", "SREG2" \n\t" \
    "pmulhw "SREG2", %%xmm5 \n\t" \
    MOV_32_ONLY ROW0", "REG0" \n\t" \
    "movdqa "REG0", %%xmm6 \n\t" \
    "psubsw "SREG2", %%xmm6 \n\t" \
    "paddsw "REG0", "SREG2" \n\t" \
    /* 32-bit: TAN1 and XMMS are both %xmm2, so TAN1 is parked in (%0), */ \
    /* whose row-0 content has just been loaded into REG0 */ \
    MOV_32_ONLY" "TAN1", (%0) \n\t" \
    "movdqa "REG0", "XMMS" \n\t" \
    "psubsw %%xmm5, "REG0" \n\t" \
    "paddsw "XMMS", %%xmm5 \n\t" \
    "movdqa %%xmm5, "XMMS" \n\t" \
    "psubsw "TAN3", %%xmm5 \n\t" \
    "paddsw "XMMS", "TAN3" \n\t" \
    "movdqa "REG0", "XMMS" \n\t" \
    "psubsw %%xmm3, "REG0" \n\t" \
    "paddsw "XMMS", %%xmm3 \n\t" \
    MOV_32_ONLY" (%0), "TAN1" \n\t" \
    /* scale and store rows 1, 2, 5, 6 */ \
    "psraw $6, %%xmm5 \n\t" \
    "psraw $6, "REG0" \n\t" \
    "psraw $6, "TAN3" \n\t" \
    "psraw $6, %%xmm3 \n\t" \
    "movdqa "TAN3", 1*16("dct") \n\t" \
    "movdqa %%xmm3, 2*16("dct") \n\t" \
    "movdqa "REG0", 5*16("dct") \n\t" \
    "movdqa %%xmm5, 6*16("dct") \n\t" \
    /* final butterflies, scale and store rows 0, 3, 4, 7 */ \
    "movdqa "SREG2", %%xmm0 \n\t" \
    "movdqa %%xmm6, %%xmm4 \n\t" \
    "psubsw %%xmm1, "SREG2" \n\t" \
    "psubsw "TAN1", %%xmm6 \n\t" \
    "paddsw %%xmm0, %%xmm1 \n\t" \
    "paddsw %%xmm4, "TAN1" \n\t" \
    "psraw $6, %%xmm1 \n\t" \
    "psraw $6, "SREG2" \n\t" \
    "psraw $6, "TAN1" \n\t" \
    "psraw $6, %%xmm6 \n\t" \
    "movdqa %%xmm1, ("dct") \n\t" \
    "movdqa "TAN1", 3*16("dct") \n\t" \
    "movdqa %%xmm6, 4*16("dct") \n\t" \
    "movdqa "SREG2", 7*16("dct") \n\t"

342 
inline void ff_idct_xvid_sse2(short *block) 
343 
{ 
344 
asm volatile( 
345 
"movq "MANGLE(m127)", %%mm0 \n\t" 
346 
iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0))

347 
iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1)) 
348 
iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2)) 
349  
350 
TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4)) 
351 
JZ("%%eax", "1f") 
352 
iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3)) 
353  
354 
TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6)) 
355 
TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7)) 
356 
iLLM_HEAD 
357 
ASMALIGN(4)

358 
JNZ("%%ecx", "2f") 
359 
JNZ("%%eax", "3f") 
360 
JNZ("%%edx", "4f") 
361 
JNZ("%%esi", "5f") 
362 
iLLM_PASS_SPARSE("%0")

363 
"jmp 6f \n\t"

364 
"2: \n\t"

365 
iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4)) 
366 
"3: \n\t"

367 
iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5)) 
368 
JZ("%%edx", "1f") 
369 
"4: \n\t"

370 
iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6)) 
371 
JZ("%%esi", "1f") 
372 
"5: \n\t"

373 
iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7)) 
374 
#ifndef ARCH_X86_64

375 
iLLM_HEAD 
376 
#endif

377 
iLLM_PASS("%0")

378 
"6: \n\t"

379 
: "+r"(block)

380 
: 
381 
: "%eax", "%ecx", "%edx", "%esi", "memory"); 
382 
} 
383  
/**
 * Run ff_idct_xvid_sse2() on block in place, then hand the result to
 * put_pixels_clamped_mmx() together with dest and line_size.
 *
 * @param dest      destination passed through to put_pixels_clamped_mmx()
 * @param line_size stride passed through to put_pixels_clamped_mmx()
 * @param block     coefficients; overwritten by the IDCT
 */
void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block)
{
    ff_idct_xvid_sse2(block);
    put_pixels_clamped_mmx(block, dest, line_size);
}

/**
 * Run ff_idct_xvid_sse2() on block in place, then hand the result to
 * add_pixels_clamped_mmx() together with dest and line_size.
 *
 * @param dest      destination passed through to add_pixels_clamped_mmx()
 * @param line_size stride passed through to add_pixels_clamped_mmx()
 * @param block     coefficients; overwritten by the IDCT
 */
void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block)
{
    ff_idct_xvid_sse2(block);
    add_pixels_clamped_mmx(block, dest, line_size);
}