ffmpeg / libavcodec / x86 / dct32_sse.c @ c6a908be
History  View  Annotate  Download (11.5 KB)
1 
/*


2 
* 32 point SSE-optimized DCT transform

3 
* Copyright (c) 2010 Vitor Sessak

4 
*

5 
* This file is part of FFmpeg.

6 
*

7 
* FFmpeg is free software; you can redistribute it and/or

8 
* modify it under the terms of the GNU Lesser General Public

9 
* License as published by the Free Software Foundation; either

10 
* version 2.1 of the License, or (at your option) any later version.

11 
*

12 
* FFmpeg is distributed in the hope that it will be useful,

13 
* but WITHOUT ANY WARRANTY; without even the implied warranty of

14 
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

15 
* Lesser General Public License for more details.

16 
*

17 
* You should have received a copy of the GNU Lesser General Public

18 
* License along with FFmpeg; if not, write to the Free Software

19 
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

20 
*/

21  
22 
#include <stdint.h> 
23  
24 
#include "libavutil/x86_cpu.h" 
25 
#include "libavutil/mem.h" 
26 
#include "libavcodec/dsputil.h" 
27 
#include "fft.h" 
28  
29 
/* Butterfly scale factors, 16-byte aligned; consumed one 4-float row
 * (16 bytes) at a time by the BUTTERFLY* macros through the %2 asm
 * operand.  Row offsets used below: 0, 16, 32, 48, 64, 80, 96, 112, 128.
 * NOTE(review): values appear to be precomputed DCT twiddles of the
 * 0.5/cos(k*pi/64) family plus sqrt(1/2) terms (0.707107) — the
 * derivation is not shown in this file; confirm against the scalar
 * dct32 reference before changing any entry. */
DECLARE_ALIGNED(16, static const float, b1)[] = {
     0.500603,  0.505471,  0.515447,  0.531043,
     0.553104,  0.582935,  0.622504,  0.674808,
     1.169440,  0.972568,  0.839350,  0.744536,
    10.190008,  3.407609,  2.057781,  1.484165,
     0.502419,  0.522499,  0.566944,  0.646822,
     0.788155,  1.060678,  1.722447,  5.101149,
     0.509796,  0.601345,  0.899976,  2.562916,
     1.000000,  1.000000,  1.306563,  0.541196,
     1.000000,  0.707107,  1.000000,  0.707107
};
40  
41 
/* XOR mask for BUTTERFLY0 (%3 asm operand): leaves lanes 0-1 of a
 * 4-float vector untouched and flips the IEEE-754 sign bit
 * (0x80000000) of lanes 2-3, negating those two floats. */
DECLARE_ALIGNED(16, static const int32_t, smask)[4] = {
    0, 0, 0x80000000, 0x80000000
};
44  
45 
/* Butterfly operator on two xmm registers:
 *   tmp = a;  a = (a - b) * c;  b = b + old_a
 * i.e. b receives the unscaled sum, a the scaled difference.
 * c may be a memory operand (a 16-byte row of b1) or an xmm register;
 * tmp is clobbered. */
#define BUTTERFLY(a,b,c,tmp) \
    "movaps %%" #a ", %%" #tmp " \n\t" /* tmp = a          */ \
    "subps  %%" #b ", %%" #a "   \n\t" /* a   = a - b      */ \
    "addps  %%" #tmp ", %%" #b " \n\t" /* b   = b + old a  */ \
    "mulps  " #c ", %%" #a "     \n\t" /* a   = a * c      */
51  
52 
/* Same as BUTTERFLY when vectors a and b overlap, i.e. both operands
 * live in the lanes of a single register:
 * a shuffled copy of val (lane order given by the shuf immediate) has
 * the signs of its lanes 2-3 flipped via mask (smask), is added back
 * onto val, and the result is scaled by cos.  tmp is clobbered. */
#define BUTTERFLY0(val, mask, cos, tmp, shuf) \
    "movaps %%" #val ", %%" #tmp "           \n\t" /* tmp = val              */ \
    "shufps " #shuf ", %%" #val ",%%" #val " \n\t" /* permute val's lanes    */ \
    "xorps  %%" #mask ", %%" #tmp "          \n\t" /* flip signs (lanes 2-3) */ \
    "addps  %%" #tmp ", %%" #val "           \n\t" /* val += flipped copy    */ \
    "mulps  %%" #cos ", %%" #val "           \n\t" /* scale by cos factors   */
59  
60 
/* BUTTERFLY0 with full lane reversal: 0x1b = (0,1,2,3) -> (3,2,1,0). */
#define BUTTERFLY2(val, mask, cos, tmp) BUTTERFLY0(val, mask, cos, tmp, $0x1b)
/* BUTTERFLY0 with in-pair swap: 0xb1 = (0,1,2,3) -> (1,0,3,2). */
#define BUTTERFLY3(val, mask, cos, tmp) BUTTERFLY0(val, mask, cos, tmp, $0xb1)
62  
63 
/**
 * Compute a 32-point DCT on floats, SSE inline-assembly version.
 *
 * @param out 32 output coefficients; also reused as scratch storage
 *            between passes.  Accessed with movaps, so it must be
 *            16-byte aligned.
 * @param in  32 input samples; accessed with movaps, so it must be
 *            16-byte aligned as well.
 *
 * asm operand map:
 *   %0 = tmp1  - 32-bit GPR scratch, used by movl in pass 6 to copy
 *                floats that need no arithmetic
 *   %1 = out   - output / inter-pass scratch
 *   %2 = b1    - butterfly scale-factor table (rows read at offsets
 *                0..128)
 *   %3 = smask - sign mask consumed by BUTTERFLY0
 *   %4 = in    - input samples
 *
 * Passes 1-5 are the SIMD butterfly stages of the 32-point transform
 * (5 = log2(32)); pass 6 is a scalar recombination of partial sums into
 * final coefficient order, as the source comment notes ("no SIMD").
 * NOTE(review): the instruction order below is hand-scheduled and the
 * register allocation is exact - do not reorder without re-deriving the
 * dataflow.
 */
void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
{
    int32_t tmp1 = 0;  /* scratch for the GPR-based float copies in pass 6 */
    __asm__ volatile(
        /* pass 1: butterflies between mirrored 4-float groups of the
         * input (group k vs. lane-reversed group 7-k), scaled by b1 rows */

        "movaps    (%4), %%xmm0       \n\t"
        "movaps    112(%4), %%xmm1    \n\t"
        "shufps    $0x1b, %%xmm1, %%xmm1 \n\t"   /* reverse lanes */
        BUTTERFLY(xmm0, xmm1, (%2), xmm3)

        "movaps    64(%4), %%xmm7     \n\t"
        "movaps    48(%4), %%xmm4     \n\t"
        "shufps    $0x1b, %%xmm4, %%xmm4 \n\t"
        BUTTERFLY(xmm7, xmm4, 48(%2), xmm3)

        /* pass 2 */
        "movaps    64(%2), %%xmm2     \n\t"     /* b1 row for this stage */
        BUTTERFLY(xmm1, xmm4, %%xmm2, xmm3)
        "movaps    %%xmm1, 48(%1)     \n\t"     /* park partials in out[] */
        "movaps    %%xmm4, (%1)       \n\t"

        /* pass 1 (remaining input groups) */
        "movaps    16(%4), %%xmm1     \n\t"
        "movaps    96(%4), %%xmm6     \n\t"
        "shufps    $0x1b, %%xmm6, %%xmm6 \n\t"
        BUTTERFLY(xmm1, xmm6, 16(%2), xmm3)

        "movaps    80(%4), %%xmm4     \n\t"
        "movaps    32(%4), %%xmm5     \n\t"
        "shufps    $0x1b, %%xmm5, %%xmm5 \n\t"
        BUTTERFLY(xmm4, xmm5, 32(%2), xmm3)

        /* pass 2 (remaining pairs; xmm2 still holds the 64(%2) row) */
        BUTTERFLY(xmm0, xmm7, %%xmm2, xmm3)

        "movaps    80(%2), %%xmm2     \n\t"
        BUTTERFLY(xmm6, xmm5, %%xmm2, xmm3)

        BUTTERFLY(xmm1, xmm4, %%xmm2, xmm3)

        /* pass 3 */
        "movaps    96(%2), %%xmm2     \n\t"
        "shufps    $0x1b, %%xmm1, %%xmm1 \n\t"
        BUTTERFLY(xmm0, xmm1, %%xmm2, xmm3)
        "movaps    %%xmm0, 112(%1)    \n\t"
        "movaps    %%xmm1, 96(%1)     \n\t"

        "movaps    0(%1), %%xmm0      \n\t"     /* reload pass-2 partial */
        "shufps    $0x1b, %%xmm5, %%xmm5 \n\t"
        BUTTERFLY(xmm0, xmm5, %%xmm2, xmm3)

        "movaps    48(%1), %%xmm1     \n\t"
        "shufps    $0x1b, %%xmm6, %%xmm6 \n\t"
        BUTTERFLY(xmm1, xmm6, %%xmm2, xmm3)
        "movaps    %%xmm1, 48(%1)     \n\t"

        "shufps    $0x1b, %%xmm4, %%xmm4 \n\t"
        BUTTERFLY(xmm7, xmm4, %%xmm2, xmm3)

        /* pass 4: intra-register butterflies, full lane reversal (0x1b) */
        "movaps    (%3), %%xmm3       \n\t"     /* smask */
        "movaps    112(%2), %%xmm2    \n\t"

        BUTTERFLY2(xmm5, xmm3, xmm2, xmm1)

        BUTTERFLY2(xmm0, xmm3, xmm2, xmm1)
        "movaps    %%xmm0, 16(%1)     \n\t"

        BUTTERFLY2(xmm6, xmm3, xmm2, xmm1)
        "movaps    %%xmm6, 32(%1)     \n\t"

        "movaps    48(%1), %%xmm0     \n\t"
        BUTTERFLY2(xmm0, xmm3, xmm2, xmm1)
        "movaps    %%xmm0, 48(%1)     \n\t"

        BUTTERFLY2(xmm4, xmm3, xmm2, xmm1)

        BUTTERFLY2(xmm7, xmm3, xmm2, xmm1)

        "movaps    96(%1), %%xmm6     \n\t"
        BUTTERFLY2(xmm6, xmm3, xmm2, xmm1)

        "movaps    112(%1), %%xmm0    \n\t"
        BUTTERFLY2(xmm0, xmm3, xmm2, xmm1)

        /* pass 5: intra-register butterflies, in-pair swap (0xb1);
         * 0xCC reshapes the sign mask for the pair-wise pattern */
        "movaps    128(%2), %%xmm2    \n\t"
        "shufps    $0xCC, %%xmm3,%%xmm3 \n\t"

        BUTTERFLY3(xmm5, xmm3, xmm2, xmm1)
        "movaps    %%xmm5, (%1)       \n\t"

        "movaps    16(%1), %%xmm1     \n\t"
        BUTTERFLY3(xmm1, xmm3, xmm2, xmm5)
        "movaps    %%xmm1, 16(%1)     \n\t"

        BUTTERFLY3(xmm4, xmm3, xmm2, xmm5)
        "movaps    %%xmm4, 64(%1)     \n\t"

        BUTTERFLY3(xmm7, xmm3, xmm2, xmm5)
        "movaps    %%xmm7, 80(%1)     \n\t"

        "movaps    32(%1), %%xmm5     \n\t"
        BUTTERFLY3(xmm5, xmm3, xmm2, xmm7)
        "movaps    %%xmm5, 32(%1)     \n\t"

        "movaps    48(%1), %%xmm4     \n\t"
        BUTTERFLY3(xmm4, xmm3, xmm2, xmm7)
        "movaps    %%xmm4, 48(%1)     \n\t"

        BUTTERFLY3(xmm6, xmm3, xmm2, xmm7)
        "movaps    %%xmm6, 96(%1)     \n\t"

        BUTTERFLY3(xmm0, xmm3, xmm2, xmm7)
        "movaps    %%xmm0, 112(%1)    \n\t"


        /* pass 6, no SIMD: scalar addss chains recombine the partial
         * sums in out[] into the final coefficient order; movl via %0
         * copies floats that need no arithmetic.  Heavily interleaved
         * for scheduling - order is load-bearing. */
        "movss     56(%1), %%xmm3     \n\t"
        "movl      4(%1), %0          \n\t"
        "addss     60(%1), %%xmm3     \n\t"
        "movss     72(%1), %%xmm7     \n\t"
        "addss     %%xmm3, %%xmm4     \n\t"
        "movss     52(%1), %%xmm2     \n\t"
        "addss     %%xmm3, %%xmm2     \n\t"
        "movss     24(%1), %%xmm3     \n\t"
        "addss     28(%1), %%xmm3     \n\t"
        "addss     76(%1), %%xmm7     \n\t"
        "addss     %%xmm3, %%xmm1     \n\t"
        "addss     %%xmm4, %%xmm5     \n\t"
        "movss     %%xmm1, 16(%1)     \n\t"
        "movss     20(%1), %%xmm1     \n\t"
        "addss     %%xmm3, %%xmm1     \n\t"
        "movss     40(%1), %%xmm3     \n\t"
        "movss     %%xmm1, 48(%1)     \n\t"
        "addss     44(%1), %%xmm3     \n\t"
        "movss     20(%1), %%xmm1     \n\t"
        "addss     %%xmm3, %%xmm4     \n\t"
        "addss     %%xmm2, %%xmm3     \n\t"
        "addss     28(%1), %%xmm1     \n\t"
        "movss     %%xmm3, 40(%1)     \n\t"
        "addss     36(%1), %%xmm2     \n\t"
        "movss     8(%1), %%xmm3      \n\t"
        "movss     %%xmm2, 56(%1)     \n\t"
        "addss     12(%1), %%xmm3     \n\t"
        "movss     %%xmm5, 8(%1)      \n\t"
        "movss     %%xmm3, 32(%1)     \n\t"
        "movss     52(%1), %%xmm2     \n\t"
        "movss     80(%1), %%xmm3     \n\t"
        "movss     120(%1), %%xmm5    \n\t"
        "movss     %%xmm1, 80(%1)     \n\t"
        "movss     %%xmm4, 24(%1)     \n\t"
        "addss     124(%1), %%xmm5    \n\t"
        "movss     64(%1), %%xmm1     \n\t"
        "addss     60(%1), %%xmm2     \n\t"
        "addss     %%xmm5, %%xmm0     \n\t"
        "addss     116(%1), %%xmm5    \n\t"
        "movl      %0, 64(%1)         \n\t"     /* out[16] = old out[1] */
        "addss     %%xmm0, %%xmm6     \n\t"
        "addss     %%xmm6, %%xmm1     \n\t"
        "movl      12(%1), %0         \n\t"
        "movss     %%xmm1, 4(%1)      \n\t"
        "movss     88(%1), %%xmm1     \n\t"
        "movl      %0, 96(%1)         \n\t"     /* out[24] = old out[3] */
        "addss     92(%1), %%xmm1     \n\t"
        "movss     104(%1), %%xmm4    \n\t"
        "movl      28(%1), %0         \n\t"
        "addss     108(%1), %%xmm4    \n\t"
        "addss     %%xmm4, %%xmm0     \n\t"
        "addss     %%xmm1, %%xmm3     \n\t"
        "addss     84(%1), %%xmm1     \n\t"
        "addss     %%xmm5, %%xmm4     \n\t"
        "addss     %%xmm3, %%xmm6     \n\t"
        "addss     %%xmm0, %%xmm3     \n\t"
        "addss     %%xmm7, %%xmm0     \n\t"
        "addss     100(%1), %%xmm5    \n\t"
        "addss     %%xmm4, %%xmm7     \n\t"
        "movl      %0, 112(%1)        \n\t"     /* out[28] = old out[7] */
        "movss     %%xmm0, 28(%1)     \n\t"
        "movss     36(%1), %%xmm0     \n\t"
        "movss     %%xmm7, 36(%1)     \n\t"
        "addss     %%xmm1, %%xmm4     \n\t"
        "movss     116(%1), %%xmm7    \n\t"
        "addss     %%xmm2, %%xmm0     \n\t"
        "addss     124(%1), %%xmm7    \n\t"
        "movss     %%xmm0, 72(%1)     \n\t"
        "movss     44(%1), %%xmm0     \n\t"
        "movss     %%xmm6, 12(%1)     \n\t"
        "movss     %%xmm3, 20(%1)     \n\t"
        "addss     %%xmm0, %%xmm2     \n\t"
        "movss     %%xmm4, 44(%1)     \n\t"
        "movss     %%xmm2, 88(%1)     \n\t"
        "addss     60(%1), %%xmm0     \n\t"
        "movl      60(%1), %0         \n\t"
        "movl      %0, 120(%1)        \n\t"     /* out[30] = old out[15] */
        "movss     %%xmm0, 104(%1)    \n\t"
        "addss     %%xmm5, %%xmm1     \n\t"
        "addss     68(%1), %%xmm5     \n\t"
        "movss     %%xmm1, 52(%1)     \n\t"
        "movss     %%xmm5, 60(%1)     \n\t"
        "movss     68(%1), %%xmm1     \n\t"
        "movss     100(%1), %%xmm5    \n\t"
        "addss     %%xmm7, %%xmm5     \n\t"
        "addss     108(%1), %%xmm7    \n\t"
        "addss     %%xmm5, %%xmm1     \n\t"
        "movss     84(%1), %%xmm2     \n\t"
        "addss     92(%1), %%xmm2     \n\t"
        "addss     %%xmm2, %%xmm5     \n\t"
        "movss     %%xmm1, 68(%1)     \n\t"
        "addss     %%xmm7, %%xmm2     \n\t"
        "movss     76(%1), %%xmm1     \n\t"
        "movss     %%xmm2, 84(%1)     \n\t"
        "movss     %%xmm5, 76(%1)     \n\t"
        "movss     108(%1), %%xmm2    \n\t"
        "addss     %%xmm1, %%xmm7     \n\t"
        "addss     124(%1), %%xmm2    \n\t"
        "addss     %%xmm2, %%xmm1     \n\t"
        "addss     92(%1), %%xmm2     \n\t"
        "movss     %%xmm1, 100(%1)    \n\t"
        "movss     %%xmm2, 108(%1)    \n\t"
        "movss     92(%1), %%xmm2     \n\t"
        "movss     %%xmm7, 92(%1)     \n\t"
        "addss     124(%1), %%xmm2    \n\t"
        "movss     %%xmm2, 116(%1)    \n\t"
        :"+&r"(tmp1)                                     /* %0: GPR scratch */
        :"r"(out), "r"(b1), "r"(smask), "r"(in)          /* %1..%4 */
        :"memory"                                        /* out[] is written */
         XMM_CLOBBERS(, "%xmm0", "%xmm1", "%xmm2", "%xmm3",
                        "%xmm4", "%xmm5", "%xmm6", "%xmm7")
    );
}
296 