Revision 4dcc4f8e libavcodec/x86/fft_sse.c
libavcodec/x86/fft_sse.c  

20  20 
*/ 
21  21  
22  22 
#include "libavutil/x86_cpu.h" 
23 
#include "libavutil/common.h" 

23  24 
#include "libavcodec/dsputil.h" 
24  25 
#include "fft.h" 
25  26  
...  ...  
201  202 
); 
202  203 
} 
203  204  
205 
/*
 * Butterfly coefficient table for the 32-point DCT below (asm operand %2 in
 * ff_dct32_float_sse; each pass loads its coefficient groups at 16-byte
 * offsets, up to 128(%2)).
 * The values look like precomputed DCT-32 twiddles of the form
 * 0.5/cos((2*i+1)*pi/64) (e.g. 0.500603 ~ 0.5/cos(pi/64), 0.707107 ~
 * 1/sqrt(2)) -- NOTE(review): the derivation is not visible in this file;
 * confirm against the scalar dct32 reference implementation.
 * Must stay 16-byte aligned: the asm reads it with movaps/aligned mulps.
 */
DECLARE_ALIGNED(16, static const float, b1)[] = {
    0.500603,  0.505471,  0.515447,  0.531043,
    0.553104,  0.582935,  0.622504,  0.674808,
    1.169440,  0.972568,  0.839350,  0.744536,
   10.190008,  3.407609,  2.057781,  1.484165,
    0.502419,  0.522499,  0.566944,  0.646822,
    0.788155,  1.060678,  1.722447,  5.101149,
    0.509796,  0.601345,  0.899976,  2.562916,
    1.000000,  1.000000,  1.306563,  0.541196,
    1.000000,  0.707107,  1.000000,  0.707107
};

216  
217 
/*
 * Sign mask for BUTTERFLY0 (asm operand %3): xorps with this flips the
 * IEEE-754 sign bit (0x80000000) in the two high lanes and leaves the two
 * low lanes untouched, producing the +/- lane pattern of an in-register
 * butterfly.  Must stay 16-byte aligned (used via movaps/xorps).
 */
DECLARE_ALIGNED(16, static const int32_t, smask)[4] = {
    0, 0, 0x80000000, 0x80000000
};

220  
221 
/* butterfly operator on two xmm vectors (AT&T operand order):
 *   tmp = a;  a = (a - b) * c;  b = b + tmp
 * i.e. 'b' receives the lane-wise sums and 'a' the coefficient-scaled
 * differences.  'c' is passed with its full addressing syntax, so it may
 * be either a register (%%xmm2) or a memory operand (48(%2)); 'tmp' is a
 * scratch xmm register and is clobbered. */
#define BUTTERFLY(a,b,c,tmp) \
    "movaps %%" #a ", %%" #tmp " \n\t" \
    "subps %%" #b ", %%" #a " \n\t" \
    "addps %%" #tmp ", %%" #b " \n\t" \
    "mulps " #c ", %%" #a " \n\t"

227  
228 
/* Same as BUTTERFLY when vectors a and b overlap: performs the butterfly
 * inside a single register.  'val' is replaced by
 *   shuffle(val, shuf) + signflip(val, mask)
 * so lanes whose mask bit is clear hold sums and lanes whose mask bit is
 * set (see smask) hold differences; the whole vector is then scaled by
 * 'cos'.  'tmp' is a scratch xmm register and is clobbered. */
#define BUTTERFLY0(val, mask, cos, tmp, shuf) \
    "movaps %%" #val ", %%" #tmp " \n\t" \
    "shufps " #shuf ", %%" #val ",%%" #val " \n\t" \
    "xorps %%" #mask ", %%" #tmp " \n\t" /* flip signs */ \
    "addps %%" #tmp ", %%" #val " \n\t" \
    "mulps %%" #cos ", %%" #val " \n\t"

235  
236 
/* In-register butterfly pairing lane i with lane 3-i: shuffle $0x1b
 * (0b00011011) fully reverses the four lanes. */
#define BUTTERFLY2(val, mask, cos, tmp) BUTTERFLY0(val, mask, cos, tmp, $0x1b)
/* In-register butterfly pairing adjacent lanes: shuffle $0xb1
 * (0b10110001) swaps the two lanes within each 64-bit half. */
#define BUTTERFLY3(val, mask, cos, tmp) BUTTERFLY0(val, mask, cos, tmp, $0xb1)

238  
239 
/**
 * 32-point DCT on float samples, SSE version.
 *
 * Computes out[0..31] from in[0..31] in five vectorized butterfly passes
 * followed by a scalar recombination pass ("pass 6").  out[] is written
 * and re-read as scratch between passes, so it appears that out must not
 * alias in -- NOTE(review): confirm against callers.
 *
 * Asm operands:
 *   %0 = tmp1  (GPR scratch; shuttles 32-bit values copied unchanged
 *               from one out[] slot to another via movl in pass 6)
 *   %1 = out   %2 = b1 coefficient table   %3 = smask   %4 = in
 *
 * NOTE(review): the clobber list declares only "memory"; xmm0-xmm7 are
 * clobbered but not listed -- verify this is safe for every compiler/ABI
 * this file is built with.
 *
 * @param out destination, 32 floats (must be 16-byte aligned: movaps)
 * @param in  source, 32 floats (must be 16-byte aligned: movaps)
 */
void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
{
    int32_t tmp1 = 0; /* integer scratch register for pass-6 movl copies */
    __asm__ volatile(
        /* pass 1: butterflies between a group in[i..i+3] and the lane-
         * reversed (shufps $0x1b) group from the mirrored end of in[] */

        "movaps (%4), %%xmm0 \n\t"           /* in[0..3]   */
        "movaps 112(%4), %%xmm1 \n\t"        /* in[28..31] */
        "shufps $0x1b, %%xmm1, %%xmm1 \n\t"  /* reverse lanes */
        BUTTERFLY(xmm0, xmm1, (%2), xmm3)

        "movaps 64(%4), %%xmm7 \n\t"         /* in[16..19] */
        "movaps 48(%4), %%xmm4 \n\t"         /* in[12..15] */
        "shufps $0x1b, %%xmm4, %%xmm4 \n\t"
        BUTTERFLY(xmm7, xmm4, 48(%2), xmm3)

        /* pass 2 (coefficients at 64(%2)); intermediate results parked
         * in out[] which doubles as scratch from here on */
        "movaps 64(%2), %%xmm2 \n\t"
        BUTTERFLY(xmm1, xmm4, %%xmm2, xmm3)
        "movaps %%xmm1, 48(%1) \n\t"
        "movaps %%xmm4, (%1) \n\t"

        /* pass 1 (remaining input groups) */
        "movaps 16(%4), %%xmm1 \n\t"         /* in[4..7]   */
        "movaps 96(%4), %%xmm6 \n\t"         /* in[24..27] */
        "shufps $0x1b, %%xmm6, %%xmm6 \n\t"
        BUTTERFLY(xmm1, xmm6, 16(%2), xmm3)

        "movaps 80(%4), %%xmm4 \n\t"         /* in[20..23] */
        "movaps 32(%4), %%xmm5 \n\t"         /* in[8..11]  */
        "shufps $0x1b, %%xmm5, %%xmm5 \n\t"
        BUTTERFLY(xmm4, xmm5, 32(%2), xmm3)

        /* pass 2 (xmm2 still holds 64(%2), then switches to 80(%2)) */
        BUTTERFLY(xmm0, xmm7, %%xmm2, xmm3)

        "movaps 80(%2), %%xmm2 \n\t"
        BUTTERFLY(xmm6, xmm5, %%xmm2, xmm3)

        BUTTERFLY(xmm1, xmm4, %%xmm2, xmm3)

        /* pass 3 (coefficients at 96(%2)) */
        "movaps 96(%2), %%xmm2 \n\t"
        "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
        BUTTERFLY(xmm0, xmm1, %%xmm2, xmm3)
        "movaps %%xmm0, 112(%1) \n\t"
        "movaps %%xmm1, 96(%1) \n\t"

        "movaps 0(%1), %%xmm0 \n\t"          /* reload pass-2 spill */
        "shufps $0x1b, %%xmm5, %%xmm5 \n\t"
        BUTTERFLY(xmm0, xmm5, %%xmm2, xmm3)

        "movaps 48(%1), %%xmm1 \n\t"         /* reload pass-2 spill */
        "shufps $0x1b, %%xmm6, %%xmm6 \n\t"
        BUTTERFLY(xmm1, xmm6, %%xmm2, xmm3)
        "movaps %%xmm1, 48(%1) \n\t"

        "shufps $0x1b, %%xmm4, %%xmm4 \n\t"
        BUTTERFLY(xmm7, xmm4, %%xmm2, xmm3)

        /* pass 4: in-register butterflies; xmm3 = sign mask, xmm2 =
         * coefficients at 112(%2), xmm1 = scratch */
        "movaps (%3), %%xmm3 \n\t"
        "movaps 112(%2), %%xmm2 \n\t"

        BUTTERFLY2(xmm5, xmm3, xmm2, xmm1)

        BUTTERFLY2(xmm0, xmm3, xmm2, xmm1)
        "movaps %%xmm0, 16(%1) \n\t"

        BUTTERFLY2(xmm6, xmm3, xmm2, xmm1)
        "movaps %%xmm6, 32(%1) \n\t"

        "movaps 48(%1), %%xmm0 \n\t"
        BUTTERFLY2(xmm0, xmm3, xmm2, xmm1)
        "movaps %%xmm0, 48(%1) \n\t"

        BUTTERFLY2(xmm4, xmm3, xmm2, xmm1)

        BUTTERFLY2(xmm7, xmm3, xmm2, xmm1)

        "movaps 96(%1), %%xmm6 \n\t"
        BUTTERFLY2(xmm6, xmm3, xmm2, xmm1)

        "movaps 112(%1), %%xmm0 \n\t"
        BUTTERFLY2(xmm0, xmm3, xmm2, xmm1)

        /* pass 5: adjacent-lane butterflies; the $0xCC self-shuffle
         * rearranges the sign mask to the alternating pattern BUTTERFLY3
         * needs; coefficients at 128(%2) */
        "movaps 128(%2), %%xmm2 \n\t"
        "shufps $0xCC, %%xmm3,%%xmm3 \n\t"

        BUTTERFLY3(xmm5, xmm3, xmm2, xmm1)
        "movaps %%xmm5, (%1) \n\t"

        "movaps 16(%1), %%xmm1 \n\t"
        BUTTERFLY3(xmm1, xmm3, xmm2, xmm5)
        "movaps %%xmm1, 16(%1) \n\t"

        BUTTERFLY3(xmm4, xmm3, xmm2, xmm5)
        "movaps %%xmm4, 64(%1) \n\t"

        BUTTERFLY3(xmm7, xmm3, xmm2, xmm5)
        "movaps %%xmm7, 80(%1) \n\t"

        "movaps 32(%1), %%xmm5 \n\t"
        BUTTERFLY3(xmm5, xmm3, xmm2, xmm7)
        "movaps %%xmm5, 32(%1) \n\t"

        "movaps 48(%1), %%xmm4 \n\t"
        BUTTERFLY3(xmm4, xmm3, xmm2, xmm7)
        "movaps %%xmm4, 48(%1) \n\t"

        BUTTERFLY3(xmm6, xmm3, xmm2, xmm7)
        "movaps %%xmm6, 96(%1) \n\t"

        BUTTERFLY3(xmm0, xmm3, xmm2, xmm7)
        "movaps %%xmm0, 112(%1) \n\t"

        /* pass 6, no SIMD...: scalar addss recombination of out[] slots
         * into their final positions; %0 (movl) moves the slots that are
         * copied unchanged.  Hand-scheduled -- the load/store order is
         * significant, as several slots are read before being
         * overwritten with their final value. */
        "movss 56(%1), %%xmm3 \n\t"
        "movl 4(%1), %0 \n\t"
        "addss 60(%1), %%xmm3 \n\t"
        "movss 72(%1), %%xmm7 \n\t"
        "addss %%xmm3, %%xmm4 \n\t"
        "movss 52(%1), %%xmm2 \n\t"
        "addss %%xmm3, %%xmm2 \n\t"
        "movss 24(%1), %%xmm3 \n\t"
        "addss 28(%1), %%xmm3 \n\t"
        "addss 76(%1), %%xmm7 \n\t"
        "addss %%xmm3, %%xmm1 \n\t"
        "addss %%xmm4, %%xmm5 \n\t"
        "movss %%xmm1, 16(%1) \n\t"
        "movss 20(%1), %%xmm1 \n\t"
        "addss %%xmm3, %%xmm1 \n\t"
        "movss 40(%1), %%xmm3 \n\t"
        "movss %%xmm1, 48(%1) \n\t"
        "addss 44(%1), %%xmm3 \n\t"
        "movss 20(%1), %%xmm1 \n\t"
        "addss %%xmm3, %%xmm4 \n\t"
        "addss %%xmm2, %%xmm3 \n\t"
        "addss 28(%1), %%xmm1 \n\t"
        "movss %%xmm3, 40(%1) \n\t"
        "addss 36(%1), %%xmm2 \n\t"
        "movss 8(%1), %%xmm3 \n\t"
        "movss %%xmm2, 56(%1) \n\t"
        "addss 12(%1), %%xmm3 \n\t"
        "movss %%xmm5, 8(%1) \n\t"
        "movss %%xmm3, 32(%1) \n\t"
        "movss 52(%1), %%xmm2 \n\t"
        "movss 80(%1), %%xmm3 \n\t"
        "movss 120(%1), %%xmm5 \n\t"
        "movss %%xmm1, 80(%1) \n\t"
        "movss %%xmm4, 24(%1) \n\t"
        "addss 124(%1), %%xmm5 \n\t"
        "movss 64(%1), %%xmm1 \n\t"
        "addss 60(%1), %%xmm2 \n\t"
        "addss %%xmm5, %%xmm0 \n\t"
        "addss 116(%1), %%xmm5 \n\t"
        "movl %0, 64(%1) \n\t"               /* out[16] = old out[1] */
        "addss %%xmm0, %%xmm6 \n\t"
        "addss %%xmm6, %%xmm1 \n\t"
        "movl 12(%1), %0 \n\t"
        "movss %%xmm1, 4(%1) \n\t"
        "movss 88(%1), %%xmm1 \n\t"
        "movl %0, 96(%1) \n\t"               /* out[24] = old out[3] */
        "addss 92(%1), %%xmm1 \n\t"
        "movss 104(%1), %%xmm4 \n\t"
        "movl 28(%1), %0 \n\t"
        "addss 108(%1), %%xmm4 \n\t"
        "addss %%xmm4, %%xmm0 \n\t"
        "addss %%xmm1, %%xmm3 \n\t"
        "addss 84(%1), %%xmm1 \n\t"
        "addss %%xmm5, %%xmm4 \n\t"
        "addss %%xmm3, %%xmm6 \n\t"
        "addss %%xmm0, %%xmm3 \n\t"
        "addss %%xmm7, %%xmm0 \n\t"
        "addss 100(%1), %%xmm5 \n\t"
        "addss %%xmm4, %%xmm7 \n\t"
        "movl %0, 112(%1) \n\t"              /* out[28] = old out[7] */
        "movss %%xmm0, 28(%1) \n\t"
        "movss 36(%1), %%xmm0 \n\t"
        "movss %%xmm7, 36(%1) \n\t"
        "addss %%xmm1, %%xmm4 \n\t"
        "movss 116(%1), %%xmm7 \n\t"
        "addss %%xmm2, %%xmm0 \n\t"
        "addss 124(%1), %%xmm7 \n\t"
        "movss %%xmm0, 72(%1) \n\t"
        "movss 44(%1), %%xmm0 \n\t"
        "movss %%xmm6, 12(%1) \n\t"
        "movss %%xmm3, 20(%1) \n\t"
        "addss %%xmm0, %%xmm2 \n\t"
        "movss %%xmm4, 44(%1) \n\t"
        "movss %%xmm2, 88(%1) \n\t"
        "addss 60(%1), %%xmm0 \n\t"
        "movl 60(%1), %0 \n\t"
        "movl %0, 120(%1) \n\t"              /* out[30] = old out[15] */
        "movss %%xmm0, 104(%1) \n\t"
        "addss %%xmm5, %%xmm1 \n\t"
        "addss 68(%1), %%xmm5 \n\t"
        "movss %%xmm1, 52(%1) \n\t"
        "movss %%xmm5, 60(%1) \n\t"
        "movss 68(%1), %%xmm1 \n\t"
        "movss 100(%1), %%xmm5 \n\t"
        "addss %%xmm7, %%xmm5 \n\t"
        "addss 108(%1), %%xmm7 \n\t"
        "addss %%xmm5, %%xmm1 \n\t"
        "movss 84(%1), %%xmm2 \n\t"
        "addss 92(%1), %%xmm2 \n\t"
        "addss %%xmm2, %%xmm5 \n\t"
        "movss %%xmm1, 68(%1) \n\t"
        "addss %%xmm7, %%xmm2 \n\t"
        "movss 76(%1), %%xmm1 \n\t"
        "movss %%xmm2, 84(%1) \n\t"
        "movss %%xmm5, 76(%1) \n\t"
        "movss 108(%1), %%xmm2 \n\t"
        "addss %%xmm1, %%xmm7 \n\t"
        "addss 124(%1), %%xmm2 \n\t"
        "addss %%xmm2, %%xmm1 \n\t"
        "addss 92(%1), %%xmm2 \n\t"
        "movss %%xmm1, 100(%1) \n\t"
        "movss %%xmm2, 108(%1) \n\t"
        "movss 92(%1), %%xmm2 \n\t"
        "movss %%xmm7, 92(%1) \n\t"
        "addss 124(%1), %%xmm2 \n\t"
        "movss %%xmm2, 116(%1) \n\t"
        :"+&r"(tmp1)
        :"r"(out), "r"(b1), "r"(smask), "r"(in)
        :"memory"
        );
}
Also available in: Unified diff