Revision 5d0ddd1a libavcodec/i386/fft_3dn.c
libavcodec/i386/fft_3dn.c  

1  1 
/* 
2  2 
* FFT/MDCT transform with 3DNow! optimizations 
3 
* Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt 

4 
* Based on fft_sse.c copyright (c) 2002 Fabrice Bellard. 

3 
* Copyright (c) 2008 Loren Merritt 

5  4 
* 
6  5 
* This file is part of FFmpeg. 
7  6 
* 
...  ...  
20  19 
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 
21  20 
*/ 
22  21  
23 
#include "libavutil/x86_cpu.h" 

24 
#include "libavcodec/dsputil.h" 

25  
26 
/* 64-bit XOR mask with only the sign bit of the upper 32-bit lane set.
 * ff_fft_calc_3dn() loads it into mm7 when the inverse flag is clear and
 * applies it with pxor to flip the sign of one float of a packed pair.
 * The bit is formed with an unsigned shift because shifting 1 into the sign
 * bit of a signed int is undefined behaviour in C. */
static const int p1m1[2] __attribute__((aligned(8))) =
    { 0, (int)(1U << 31) };

28  
29 
/* 64-bit XOR mask with only the sign bit of the lower 32-bit lane set.
 * ff_fft_calc_3dn() loads it into mm7 when the inverse flag is set and
 * applies it with pxor to flip the sign of one float of a packed pair.
 * The bit is formed with an unsigned shift because shifting 1 into the sign
 * bit of a signed int is undefined behaviour in C. */
static const int m1p1[2] __attribute__((aligned(8))) =
    { (int)(1U << 31), 0 };

31  
32 
/**
 * In-place FFT on z using 3DNow! instructions.
 *
 * The first asm loop combines the first two radix-2 passes, working on
 * 32 bytes of z per iteration. The remaining passes are butterfly loops
 * that multiply by the precomputed table s->exptab1.
 *
 * @param s  FFT context; this routine reads s->nbits, s->inverse and
 *           s->exptab1.
 * @param z  data buffer, transformed in place; 8 << s->nbits bytes are
 *           processed (presumably 1 << nbits FFTComplex entries of two
 *           floats each -- confirm against the FFTComplex definition).
 *
 * NOTE(review): the code computes 1 << (ln-3), so it looks like nbits >= 3
 * is expected -- confirm against callers.
 */
void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z)
{
    int ln = s->nbits;
    long j;
    x86_reg i;
    long nblocks, nloops;
    FFTComplex *p, *cptr;

    asm volatile(
        /* FEMMS is not a must here but recommended by AMD */
        "femms                      \n\t"
        /* mm7 = sign mask, kept live for the first-pass loop below */
        "movq %0, %%mm7             \n\t"
        ::"m"(*(s->inverse ? m1p1 : p1m1))
    );

    /* i is a byte offset into z, counted down 32 bytes at a time; the flags
     * set by "sub" survive the MMX/3DNow! instructions and drive "jg". */
    i = 8 << ln;
    asm volatile(
        "1:                         \n\t"
        "sub $32, %0                \n\t"
        "movq    (%0,%1), %%mm0     \n\t"
        "movq  16(%0,%1), %%mm1     \n\t"
        "movq   8(%0,%1), %%mm2     \n\t"
        "movq  24(%0,%1), %%mm3     \n\t"
        "movq      %%mm0, %%mm4     \n\t"
        "movq      %%mm1, %%mm5     \n\t"
        "pfadd     %%mm2, %%mm0     \n\t"
        "pfadd     %%mm3, %%mm1     \n\t"
        "pfsub     %%mm2, %%mm4     \n\t"
        "pfsub     %%mm3, %%mm5     \n\t"
        "movq      %%mm0, %%mm2     \n\t"
        /* the unpack pair swaps the two 32-bit halves of mm5 */
        "punpckldq %%mm5, %%mm6     \n\t"
        "punpckhdq %%mm6, %%mm5     \n\t"
        "movq      %%mm4, %%mm3     \n\t"
        /* flip the sign of the half selected by the mask in mm7 */
        "pxor      %%mm7, %%mm5     \n\t"
        "pfadd     %%mm1, %%mm0     \n\t"
        "pfadd     %%mm5, %%mm4     \n\t"
        "pfsub     %%mm1, %%mm2     \n\t"
        "pfsub     %%mm5, %%mm3     \n\t"
        "movq      %%mm0,   (%0,%1) \n\t"
        "movq      %%mm4,  8(%0,%1) \n\t"
        "movq      %%mm2, 16(%0,%1) \n\t"
        "movq      %%mm3, 24(%0,%1) \n\t"
        "jg 1b                      \n\t"
        :"+r"(i)
        :"r"(z)
        /* the loop stores into z, which is not listed as an output */
        :"memory"
    );
    /* pass 2 .. ln-1 */

    nblocks = 1 << (ln-3);
    nloops = 1 << 2;
    cptr = s->exptab1;
    do {
        p = z;
        j = nblocks;
        do {
            /* byte offset shared by p and p + nloops; the table operand is
             * addressed at twice this offset (scale factor 2) */
            i = nloops*8;
            asm volatile(
                "1:                         \n\t"
                "sub $16, %0                \n\t"
                "movq    (%1,%0), %%mm0     \n\t"
                "movq   8(%1,%0), %%mm1     \n\t"
                "movq    (%2,%0), %%mm2     \n\t"
                "movq   8(%2,%0), %%mm3     \n\t"
                "movq      %%mm2, %%mm4     \n\t"
                "movq      %%mm3, %%mm5     \n\t"
                "punpckldq %%mm2, %%mm2     \n\t"
                "punpckldq %%mm3, %%mm3     \n\t"
                "punpckhdq %%mm4, %%mm4     \n\t"
                "punpckhdq %%mm5, %%mm5     \n\t"
                "pfmul   (%3,%0,2), %%mm2   \n\t" // cre*re cim*re
                "pfmul  8(%3,%0,2), %%mm3   \n\t"
                "pfmul 16(%3,%0,2), %%mm4   \n\t" // -cim*im cre*im
                "pfmul 24(%3,%0,2), %%mm5   \n\t"
                "pfadd     %%mm2, %%mm4     \n\t" // cre*re-cim*im cim*re+cre*im
                "pfadd     %%mm3, %%mm5     \n\t"
                "movq      %%mm0, %%mm2     \n\t"
                "movq      %%mm1, %%mm3     \n\t"
                "pfadd     %%mm4, %%mm0     \n\t"
                "pfadd     %%mm5, %%mm1     \n\t"
                "pfsub     %%mm4, %%mm2     \n\t"
                "pfsub     %%mm5, %%mm3     \n\t"
                "movq      %%mm0,  (%1,%0)  \n\t"
                "movq      %%mm1, 8(%1,%0)  \n\t"
                "movq      %%mm2,  (%2,%0)  \n\t"
                "movq      %%mm3, 8(%2,%0)  \n\t"
                "jg 1b                      \n\t"
                :"+r"(i)
                :"r"(p), "r"(p + nloops), "r"(cptr)
                /* the loop stores through p and p + nloops */
                :"memory"
            );
            p += nloops*2;
        } while (--j);
        cptr += nloops*2;
        nblocks >>= 1;
        nloops <<= 1;
    } while (nblocks != 0);
    asm volatile("femms");
}

22 
#define EMULATE_3DNOWEXT 

23 
#include "fft_3dn2.c" 
Also available in: Unified diff