## ffmpeg / libavcodec / i386 / fft_3dn2.c @ be449fca

History | View | Annotate | Download (5.1 KB)

1 | 82eb4b0f | Zuxy Meng | ```
/*
``` |
---|---|---|---|

2 | ```
* FFT/MDCT transform with Extended 3DNow! optimizations
``` |
||

3 | 46803f4f | Loren Merritt | ```
* Copyright (c) 2006-2008 Zuxy MENG Jie, Loren Merritt
``` |

4 | 82eb4b0f | Zuxy Meng | ```
*
``` |

5 | b78e7197 | Diego Biurrun | ```
* This file is part of FFmpeg.
``` |

6 | ```
*
``` |
||

7 | ```
* FFmpeg is free software; you can redistribute it and/or
``` |
||

8 | 82eb4b0f | Zuxy Meng | ```
* modify it under the terms of the GNU Lesser General Public
``` |

9 | ```
* License as published by the Free Software Foundation; either
``` |
||

10 | b78e7197 | Diego Biurrun | ```
* version 2.1 of the License, or (at your option) any later version.
``` |

11 | 82eb4b0f | Zuxy Meng | ```
*
``` |

12 | b78e7197 | Diego Biurrun | ```
* FFmpeg is distributed in the hope that it will be useful,
``` |

13 | 82eb4b0f | Zuxy Meng | ```
* but WITHOUT ANY WARRANTY; without even the implied warranty of
``` |

14 | ```
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
``` |
||

15 | ```
* Lesser General Public License for more details.
``` |
||

16 | ```
*
``` |
||

17 | ```
* You should have received a copy of the GNU Lesser General Public
``` |
||

18 | b78e7197 | Diego Biurrun | ```
* License along with FFmpeg; if not, write to the Free Software
``` |

19 | 82eb4b0f | Zuxy Meng | ```
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
``` |

20 | ```
*/
``` |
||

21 | 245976da | Diego Biurrun | |

22 | #include "libavutil/x86_cpu.h" |
||

23 | #include "libavcodec/dsputil.h" |
||

24 | 82eb4b0f | Zuxy Meng | |

25 | 46803f4f | Loren Merritt | DECLARE_ALIGNED_8(static const int, m1m1[2]) = { 1<<31, 1<<31 }; |

26 | |||

/*
 * PSWAPD(s,d) expands to the asm text that places the two 32-bit halves of
 * s, exchanged, into MMX register d.
 */
#ifndef EMULATE_3DNOWEXT
/* Extended 3DNow! build: a single pswapd instruction does the job. */
#define PSWAPD(s,d) "pswapd "#s","#d"\n"
#else
/*
 * Plain 3DNow! build: emulate pswapd with three MMX instructions
 * (copy s to d, shift the upper dword of d down, then unpack the lower
 * dword of s into the upper half of d).
 */
#define PSWAPD(s,d)\
    "movq "#s","#d"\n"\
    "psrlq $32,"#d"\n"\
    "punpckldq "#s","#d"\n"
/* In this configuration every *_3dn2 symbol of this file is renamed to its
 * *_3dn counterpart. */
#define ff_fft_calc_3dn2 ff_fft_calc_3dn
#define ff_fft_dispatch_3dn2 ff_fft_dispatch_3dn
#define ff_fft_dispatch_interleave_3dn2 ff_fft_dispatch_interleave_3dn
#define ff_imdct_calc_3dn2 ff_imdct_calc_3dn
#define ff_imdct_half_3dn2 ff_imdct_half_3dn
#endif

40 | 82eb4b0f | Zuxy Meng | |

41 | 5d0ddd1a | Loren Merritt | void ff_fft_dispatch_3dn2(FFTComplex *z, int nbits); |

42 | 75ac2875 | Loren Merritt | void ff_fft_dispatch_interleave_3dn2(FFTComplex *z, int nbits); |

43 | 82eb4b0f | Zuxy Meng | |

44 | ```
void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
``` |
||

45 | { |
||

46 | 5d0ddd1a | Loren Merritt | int n = 1<<s->nbits; |

47 | ```
int i;
``` |
||

48 | ff_fft_dispatch_interleave_3dn2(z, s->nbits); |
||

49 | be449fca | Diego Pettenò | __asm__ volatile("femms"); |

50 | 5d0ddd1a | Loren Merritt | if(n <= 8) |

51 | for(i=0; i<n; i+=2) |
||

52 | ```
FFSWAP(FFTSample, z[i].im, z[i+1].re);
``` |
||

53 | 82eb4b0f | Zuxy Meng | } |

54 | |||

55 | 46803f4f | Loren Merritt | void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input) |

56 | bcfa3e58 | Loren Merritt | { |

57 | 46803f4f | Loren Merritt | x86_reg j, k; |

58 | long n = 1 << s->nbits; |
||

59 | long n2 = n >> 1; |
||

60 | long n4 = n >> 2; |
||

61 | long n8 = n >> 3; |
||

62 | bcfa3e58 | Loren Merritt | ```
const uint16_t *revtab = s->fft.revtab;
``` |

63 | ```
const FFTSample *tcos = s->tcos;
``` |
||

64 | ```
const FFTSample *tsin = s->tsin;
``` |
||

65 | ```
const FFTSample *in1, *in2;
``` |
||

66 | 46803f4f | Loren Merritt | FFTComplex *z = (FFTComplex *)output; |

67 | bcfa3e58 | Loren Merritt | |

68 | ```
/* pre rotation */
``` |
||

69 | in1 = input; |
||

70 | ```
in2 = input + n2 - 1;
``` |
||

71 | 46803f4f | Loren Merritt | ```
#ifdef EMULATE_3DNOWEXT
``` |

72 | be449fca | Diego Pettenò | __asm__ volatile("movd %0, %%mm7" ::"r"(1<<31)); |

73 | 46803f4f | Loren Merritt | ```
#endif
``` |

74 | bcfa3e58 | Loren Merritt | for(k = 0; k < n4; k++) { |

75 | 2494bdd9 | Loren Merritt | ```
// FIXME a single block is faster, but gcc 2.95 and 3.4.x on 32bit can't compile it
``` |

76 | be449fca | Diego Pettenò | ```
__asm__ volatile(
``` |

77 | 46803f4f | Loren Merritt | ```
"movd %0, %%mm0 \n"
``` |

78 | ```
"movd %2, %%mm1 \n"
``` |
||

79 | ```
"punpckldq %1, %%mm0 \n"
``` |
||

80 | ```
"punpckldq %3, %%mm1 \n"
``` |
||

81 | ```
"movq %%mm0, %%mm2 \n"
``` |
||

82 | PSWAPD( %%mm1, %%mm3 ) |
||

83 | ```
"pfmul %%mm1, %%mm0 \n"
``` |
||

84 | ```
"pfmul %%mm3, %%mm2 \n"
``` |
||

85 | ```
#ifdef EMULATE_3DNOWEXT
``` |
||

86 | ```
"movq %%mm0, %%mm1 \n"
``` |
||

87 | ```
"punpckhdq %%mm2, %%mm0 \n"
``` |
||

88 | ```
"punpckldq %%mm2, %%mm1 \n"
``` |
||

89 | ```
"pxor %%mm7, %%mm0 \n"
``` |
||

90 | ```
"pfadd %%mm1, %%mm0 \n"
``` |
||

91 | ```
#else
``` |
||

92 | ```
"pfpnacc %%mm2, %%mm0 \n"
``` |
||

93 | ```
#endif
``` |
||

94 | 2494bdd9 | Loren Merritt | ::"m"(in2[-2*k]), "m"(in1[2*k]), |

95 | "m"(tcos[k]), "m"(tsin[k]) |
||

96 | ); |
||

97 | be449fca | Diego Pettenò | ```
__asm__ volatile(
``` |

98 | bcfa3e58 | Loren Merritt | ```
"movq %%mm0, %0 \n\t"
``` |

99 | ```
:"=m"(z[revtab[k]])
``` |
||

100 | ); |
||

101 | } |
||

102 | |||

103 | 46803f4f | Loren Merritt | ff_fft_dispatch_3dn2(z, s->fft.nbits); |

104 | |||

105 | ```
#define CMUL(j,mm0,mm1)\
``` |
||

106 | "movq (%2,"#j",2), %%mm6 \n"\ |
||

107 | "movq 8(%2,"#j",2), "#mm0"\n"\ |
||

108 | "movq %%mm6, "#mm1"\n"\ |
||

109 | "movq "#mm0",%%mm7 \n"\ |
||

110 | "pfmul (%3,"#j"), %%mm6 \n"\ |
||

111 | "pfmul (%4,"#j"), "#mm0"\n"\ |
||

112 | "pfmul (%4,"#j"), "#mm1"\n"\ |
||

113 | "pfmul (%3,"#j"), %%mm7 \n"\ |
||

114 | "pfsub %%mm6, "#mm0"\n"\ |
||

115 | "pfadd %%mm7, "#mm1"\n" |
||

116 | |||

117 | ```
/* post rotation */
``` |
||

118 | j = -n2; |
||

119 | ```
k = n2-8;
``` |
||

120 | be449fca | Diego Pettenò | ```
__asm__ volatile(
``` |

121 | 46803f4f | Loren Merritt | ```
"1: \n"
``` |

122 | ```
CMUL(%0, %%mm0, %%mm1)
``` |
||

123 | ```
CMUL(%1, %%mm2, %%mm3)
``` |
||

124 | ```
"movd %%mm0, (%2,%0,2) \n"
``` |
||

125 | ```
"movd %%mm1,12(%2,%1,2) \n"
``` |
||

126 | ```
"movd %%mm2, (%2,%1,2) \n"
``` |
||

127 | ```
"movd %%mm3,12(%2,%0,2) \n"
``` |
||

128 | ```
"psrlq $32, %%mm0 \n"
``` |
||

129 | ```
"psrlq $32, %%mm1 \n"
``` |
||

130 | ```
"psrlq $32, %%mm2 \n"
``` |
||

131 | ```
"psrlq $32, %%mm3 \n"
``` |
||

132 | ```
"movd %%mm0, 8(%2,%0,2) \n"
``` |
||

133 | ```
"movd %%mm1, 4(%2,%1,2) \n"
``` |
||

134 | ```
"movd %%mm2, 8(%2,%1,2) \n"
``` |
||

135 | ```
"movd %%mm3, 4(%2,%0,2) \n"
``` |
||

136 | ```
"sub $8, %1 \n"
``` |
||

137 | ```
"add $8, %0 \n"
``` |
||

138 | ```
"jl 1b \n"
``` |
||

139 | :"+r"(j), "+r"(k) |
||

140 | :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8) |
||

141 | f469094c | Loren Merritt | ```
:"memory"
``` |

142 | ); |
||

143 | be449fca | Diego Pettenò | __asm__ volatile("femms"); |

144 | bcfa3e58 | Loren Merritt | } |

145 | 1e4ecf26 | Loren Merritt | |

146 | 0a570e82 | Loren Merritt | void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input) |

147 | b9fa3208 | Loren Merritt | { |

148 | x86_reg j, k; |
||

149 | 46803f4f | Loren Merritt | long n = 1 << s->nbits; |

150 | long n4 = n >> 2; |
||

151 | b9fa3208 | Loren Merritt | |

152 | 46803f4f | Loren Merritt | ff_imdct_half_3dn2(s, output+n4, input); |

153 | b9fa3208 | Loren Merritt | |

154 | j = -n; |
||

155 | ```
k = n-8;
``` |
||

156 | be449fca | Diego Pettenò | ```
__asm__ volatile(
``` |

157 | 46803f4f | Loren Merritt | ```
"movq %4, %%mm7 \n"
``` |

158 | ```
"1: \n"
``` |
||

159 | PSWAPD((%2,%1), %%mm0) |
||

160 | PSWAPD((%3,%0), %%mm1) |
||

161 | ```
"pxor %%mm7, %%mm0 \n"
``` |
||

162 | ```
"movq %%mm1, (%3,%1) \n"
``` |
||

163 | ```
"movq %%mm0, (%2,%0) \n"
``` |
||

164 | ```
"sub $8, %1 \n"
``` |
||

165 | ```
"add $8, %0 \n"
``` |
||

166 | ```
"jl 1b \n"
``` |
||

167 | b9fa3208 | Loren Merritt | :"+r"(j), "+r"(k) |

168 | 46803f4f | Loren Merritt | :"r"(output+n4), "r"(output+n4*3), |

169 | ```
"m"(*m1m1)
``` |
||

170 | b9fa3208 | Loren Merritt | ); |

171 | be449fca | Diego Pettenò | __asm__ volatile("femms"); |

172 | b9fa3208 | Loren Merritt | } |