## ffmpeg / libavcodec / i386 / fft_3dn2.c @ 40d0e665

History | View | Annotate | Download (6.95 KB)

1 | 82eb4b0f | Zuxy Meng | ```
/*
``` |
---|---|---|---|

2 | ```
* FFT/MDCT transform with Extended 3DNow! optimizations
``` |
||

3 | bcfa3e58 | Loren Merritt | ```
* Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt
``` |

4 | 82eb4b0f | Zuxy Meng | ```
* Based on fft_sse.c copyright (c) 2002 Fabrice Bellard.
``` |

5 | ```
*
``` |
||

6 | b78e7197 | Diego Biurrun | ```
* This file is part of FFmpeg.
``` |

7 | ```
*
``` |
||

8 | ```
* FFmpeg is free software; you can redistribute it and/or
``` |
||

9 | 82eb4b0f | Zuxy Meng | ```
* modify it under the terms of the GNU Lesser General Public
``` |

10 | ```
* License as published by the Free Software Foundation; either
``` |
||

11 | b78e7197 | Diego Biurrun | ```
* version 2.1 of the License, or (at your option) any later version.
``` |

12 | 82eb4b0f | Zuxy Meng | ```
*
``` |

13 | b78e7197 | Diego Biurrun | ```
* FFmpeg is distributed in the hope that it will be useful,
``` |

14 | 82eb4b0f | Zuxy Meng | ```
* but WITHOUT ANY WARRANTY; without even the implied warranty of
``` |

15 | ```
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
``` |
||

16 | ```
* Lesser General Public License for more details.
``` |
||

17 | ```
*
``` |
||

18 | ```
* You should have received a copy of the GNU Lesser General Public
``` |
||

19 | b78e7197 | Diego Biurrun | ```
* License along with FFmpeg; if not, write to the Free Software
``` |

20 | 82eb4b0f | Zuxy Meng | ```
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
``` |

21 | ```
*/
``` |
||

22 | b550bfaa | Ronald S. Bultje | #include "dsputil.h" |

23 | 40d0e665 | Ramiro Polla | #include "x86_cpu.h" |

24 | 82eb4b0f | Zuxy Meng | |

/*
 * 64-bit sign-flip masks, loaded into an MMX register and XORed (pxor)
 * against a pair of packed 32-bit floats in ff_fft_calc_3dn2().
 * Bit 31 of a lane is that lane's sign bit, so a set lane negates the float.
 *
 *   p1m1: low lane untouched, high lane negated (used when s->inverse is 0)
 *   m1p1: low lane negated, high lane untouched (used when s->inverse is set)
 *
 * The element type is unsigned and the constant is written 1U << 31:
 * shifting a signed 1 into the sign bit of an int is undefined behaviour.
 * The 8-byte alignment keeps the single movq load naturally aligned.
 */
static const unsigned int p1m1[2] __attribute__((aligned(8))) =
    { 0, 1U << 31 };

static const unsigned int m1p1[2] __attribute__((aligned(8))) =
    { 1U << 31, 0 };

30 | |||

31 | ```
void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
``` |
||

32 | { |
||

33 | ```
int ln = s->nbits;
``` |
||

34 | 40d0e665 | Ramiro Polla | ```
long j;
``` |

35 | x86_reg i; |
||

36 | 1e4ecf26 | Loren Merritt | ```
long nblocks, nloops;
``` |

37 | FFTComplex *p, *cptr; |
||

38 | |||

39 | asm volatile( |
||

40 | ```
/* FEMMS is not a must here but recommended by AMD */
``` |
||

41 | ```
"femms \n\t"
``` |
||

42 | ```
"movq %0, %%mm7 \n\t"
``` |
||

43 | ```
::"m"(*(s->inverse ? m1p1 : p1m1))
``` |
||

44 | ); |
||

45 | |||

46 | ```
i = 8 << ln;
``` |
||

47 | asm volatile( |
||

48 | ```
"1: \n\t"
``` |
||

49 | ```
"sub $32, %0 \n\t"
``` |
||

50 | ```
"movq (%0,%1), %%mm0 \n\t"
``` |
||

51 | ```
"movq 16(%0,%1), %%mm1 \n\t"
``` |
||

52 | ```
"movq 8(%0,%1), %%mm2 \n\t"
``` |
||

53 | ```
"movq 24(%0,%1), %%mm3 \n\t"
``` |
||

54 | ```
"movq %%mm0, %%mm4 \n\t"
``` |
||

55 | ```
"movq %%mm1, %%mm5 \n\t"
``` |
||

56 | ```
"pfadd %%mm2, %%mm0 \n\t"
``` |
||

57 | ```
"pfadd %%mm3, %%mm1 \n\t"
``` |
||

58 | ```
"pfsub %%mm2, %%mm4 \n\t"
``` |
||

59 | ```
"pfsub %%mm3, %%mm5 \n\t"
``` |
||

60 | ```
"movq %%mm0, %%mm2 \n\t"
``` |
||

61 | ```
"pswapd %%mm5, %%mm5 \n\t"
``` |
||

62 | ```
"movq %%mm4, %%mm3 \n\t"
``` |
||

63 | ```
"pxor %%mm7, %%mm5 \n\t"
``` |
||

64 | ```
"pfadd %%mm1, %%mm0 \n\t"
``` |
||

65 | ```
"pfadd %%mm5, %%mm4 \n\t"
``` |
||

66 | ```
"pfsub %%mm1, %%mm2 \n\t"
``` |
||

67 | ```
"pfsub %%mm5, %%mm3 \n\t"
``` |
||

68 | ```
"movq %%mm0, (%0,%1) \n\t"
``` |
||

69 | ```
"movq %%mm4, 8(%0,%1) \n\t"
``` |
||

70 | ```
"movq %%mm2, 16(%0,%1) \n\t"
``` |
||

71 | ```
"movq %%mm3, 24(%0,%1) \n\t"
``` |
||

72 | ```
"jg 1b \n\t"
``` |
||

73 | ```
:"+r"(i)
``` |
||

74 | ```
:"r"(z)
``` |
||

75 | ); |
||

76 | 82eb4b0f | Zuxy Meng | ```
/* pass 2 .. ln-1 */
``` |

77 | |||

78 | 1e4ecf26 | Loren Merritt | nblocks = 1 << (ln-3); |

79 | 82eb4b0f | Zuxy Meng | nloops = 1 << 2; |

80 | 1e4ecf26 | Loren Merritt | cptr = s->exptab1; |

81 | 82eb4b0f | Zuxy Meng | ```
do {
``` |

82 | p = z; |
||

83 | j = nblocks; |
||

84 | ```
do {
``` |
||

85 | 1e4ecf26 | Loren Merritt | ```
i = nloops*8;
``` |

86 | asm volatile( |
||

87 | ```
"1: \n\t"
``` |
||

88 | ```
"sub $16, %0 \n\t"
``` |
||

89 | ```
"movq (%1,%0), %%mm0 \n\t"
``` |
||

90 | ```
"movq 8(%1,%0), %%mm1 \n\t"
``` |
||

91 | ```
"movq (%2,%0), %%mm2 \n\t"
``` |
||

92 | ```
"movq 8(%2,%0), %%mm3 \n\t"
``` |
||

93 | ```
"movq (%3,%0,2), %%mm4 \n\t"
``` |
||

94 | ```
"movq 8(%3,%0,2), %%mm5 \n\t"
``` |
||

95 | "pswapd %%mm4, %%mm6 \n\t" // no need for cptr[2] & cptr[3] |
||

96 | ```
"pswapd %%mm5, %%mm7 \n\t"
``` |
||

97 | "pfmul %%mm2, %%mm4 \n\t" // cre*re cim*im |
||

98 | ```
"pfmul %%mm3, %%mm5 \n\t"
``` |
||

99 | "pfmul %%mm2, %%mm6 \n\t" // cim*re cre*im |
||

100 | ```
"pfmul %%mm3, %%mm7 \n\t"
``` |
||

101 | "pfpnacc %%mm6, %%mm4 \n\t" // cre*re-cim*im cim*re+cre*im |
||

102 | ```
"pfpnacc %%mm7, %%mm5 \n\t"
``` |
||

103 | ```
"movq %%mm0, %%mm2 \n\t"
``` |
||

104 | ```
"movq %%mm1, %%mm3 \n\t"
``` |
||

105 | ```
"pfadd %%mm4, %%mm0 \n\t"
``` |
||

106 | ```
"pfadd %%mm5, %%mm1 \n\t"
``` |
||

107 | ```
"pfsub %%mm4, %%mm2 \n\t"
``` |
||

108 | ```
"pfsub %%mm5, %%mm3 \n\t"
``` |
||

109 | ```
"movq %%mm0, (%1,%0) \n\t"
``` |
||

110 | ```
"movq %%mm1, 8(%1,%0) \n\t"
``` |
||

111 | ```
"movq %%mm2, (%2,%0) \n\t"
``` |
||

112 | ```
"movq %%mm3, 8(%2,%0) \n\t"
``` |
||

113 | ```
"jg 1b \n\t"
``` |
||

114 | ```
:"+r"(i)
``` |
||

115 | :"r"(p), "r"(p + nloops), "r"(cptr) |
||

116 | ); |
||

117 | ```
p += nloops*2;
``` |
||

118 | 82eb4b0f | Zuxy Meng | ```
} while (--j);
``` |

119 | 1e4ecf26 | Loren Merritt | ```
cptr += nloops*2;
``` |

120 | ```
nblocks >>= 1;
``` |
||

121 | ```
nloops <<= 1;
``` |
||

122 | 82eb4b0f | Zuxy Meng | } while (nblocks != 0); |

123 | 1e4ecf26 | Loren Merritt | asm volatile("femms"); |

124 | 82eb4b0f | Zuxy Meng | } |

125 | |||

126 | bcfa3e58 | Loren Merritt | ```
void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
``` |

127 | ```
const FFTSample *input, FFTSample *tmp)
``` |
||

128 | { |
||

129 | 40d0e665 | Ramiro Polla | ```
long n8, n4, n2, n;
``` |

130 | x86_reg k; |
||

131 | bcfa3e58 | Loren Merritt | ```
const uint16_t *revtab = s->fft.revtab;
``` |

132 | ```
const FFTSample *tcos = s->tcos;
``` |
||

133 | ```
const FFTSample *tsin = s->tsin;
``` |
||

134 | ```
const FFTSample *in1, *in2;
``` |
||

135 | FFTComplex *z = (FFTComplex *)tmp; |
||

136 | |||

137 | ```
n = 1 << s->nbits;
``` |
||

138 | ```
n2 = n >> 1;
``` |
||

139 | ```
n4 = n >> 2;
``` |
||

140 | ```
n8 = n >> 3;
``` |
||

141 | |||

142 | ```
/* pre rotation */
``` |
||

143 | in1 = input; |
||

144 | ```
in2 = input + n2 - 1;
``` |
||

145 | for(k = 0; k < n4; k++) { |
||

146 | 2494bdd9 | Loren Merritt | ```
// FIXME a single block is faster, but gcc 2.95 and 3.4.x on 32bit can't compile it
``` |

147 | bcfa3e58 | Loren Merritt | asm volatile( |

148 | 2494bdd9 | Loren Merritt | ```
"movd %0, %%mm0 \n\t"
``` |

149 | ```
"movd %2, %%mm1 \n\t"
``` |
||

150 | ```
"punpckldq %1, %%mm0 \n\t"
``` |
||

151 | ```
"punpckldq %3, %%mm1 \n\t"
``` |
||

152 | bcfa3e58 | Loren Merritt | ```
"movq %%mm0, %%mm2 \n\t"
``` |

153 | ```
"pfmul %%mm1, %%mm0 \n\t"
``` |
||

154 | ```
"pswapd %%mm1, %%mm1 \n\t"
``` |
||

155 | ```
"pfmul %%mm1, %%mm2 \n\t"
``` |
||

156 | ```
"pfpnacc %%mm2, %%mm0 \n\t"
``` |
||

157 | 2494bdd9 | Loren Merritt | ::"m"(in2[-2*k]), "m"(in1[2*k]), |

158 | "m"(tcos[k]), "m"(tsin[k]) |
||

159 | ); |
||

160 | asm volatile( |
||

161 | bcfa3e58 | Loren Merritt | ```
"movq %%mm0, %0 \n\t"
``` |

162 | ```
:"=m"(z[revtab[k]])
``` |
||

163 | ); |
||

164 | } |
||

165 | |||

166 | ff_fft_calc(&s->fft, z); |
||

167 | |||

168 | ```
/* post rotation + reordering */
``` |
||

169 | for(k = 0; k < n4; k++) { |
||

170 | asm volatile( |
||

171 | ```
"movq %0, %%mm0 \n\t"
``` |
||

172 | ```
"movd %1, %%mm1 \n\t"
``` |
||

173 | ```
"punpckldq %2, %%mm1 \n\t"
``` |
||

174 | ```
"movq %%mm0, %%mm2 \n\t"
``` |
||

175 | ```
"pfmul %%mm1, %%mm0 \n\t"
``` |
||

176 | ```
"pswapd %%mm1, %%mm1 \n\t"
``` |
||

177 | ```
"pfmul %%mm1, %%mm2 \n\t"
``` |
||

178 | ```
"pfpnacc %%mm2, %%mm0 \n\t"
``` |
||

179 | ```
"movq %%mm0, %0 \n\t"
``` |
||

180 | ```
:"+m"(z[k])
``` |
||

181 | :"m"(tcos[k]), "m"(tsin[k]) |
||

182 | ); |
||

183 | } |
||

184 | |||

185 | f469094c | Loren Merritt | ```
k = n-8;
``` |

186 | bcfa3e58 | Loren Merritt | asm volatile("movd %0, %%mm7" ::"r"(1<<31)); |

187 | f469094c | Loren Merritt | asm volatile( |

188 | a4eb118a | Loren Merritt | ```
"1: \n\t"
``` |

189 | "movq (%4,%0), %%mm0 \n\t" // z[n8+k] |
||

190 | ```
"neg %0 \n\t"
``` |
||

191 | "pswapd -8(%4,%0), %%mm1 \n\t" // z[n8-1-k] |
||

192 | ```
"movq %%mm0, %%mm2 \n\t"
``` |
||

193 | ```
"pxor %%mm7, %%mm2 \n\t"
``` |
||

194 | ```
"punpckldq %%mm1, %%mm2 \n\t"
``` |
||

195 | ```
"pswapd %%mm2, %%mm3 \n\t"
``` |
||

196 | ```
"punpckhdq %%mm1, %%mm0 \n\t"
``` |
||

197 | ```
"pswapd %%mm0, %%mm4 \n\t"
``` |
||

198 | ```
"pxor %%mm7, %%mm0 \n\t"
``` |
||

199 | ```
"pxor %%mm7, %%mm4 \n\t"
``` |
||

200 | "movq %%mm3, -8(%3,%0) \n\t" // output[n-2-2*k] = { z[n8-1-k].im, -z[n8+k].re } |
||

201 | "movq %%mm4, -8(%2,%0) \n\t" // output[n2-2-2*k]= { -z[n8-1-k].re, z[n8+k].im } |
||

202 | ```
"neg %0 \n\t"
``` |
||

203 | "movq %%mm0, (%1,%0) \n\t" // output[2*k] = { -z[n8+k].im, z[n8-1-k].re } |
||

204 | "movq %%mm2, (%2,%0) \n\t" // output[n2+2*k] = { -z[n8+k].re, z[n8-1-k].im } |
||

205 | ```
"sub $8, %0 \n\t"
``` |
||

206 | ```
"jge 1b \n\t"
``` |
||

207 | f469094c | Loren Merritt | ```
:"+r"(k)
``` |

208 | :"r"(output), "r"(output+n2), "r"(output+n), "r"(z+n8) |
||

209 | ```
:"memory"
``` |
||

210 | ); |
||

211 | ee5df927 | Loren Merritt | asm volatile("femms"); |

212 | bcfa3e58 | Loren Merritt | } |