Revision 5d0ddd1a — libavcodec/i386/fft_sse.c (diff view; note: '-' characters were lost in extraction, so "->" appears as ">" and unary minus signs are missing)

22  22 
#include "libavutil/x86_cpu.h" 
23  23 
#include "libavcodec/dsputil.h" 
24  24  
25 
/* Sign-bit masks for SSE xorps: each 16-byte-aligned vector flips the
 * sign bit (bit 31) of selected float lanes when XORed against an xmm
 * register.  Naming: p = lane kept positive, m = lane sign-flipped,
 * listed from lane 0 to lane 3.
 *
 * NOTE: the original initializers used "1 << 31", which left-shifts a
 * signed int out of range — undefined behavior in C.  Build the bit
 * pattern with an unsigned shift and convert instead. */
static const int p1p1p1m1[4] __attribute__((aligned(16))) =
    { 0, 0, 0, (int)(1U << 31) };

static const int p1p1m1p1[4] __attribute__((aligned(16))) =
    { 0, 0, (int)(1U << 31), 0 };

static const int p1p1m1m1[4] __attribute__((aligned(16))) =
    { 0, 0, (int)(1U << 31), (int)(1U << 31) };

static const int p1m1p1m1[4] __attribute__((aligned(16))) =
    { 0, (int)(1U << 31), 0, (int)(1U << 31) };

static const int m1m1m1m1[4] __attribute__((aligned(16))) =
    { (int)(1U << 31), (int)(1U << 31), (int)(1U << 31), (int)(1U << 31) };
39  30  
40 
#if 0 

41 
static void print_v4sf(const char *str, __m128 a) 

42 
{ 

43 
float *p = (float *)&a; 

44 
printf("%s: %f %f %f %f\n", 

45 
str, p[0], p[1], p[2], p[3]); 

46 
} 

47 
#endif 

31 
void ff_fft_dispatch_sse(FFTComplex *z, int nbits); 

32 
void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits); 

48  33  
49 
/* XXX: handle reverse case */ 

50  34 
/**
 * Compute an in-place FFT of z (2^s->nbits complex elements) using the
 * external SSE asm kernel.
 *
 * This diff superimposed the removed pre-revision body (a fully inline
 * asm butterfly implementation) on the new one; the reconstruction
 * below is the post-revision function.  Two extraction defects are also
 * fixed: "s>nbits" had lost the "->" arrow, and "i = 8*n" had lost its
 * unary minus (the loop below requires a negative byte offset so that
 * "add $32 / jl 1b" terminates when i reaches 0).
 */
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
{
    int n = 1 << s->nbits;

    ff_fft_dispatch_interleave_sse(z, s->nbits);

    if(n <= 16) {
        /* i counts up from -8*n to 0; (z+n) + i/8 complexes == z, so the
         * loop walks the whole buffer in 32-byte (4-float) steps. */
        x86_reg i = -8*n;
        asm volatile(
            "1: \n"
            /* Interleave two 16-byte groups lane-by-lane with
             * unpcklps/unpckhps.  NOTE(review): presumably the asm
             * kernel leaves small transforms with re/im split across
             * vectors and this restores interleaved order — confirm
             * against the asm source. */
            "movaps     (%0,%1), %%xmm0 \n"
            "movaps      %%xmm0, %%xmm1 \n"
            "unpcklps 16(%0,%1), %%xmm0 \n"
            "unpckhps 16(%0,%1), %%xmm1 \n"
            "movaps      %%xmm0,   (%0,%1) \n"
            "movaps      %%xmm1, 16(%0,%1) \n"
            "add $32, %0 \n"
            "jl 1b \n"
            :"+r"(i)
            :"r"(z+n)
            :"memory"
        );
    }
}

59 
/**
 * Apply the bit-reversal permutation s->revtab to z (2^s->nbits complex
 * elements), scattering through s->tmp_buf and copying back.
 *
 * Extraction defect fixed: "s>nbits", "s>tmp_buf" and "s>revtab" had
 * lost the "->" arrow.
 *
 * Each iteration loads two packed complex values (16 bytes) from z with
 * movaps, then stores the low and high 8-byte halves to their permuted
 * destinations with movlps/movhps.
 */
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
{
    int n = 1 << s->nbits;
    int i;
    for(i=0; i<n; i+=2) {
        asm volatile(
            "movaps %2, %%xmm0 \n"
            "movlps %%xmm0, %0 \n"
            "movhps %%xmm0, %1 \n"
            :"=m"(s->tmp_buf[s->revtab[i]]),
             "=m"(s->tmp_buf[s->revtab[i+1]])
            :"m"(z[i])
        );
    }
    memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
}
144  75  
145  76 
static void imdct_sse(MDCTContext *s, const FFTSample *input, FFTSample *tmp) 
Also available in: Unified diff