Revision 83ad74e7 libavcodec/armv4l/float_arm_vfp.c
libavcodec/armv4l/float_arm_vfp.c  

20  20  
21  21 
#include "libavcodec/dsputil.h" 
22  22  
23 
/* 

24 
* VFP is a floating point coprocessor used in some ARM cores. VFP11 has 1 cycle 

25 
* throughput for almost all the instructions (except for double precision 

26 
* arithmetic), but rather high latency. Latency is 4 cycles for loads and 8 cycles

27 
* for arithmetic operations. Scheduling code to avoid pipeline stalls is very 

28 
* important for performance. One more interesting feature is that VFP has 

29 
* independent load/store and arithmetic pipelines, so it is possible to make

30 
* them work simultaneously and get more than 1 operation per cycle. Load/store 

31 
* pipeline can process 2 single precision floating point values per cycle and 

32 
* supports bulk loads and stores for large sets of registers. Arithmetic operations 

33 
* can be done on vectors, which makes it possible to keep the arithmetic pipeline busy,

34 
* while the processor may issue and execute other instructions. Detailed 

35 
* optimization manuals can be found at http://www.arm.com 

36 
*/ 

37  
38 
/** 

39 
* ARM VFP optimized implementation of 'vector_fmul_c' function. 

40 
* Assume that len is a positive number and is multiple of 8 

41 
*/ 

42 
static void vector_fmul_vfp(float *dst, const float *src, int len) 

43 
{ 

44 
int tmp; 

45 
__asm__ volatile( 

46 
"fmrx %[tmp], fpscr\n\t" 

47 
"orr %[tmp], %[tmp], #(3 << 16)\n\t" /* set vector size to 4 */ 

48 
"fmxr fpscr, %[tmp]\n\t" 

49  
50 
"fldmias %[dst_r]!, {s0s3}\n\t" 

51 
"fldmias %[src]!, {s8s11}\n\t" 

52 
"fldmias %[dst_r]!, {s4s7}\n\t" 

53 
"fldmias %[src]!, {s12s15}\n\t" 

54 
"fmuls s8, s0, s8\n\t" 

55 
"1:\n\t" 

56 
"subs %[len], %[len], #16\n\t" 

57 
"fmuls s12, s4, s12\n\t" 

58 
"fldmiasge %[dst_r]!, {s16s19}\n\t" 

59 
"fldmiasge %[src]!, {s24s27}\n\t" 

60 
"fldmiasge %[dst_r]!, {s20s23}\n\t" 

61 
"fldmiasge %[src]!, {s28s31}\n\t" 

62 
"fmulsge s24, s16, s24\n\t" 

63 
"fstmias %[dst_w]!, {s8s11}\n\t" 

64 
"fstmias %[dst_w]!, {s12s15}\n\t" 

65 
"fmulsge s28, s20, s28\n\t" 

66 
"fldmiasgt %[dst_r]!, {s0s3}\n\t" 

67 
"fldmiasgt %[src]!, {s8s11}\n\t" 

68 
"fldmiasgt %[dst_r]!, {s4s7}\n\t" 

69 
"fldmiasgt %[src]!, {s12s15}\n\t" 

70 
"fmulsge s8, s0, s8\n\t" 

71 
"fstmiasge %[dst_w]!, {s24s27}\n\t" 

72 
"fstmiasge %[dst_w]!, {s28s31}\n\t" 

73 
"bgt 1b\n\t" 

74  
75 
"bic %[tmp], %[tmp], #(7 << 16)\n\t" /* set vector size back to 1 */ 

76 
"fmxr fpscr, %[tmp]\n\t" 

77 
: [dst_w] "+&r" (dst), [dst_r] "+&r" (dst), [src] "+&r" (src), [len] "+&r" (len), [tmp] "=&r" (tmp) 

78 
: 

79 
: "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", 

80 
"s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", 

81 
"s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23", 

82 
"s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31", 

83 
"cc", "memory"); 

84 
} 

85  
86 
/** 

87 
* ARM VFP optimized implementation of 'vector_fmul_reverse_c' function. 

88 
* Assume that len is a positive number and is multiple of 8 

89 
*/ 

90 
static void vector_fmul_reverse_vfp(float *dst, const float *src0, const float *src1, int len) 

91 
{ 

92 
src1 += len; 

93 
__asm__ volatile( 

94 
"fldmdbs %[src1]!, {s0s3}\n\t" 

95 
"fldmias %[src0]!, {s8s11}\n\t" 

96 
"fldmdbs %[src1]!, {s4s7}\n\t" 

97 
"fldmias %[src0]!, {s12s15}\n\t" 

98 
"fmuls s8, s3, s8\n\t" 

99 
"fmuls s9, s2, s9\n\t" 

100 
"fmuls s10, s1, s10\n\t" 

101 
"fmuls s11, s0, s11\n\t" 

102 
"1:\n\t" 

103 
"subs %[len], %[len], #16\n\t" 

104 
"fldmdbsge %[src1]!, {s16s19}\n\t" 

105 
"fmuls s12, s7, s12\n\t" 

106 
"fldmiasge %[src0]!, {s24s27}\n\t" 

107 
"fmuls s13, s6, s13\n\t" 

108 
"fldmdbsge %[src1]!, {s20s23}\n\t" 

109 
"fmuls s14, s5, s14\n\t" 

110 
"fldmiasge %[src0]!, {s28s31}\n\t" 

111 
"fmuls s15, s4, s15\n\t" 

112 
"fmulsge s24, s19, s24\n\t" 

113 
"fldmdbsgt %[src1]!, {s0s3}\n\t" 

114 
"fmulsge s25, s18, s25\n\t" 

115 
"fstmias %[dst]!, {s8s13}\n\t" 

116 
"fmulsge s26, s17, s26\n\t" 

117 
"fldmiasgt %[src0]!, {s8s11}\n\t" 

118 
"fmulsge s27, s16, s27\n\t" 

119 
"fmulsge s28, s23, s28\n\t" 

120 
"fldmdbsgt %[src1]!, {s4s7}\n\t" 

121 
"fmulsge s29, s22, s29\n\t" 

122 
"fstmias %[dst]!, {s14s15}\n\t" 

123 
"fmulsge s30, s21, s30\n\t" 

124 
"fmulsge s31, s20, s31\n\t" 

125 
"fmulsge s8, s3, s8\n\t" 

126 
"fldmiasgt %[src0]!, {s12s15}\n\t" 

127 
"fmulsge s9, s2, s9\n\t" 

128 
"fmulsge s10, s1, s10\n\t" 

129 
"fstmiasge %[dst]!, {s24s27}\n\t" 

130 
"fmulsge s11, s0, s11\n\t" 

131 
"fstmiasge %[dst]!, {s28s31}\n\t" 

132 
"bgt 1b\n\t" 

133  
134 
: [dst] "+&r" (dst), [src0] "+&r" (src0), [src1] "+&r" (src1), [len] "+&r" (len) 

135 
: 

136 
: "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", 

137 
"s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", 

138 
"s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23", 

139 
"s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31", 

140 
"cc", "memory"); 

141 
} 

142  
143 
#ifdef HAVE_ARMV6 

144 
/** 

145 
* ARM VFP optimized float to int16 conversion. 

146 
* Assume that len is a positive number and is multiple of 8, destination 

147 
* buffer is at least 4 bytes aligned (8 bytes alignment is better for 

148 
* performance), little endian byte sex 

149 
*/ 

150 
void float_to_int16_vfp(int16_t *dst, const float *src, int len) 

151 
{ 

152 
__asm__ volatile( 

153 
"fldmias %[src]!, {s16s23}\n\t" 

154 
"ftosis s0, s16\n\t" 

155 
"ftosis s1, s17\n\t" 

156 
"ftosis s2, s18\n\t" 

157 
"ftosis s3, s19\n\t" 

158 
"ftosis s4, s20\n\t" 

159 
"ftosis s5, s21\n\t" 

160 
"ftosis s6, s22\n\t" 

161 
"ftosis s7, s23\n\t" 

162 
"1:\n\t" 

163 
"subs %[len], %[len], #8\n\t" 

164 
"fmrrs r3, r4, {s0, s1}\n\t" 

165 
"fmrrs r5, r6, {s2, s3}\n\t" 

166 
"fmrrs r7, r8, {s4, s5}\n\t" 

167 
"fmrrs ip, lr, {s6, s7}\n\t" 

168 
"fldmiasgt %[src]!, {s16s23}\n\t" 

169 
"ssat r4, #16, r4\n\t" 

170 
"ssat r3, #16, r3\n\t" 

171 
"ssat r6, #16, r6\n\t" 

172 
"ssat r5, #16, r5\n\t" 

173 
"pkhbt r3, r3, r4, lsl #16\n\t" 

174 
"pkhbt r4, r5, r6, lsl #16\n\t" 

175 
"ftosisgt s0, s16\n\t" 

176 
"ftosisgt s1, s17\n\t" 

177 
"ftosisgt s2, s18\n\t" 

178 
"ftosisgt s3, s19\n\t" 

179 
"ftosisgt s4, s20\n\t" 

180 
"ftosisgt s5, s21\n\t" 

181 
"ftosisgt s6, s22\n\t" 

182 
"ftosisgt s7, s23\n\t" 

183 
"ssat r8, #16, r8\n\t" 

184 
"ssat r7, #16, r7\n\t" 

185 
"ssat lr, #16, lr\n\t" 

186 
"ssat ip, #16, ip\n\t" 

187 
"pkhbt r5, r7, r8, lsl #16\n\t" 

188 
"pkhbt r6, ip, lr, lsl #16\n\t" 

189 
"stmia %[dst]!, {r3r6}\n\t" 

190 
"bgt 1b\n\t" 

191  
192 
: [dst] "+&r" (dst), [src] "+&r" (src), [len] "+&r" (len) 

193 
: 

194 
: "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", 

195 
"s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23", 

196 
"r3", "r4", "r5", "r6", "r7", "r8", "ip", "lr", 

197 
"cc", "memory"); 

198 
} 

199 
#endif 

23 
extern void ff_vector_fmul_vfp(float *dst, const float *src, int len); 

24 
extern void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, 

25 
const float *src1, int len); 

26 
extern void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len); 

200  27  
201  28 
void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx) 
202  29 
{ 
203 
c>vector_fmul = vector_fmul_vfp; 

204 
c>vector_fmul_reverse = vector_fmul_reverse_vfp; 

30 
c>vector_fmul = ff_vector_fmul_vfp;


31 
c>vector_fmul_reverse = ff_vector_fmul_reverse_vfp;


205  32 
#ifdef HAVE_ARMV6 
206 
c>float_to_int16 = float_to_int16_vfp; 

33 
c>float_to_int16 = ff_float_to_int16_vfp;


207  34 
#endif 
208  35 
} 
Also available in: Unified diff