## ffmpeg / libavcodec / arm / dsputil_vfp.S @ 015f9f1a

History | View | Annotate | Download (6.93 KB)

1 | 83ad74e7 | Måns Rullgård | /* |
---|---|---|---|

2 | * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net> |
||

3 | * |
||

4 | * This file is part of FFmpeg. |
||

5 | * |
||

6 | * FFmpeg is free software; you can redistribute it and/or |
||

7 | * modify it under the terms of the GNU Lesser General Public |
||

8 | * License as published by the Free Software Foundation; either |
||

9 | * version 2.1 of the License, or (at your option) any later version. |
||

10 | * |
||

11 | * FFmpeg is distributed in the hope that it will be useful, |
||

12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||

13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||

14 | * Lesser General Public License for more details. |
||

15 | * |
||

16 | * You should have received a copy of the GNU Lesser General Public |
||

17 | * License along with FFmpeg; if not, write to the Free Software |
||

18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||

19 | */ |
||

20 | |||

21 | #include "config.h" |
||

22 | #include "asm.S" |
||

23 | |||

@ Select the GNU assembler unified ARM/Thumb syntax for the code below.
        .syntax unified

/*
 * VFP is a floating point coprocessor used in some ARM cores. VFP11 has a
 * throughput of 1 cycle for almost all instructions (except for double
 * precision arithmetic), but rather high latency: 4 cycles for loads and
 * 8 cycles for arithmetic operations. Scheduling code to avoid pipeline
 * stalls is therefore very important for performance. Another interesting
 * feature is that VFP has independent load/store and arithmetic pipelines,
 * so it is possible to keep both working simultaneously and get more than
 * 1 operation per cycle. The load/store pipeline can process 2 single
 * precision floating point values per cycle and supports bulk loads and
 * stores for large sets of registers. Arithmetic operations can be done on
 * vectors, which makes it possible to keep the arithmetic pipeline busy
 * while the processor issues and executes other instructions. Detailed
 * optimization manuals can be found at http://www.arm.com
 */

39 | |||

/**
 * ARM VFP optimized implementation of the 'vector_fmul_c' function:
 * dst[i] = src0[i] * src1[i] for i in [0, len).
 *
 * len must be positive and a multiple of 8.
 *
 * The loop is software pipelined over two register sets:
 *   set A: inputs s0-s7  and s8-s15,  products left in s8-s15
 *   set B: inputs s16-s23 and s24-s31, products left in s24-s31
 * Loads for one set are issued while the multiplies of the other set are
 * still in flight, so the instruction order below is deliberate.
 */
@ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len)
@   r0 = dst, r1 = src0, r2 = src1 (all post-incremented), r3 = len
function ff_vector_fmul_vfp, export=1
        vpush           {d8-d15}                @ d8-d15 overlay s16-s31, overwritten by set B
        fmrx            r12, fpscr
        orr             r12, r12, #(3 << 16)    /* set vector size to 4 */
        fmxr            fpscr, r12

        @ Prologue: fetch the first 8 elements of both inputs (set A) and
        @ start the first 4-wide product.
        vldmia          r1!, {s0-s3}
        vldmia          r2!, {s8-s11}
        vldmia          r1!, {s4-s7}
        vldmia          r2!, {s12-s15}
        vmul.f32        s8,  s0,  s8            @ s8-s11 = s0-s3 * s8-s11
1:
        @ On entry r3 = number of elements not yet stored.
        @ After the subtraction:
        @   ge -> at least 8 more elements follow set A (set B is used)
        @   gt -> a further set A follows set B (another pass is needed)
        subs            r3,  r3,  #16
        vmul.f32        s12, s4,  s12           @ s12-s15 = s4-s7 * s12-s15
        vldmiage        r1!, {s16-s19}          @ set B inputs
        vldmiage        r2!, {s24-s27}
        vldmiage        r1!, {s20-s23}
        vldmiage        r2!, {s28-s31}
        vmulge.f32      s24, s16, s24           @ s24-s27 = s16-s19 * s24-s27
        vstmia          r0!, {s8-s11}           @ store set A products
        vstmia          r0!, {s12-s15}
        vmulge.f32      s28, s20, s28           @ s28-s31 = s20-s23 * s28-s31
        vldmiagt        r1!, {s0-s3}            @ set A inputs for the next pass
        vldmiagt        r2!, {s8-s11}
        vldmiagt        r1!, {s4-s7}
        vldmiagt        r2!, {s12-s15}
        vmulge.f32      s8,  s0,  s8            @ ge, not gt: also runs on the last pass,
                                                @ where this result is never stored
        vstmiage        r0!, {s24-s27}          @ store set B products
        vstmiage        r0!, {s28-s31}
        bgt             1b

        bic             r12, r12, #(7 << 16)    /* set vector size back to 1 */
        fmxr            fpscr, r12
        vpop            {d8-d15}
        bx              lr
endfunc

81 | 83ad74e7 | Måns Rullgård | |

/**
 * ARM VFP optimized implementation of the 'vector_fmul_reverse_c' function:
 * dst[i] = src0[i] * src1[len - 1 - i] for i in [0, len).
 *
 * len must be positive and a multiple of 8.
 *
 * src0 is read forwards and src1 is read backwards from its end, four
 * floats at a time, so element k of a src0 group is multiplied by element
 * 3 - k of the matching src1 group. Every multiply here is written as a
 * single-element operation with explicit register numbers.
 * NOTE(review): FPSCR is not touched in this function, so it presumably
 * relies on the vector length being 1 on entry - confirm against the ABI.
 *
 * Two register sets alternate, as in a software pipelined loop:
 *   set A: src1 in s0-s7,   src0 and products in s8-s15
 *   set B: src1 in s16-s23, src0 and products in s24-s31
 * Loads, multiplies and stores are interleaved on purpose; keep the order.
 */
@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
@                                 const float *src1, int len)
@   r0 = dst, r1 = src0 (both post-incremented), r2 = src1, r3 = len
function ff_vector_fmul_reverse_vfp, export=1
        vpush           {d8-d15}                @ d8-d15 overlay s16-s31, overwritten by set B
        add             r2,  r2,  r3, lsl #2    @ r2 = src1 + len floats (one past the end)
        vldmdb          r2!, {s0-s3}            @ last 4 floats of src1
        vldmia          r1!, {s8-s11}           @ first 4 floats of src0
        vldmdb          r2!, {s4-s7}
        vldmia          r1!, {s12-s15}
        vmul.f32        s8,  s3,  s8            @ reversed pairing: s3..s0 with s8..s11
        vmul.f32        s9,  s2,  s9
        vmul.f32        s10, s1,  s10
        vmul.f32        s11, s0,  s11
1:
        @ On entry r3 = number of elements not yet stored.
        @ After the subtraction:
        @   ge -> at least 8 more elements follow set A (set B is used)
        @   gt -> a further set A follows set B (another pass is needed)
        subs            r3,  r3,  #16
        vldmdbge        r2!, {s16-s19}          @ set B, src1
        vmul.f32        s12, s7,  s12           @ second half of set A: s7..s4 with s12..s15
        vldmiage        r1!, {s24-s27}          @ set B, src0
        vmul.f32        s13, s6,  s13
        vldmdbge        r2!, {s20-s23}
        vmul.f32        s14, s5,  s14
        vldmiage        r1!, {s28-s31}
        vmul.f32        s15, s4,  s15
        vmulge.f32      s24, s19, s24           @ set B products start here
        vldmdbgt        r2!, {s0-s3}            @ next set A, src1
        vmulge.f32      s25, s18, s25
        vstmia          r0!, {s8-s13}           @ first six set A products
        vmulge.f32      s26, s17, s26
        vldmiagt        r1!, {s8-s11}           @ next set A, src0 (s8-s11 were just stored)
        vmulge.f32      s27, s16, s27
        vmulge.f32      s28, s23, s28
        vldmdbgt        r2!, {s4-s7}
        vmulge.f32      s29, s22, s29
        vstmia          r0!, {s14-s15}          @ remaining two set A products
        vmulge.f32      s30, s21, s30
        vmulge.f32      s31, s20, s31
        vmulge.f32      s8,  s3,  s8            @ ge, not gt: on the last pass these four
                                                @ results are never stored
        vldmiagt        r1!, {s12-s15}
        vmulge.f32      s9,  s2,  s9
        vmulge.f32      s10, s1,  s10
        vstmiage        r0!, {s24-s27}          @ store set B products
        vmulge.f32      s11, s0,  s11
        vstmiage        r0!, {s28-s31}
        bgt             1b

        vpop            {d8-d15}
        bx              lr
endfunc

134 | 83ad74e7 | Måns Rullgård | |

#if HAVE_ARMV6
/**
 * ARM VFP optimized float to int16 conversion.
 *
 * len must be positive and a multiple of 8. The destination buffer must be
 * at least 4-byte aligned (8-byte alignment is better for performance).
 * Little-endian byte order is assumed: of each pair of samples, the first
 * one is packed into the low halfword of the stored word.
 *
 * Each pass converts 8 floats: they are converted to 32-bit integers in
 * the VFP, moved to core registers, saturated to 16 bits with ssat, packed
 * in pairs with pkhbt and written with a single 16-byte stmia. The load
 * and conversion of the next 8 floats are interleaved with the integer
 * work of the current 8, so the instruction order is deliberate.
 *
 * NOTE(review): vcvt.s32.f32 without the R suffix rounds toward zero
 * according to the ARM architecture manual - confirm this is the rounding
 * callers expect.
 */
@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
@   r0 = dst, r1 = src (both post-incremented), r2 = len
function ff_float_to_int16_vfp, export=1
        push            {r4-r8,lr}              @ lr doubles as a scratch register below
        vpush           {d8-d11}                @ d8-d11 overlay s16-s23, used as input buffer
        vldmia          r1!, {s16-s23}          @ first 8 floats
        vcvt.s32.f32    s0,  s16
        vcvt.s32.f32    s1,  s17
        vcvt.s32.f32    s2,  s18
        vcvt.s32.f32    s3,  s19
        vcvt.s32.f32    s4,  s20
        vcvt.s32.f32    s5,  s21
        vcvt.s32.f32    s6,  s22
        vcvt.s32.f32    s7,  s23
1:
        subs            r2,  r2,  #8            @ gt -> another group of 8 follows this one
        vmov            r3,  r4,  s0,  s1       @ hand the 8 converted values to core regs
        vmov            r5,  r6,  s2,  s3
        vmov            r7,  r8,  s4,  s5
        vmov            ip,  lr,  s6,  s7
        vldmiagt        r1!, {s16-s23}          @ next 8 floats
        ssat            r4,  #16, r4            @ clamp to the int16 range
        ssat            r3,  #16, r3
        ssat            r6,  #16, r6
        ssat            r5,  #16, r5
        pkhbt           r3,  r3,  r4, lsl #16   @ r3 = sample 0 (low) | sample 1 (high)
        pkhbt           r4,  r5,  r6, lsl #16   @ r4 = sample 2 (low) | sample 3 (high)
        vcvtgt.s32.f32  s0,  s16                @ convert the next group while the
        vcvtgt.s32.f32  s1,  s17                @ integer side finishes the current one
        vcvtgt.s32.f32  s2,  s18
        vcvtgt.s32.f32  s3,  s19
        vcvtgt.s32.f32  s4,  s20
        vcvtgt.s32.f32  s5,  s21
        vcvtgt.s32.f32  s6,  s22
        vcvtgt.s32.f32  s7,  s23
        ssat            r8,  #16, r8
        ssat            r7,  #16, r7
        ssat            lr,  #16, lr
        ssat            ip,  #16, ip
        pkhbt           r5,  r7,  r8, lsl #16   @ r5 = sample 4 (low) | sample 5 (high)
        pkhbt           r6,  ip,  lr, lsl #16   @ r6 = sample 6 (low) | sample 7 (high)
        stmia           r0!, {r3-r6}            @ write 8 int16 samples (16 bytes)
        bgt             1b

        vpop            {d8-d11}
        pop             {r4-r8,pc}              @ return by loading the saved lr into pc
endfunc
#endif