## ffmpeg / libavcodec / x86 / ac3dsp.asm @ cc4d3dd3

History | View | Annotate | Download (5.69 KB)

1 |
;***************************************************************************** |
---|---|

2 |
;* x86-optimized AC-3 DSP utils |

3 |
;* Copyright (c) 2011 Justin Ruggles |

4 |
;* |

5 |
;* This file is part of FFmpeg. |

6 |
;* |

7 |
;* FFmpeg is free software; you can redistribute it and/or |

8 |
;* modify it under the terms of the GNU Lesser General Public |

9 |
;* License as published by the Free Software Foundation; either |

10 |
;* version 2.1 of the License, or (at your option) any later version. |

11 |
;* |

12 |
;* FFmpeg is distributed in the hope that it will be useful, |

13 |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |

14 |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |

15 |
;* Lesser General Public License for more details. |

16 |
;* |

17 |
;* You should have received a copy of the GNU Lesser General Public |

18 |
;* License along with FFmpeg; if not, write to the Free Software |

19 |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |

20 |
;****************************************************************************** |

21 | |

22 |
%include "x86inc.asm" |

23 |
%include "x86util.asm" |

24 | |

25 |
SECTION .text |

26 | |

27 |
;----------------------------------------------------------------------------- |

28 |
; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs) |

29 |
;----------------------------------------------------------------------------- |

30 | |

31 |
; AC3_EXPONENT_MIN <suffix>
;
; Emits ac3_exponent_min_<suffix>(exp, num_reuse_blocks, nb_coefs).
; For every mmsize-byte vector starting at exp, the vectors found at
; exp + 256*k for k = num_reuse_blocks .. 0 are combined with PMINUB and the
; result is stored back at exp (block 0).
;
; Expects PMINUB and LOOP_ALIGN to be %defined by the caller of the macro.
; PMINUB comes from the included helper macros; presumably an unsigned byte
; minimum that may use its third operand as a temporary -- confirm there.
;
; The loop works in whole vectors, so a coefficient count that is not a
; multiple of mmsize is effectively rounded up to the next multiple.
;
; Both int arguments are accessed through their 32-bit register views
; (reuse_blksd / expnd), like lend / shiftd elsewhere in this file: on x86-64
; the upper half of a register carrying an int argument is not guaranteed to
; be defined, and a 32-bit write zero-extends into the full register, which
; makes the later 64-bit use as an address offset safe.
%macro AC3_EXPONENT_MIN 1
cglobal ac3_exponent_min_%1, 3,4,2, exp, reuse_blks, expn, offset
    shl    reuse_blksd, 8               ; byte offset of the last block (256 bytes per block)
    jz .end                             ; num_reuse_blocks == 0: nothing to merge
    LOOP_ALIGN
.nextexp:
    mov        offsetq, reuse_blksq
    mova            m0, [expq+offsetq]  ; seed the running minimum from the last block
    sub        offsetq, 256
    LOOP_ALIGN
.nextblk:
    PMINUB          m0, [expq+offsetq], m1
    sub        offsetq, 256
    jae .nextblk                        ; no borrow: offset still addresses a block (>= 0)
    mova        [expq], m0              ; write the minimum into block 0
    add           expq, mmsize
    sub          expnd, mmsize          ; signed 32-bit count of coefficients left
    jg .nextexp
.end:
    REP_RET
%endmacro

; MMX build: PMINUB_MMX, no loop alignment.
%define PMINUB PMINUB_MMX
%define LOOP_ALIGN
INIT_MMX
AC3_EXPONENT_MIN mmx

; MMXEXT build: PMINUB_MMXEXT, loop heads aligned to 16 bytes.
%ifdef HAVE_MMX2
%define PMINUB PMINUB_MMXEXT
%define LOOP_ALIGN ALIGN 16
AC3_EXPONENT_MIN mmxext
%endif

; SSE2 build: uses whichever PMINUB / LOOP_ALIGN definitions are in effect at
; this point (they are only switched to the MMXEXT ones when HAVE_MMX2 is set).
%ifdef HAVE_SSE
INIT_XMM
AC3_EXPONENT_MIN sse2
%endif
%undef PMINUB
%undef LOOP_ALIGN

68 | |

69 |
;----------------------------------------------------------------------------- |

70 |
; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len) |

71 |
; |

72 |
; This function uses 2 different methods to calculate a valid result. |

73 |
; 1) logical 'or' of abs of each element |

74 |
; This is used for ssse3 because of the pabsw instruction. |

75 |
; It is also used for mmx because of the lack of min/max instructions. |

76 |
; 2) calculate min/max for the array, then or(abs(min),abs(max)) |

77 |
; This is used for mmxext and sse2 because they have pminsw/pmaxsw. |

78 |
;----------------------------------------------------------------------------- |

79 | |

80 |
; AC3_MAX_MSB_ABS_INT16 <suffix>, <method>
;
; Emits ac3_max_msb_abs_int16_<suffix>(src, len).
;   %1  function-name suffix; the literal value "mmx" additionally selects the
;       register-based ABS2 path and the shift-based final reduction
;   %2  "min_max" -> track per-lane signed min/max, take abs of both at the end
;       anything else (or_abs) -> OR together the abs of every element
;
; Returns in eax the OR of all 16-bit lanes of the accumulator, masked to
; 16 bits.
;
; Each iteration reads two vectors (mmsize*2 bytes = mmsize int16 values) and
; subtracts mmsize from len; the body always runs at least once.
;
; Expects ABS2 and PSHUFLW to be %defined by the caller of the macro.
%macro AC3_MAX_MSB_ABS_INT16 2
cglobal ac3_max_msb_abs_int16_%1, 2,2,5, src, len
    pxor        m2, m2                  ; m2: running min (min_max) / running OR (or_abs)
    pxor        m3, m3                  ; m3: running max (min_max only)
.loop:
%ifidn %2, min_max
    mova        m0, [srcq]
    mova        m1, [srcq+mmsize]
    pminsw      m2, m0
    pminsw      m2, m1
    pmaxsw      m3, m0
    pmaxsw      m3, m1
%else ; or_abs
%ifidn %1, mmx
    mova        m0, [srcq]
    mova        m1, [srcq+mmsize]
    ABS2        m0, m1, m3, m4          ; m3/m4 are passed as extra (presumably scratch) registers
%else ; ssse3
    ; using memory args is faster for ssse3
    pabsw       m0, [srcq]
    pabsw       m1, [srcq+mmsize]
%endif
    por         m2, m0
    por         m2, m1
%endif
    add       srcq, mmsize*2
    sub       lend, mmsize
    ja .loop
%ifidn %2, min_max
    ABS2        m2, m3, m0, m1
    por         m2, m3
%endif
%ifidn mmsize, 16
    movhlps     m0, m2                  ; fold the upper 64 bits onto the lower 64 bits
    por         m2, m0
%endif
%ifidn %1, mmx
    ; pshufw is not part of the base MMX instruction set (it arrived with
    ; SSE / MMXEXT), so the plain-MMX build folds the four words with 64-bit
    ; shifts instead.  Only the low word is read afterwards, and it ends up
    ; as w0|w1|w2|w3 exactly as with the shuffle sequence below.
    mova        m0, m2
    psrlq       m0, 32
    por         m2, m0                  ; low dword = dword0 | dword1
    mova        m0, m2
    psrlq       m0, 16
    por         m2, m0                  ; low word  = w0 | w1 | w2 | w3
%else
    PSHUFLW     m0, m2, 0xe             ; words 2,3 -> positions 0,1
    por         m2, m0
    PSHUFLW     m0, m2, 0x1             ; word 1 -> position 0
    por         m2, m0
%endif
    movd       eax, m2
    and        eax, 0xFFFF              ; keep only the combined low word
    RET
%endmacro

; ABS2 and PSHUFLW are redefined step by step; each instantiation below uses
; the definitions in effect at that point.
INIT_MMX
%define ABS2 ABS2_MMX
%define PSHUFLW pshufw
AC3_MAX_MSB_ABS_INT16 mmx, or_abs
%define ABS2 ABS2_MMX2
AC3_MAX_MSB_ABS_INT16 mmxext, min_max
INIT_XMM
%define PSHUFLW pshuflw
AC3_MAX_MSB_ABS_INT16 sse2, min_max
%define ABS2 ABS2_SSSE3
AC3_MAX_MSB_ABS_INT16 ssse3, or_abs

136 | |

137 |
;----------------------------------------------------------------------------- |

138 |
; macro used for ff_ac3_lshift_int16() and ff_ac3_shift_int32() |

139 |
;----------------------------------------------------------------------------- |

140 | |

141 |
; AC3_SHIFT_4MM <ptr>, <shift-op>, <count>
;
; Shifts four consecutive vectors in place and advances the pointer.
;   %1  register holding the src/dst address; incremented by mmsize*4
;   %2  shift instruction applied to each vector
;   %3  shift amount operand handed to %2
; Uses m1-m4 as working registers.
%macro AC3_SHIFT_4MM 3
    ; load four vectors
    mova    m1, [%1]
    mova    m2, [%1 + mmsize]
    mova    m3, [%1 + mmsize*2]
    mova    m4, [%1 + mmsize*3]

    ; shift each of them by %3
    %2      m1, %3
    %2      m2, %3
    %2      m3, %3
    %2      m4, %3

    ; store the results back where they came from
    mova    [%1], m1
    mova    [%1 + mmsize], m2
    mova    [%1 + mmsize*2], m3
    mova    [%1 + mmsize*3], m4

    ; step past the four vectors just processed
    add     %1, mmsize*4
%endmacro

156 | |

157 |
;----------------------------------------------------------------------------- |

158 |
; void ff_ac3_lshift_int16(int16_t *src, int len, unsigned int shift) |

159 |
;----------------------------------------------------------------------------- |

160 | |

161 |
; AC3_LSHIFT_INT16 <suffix>
;
; Emits ac3_lshift_int16_<suffix>(src, len, shift): every 16-bit word of src
; is shifted left in place by `shift` bits (psllw).  A shift of zero returns
; immediately without touching memory.
;
; One pass of the loop handles four vectors (mmsize*4 bytes = mmsize*2 words)
; and subtracts mmsize*2 from len; once entered, the loop body always runs at
; least once.
%macro AC3_LSHIFT_INT16 1
cglobal ac3_lshift_int16_%1, 3,3,5, src, len, shift
    test    shiftd, shiftd
    je      .done                       ; shift == 0: nothing to do
    movd    m0, shiftd                  ; shift count lives in m0 for psllw
    ALIGN 8
.next_chunk:
    AC3_SHIFT_4MM srcq, psllw, m0
    sub     lend, mmsize*2              ; words consumed by one pass
    ja      .next_chunk
.done:
    REP_RET
%endmacro

; MMX variant
INIT_MMX
AC3_LSHIFT_INT16 mmx

; SSE2 variant
INIT_XMM
AC3_LSHIFT_INT16 sse2

179 | |

180 |
;----------------------------------------------------------------------------- |

181 |
; void ff_ac3_shift_int32(int32_t *src, int len, int shift) |

182 |
;----------------------------------------------------------------------------- |

183 | |

184 |
; AC3_SHIFT_INT32 <suffix>
;
; Emits ac3_shift_int32_<suffix>(src, len, shift): every 32-bit element of src
; is shifted in place.
;   shift  > 0  -> left shift by shift bits (pslld)
;   shift  < 0  -> arithmetic right shift by -shift bits (psrad)
;   shift == 0  -> returns immediately
;
; One pass of either loop handles four vectors (mmsize*4 bytes = mmsize
; dwords) and subtracts mmsize from len; once entered, a loop body always
; runs at least once.
%macro AC3_SHIFT_INT32 1
cglobal ac3_shift_int32_%1, 3,3,5, src, len, shift
    test    shiftd, shiftd
    jz      .done                       ; zero shift: leave the data untouched
    js      .negative                   ; sign set: right-shift path

    ; ---- left shift -------------------------------------------------------
    movd    m0, shiftd
.left_chunk:
    AC3_SHIFT_4MM srcq, pslld, m0
    sub     lend, mmsize
    ja      .left_chunk
    jmp     .done

    ; ---- arithmetic right shift -------------------------------------------
.negative:
    neg     shiftd                      ; psrad takes a positive count
    movd    m0, shiftd
.right_chunk:
    AC3_SHIFT_4MM srcq, psrad, m0
    sub     lend, mmsize
    ja      .right_chunk
.done:
    REP_RET
%endmacro

; MMX variant
INIT_MMX
AC3_SHIFT_INT32 mmx

; SSE2 variant
INIT_XMM
AC3_SHIFT_INT32 sse2