## ffmpeg / libavcodec / x86 / ac3dsp.asm @ 888fa31e

History | View | Annotate | Download (7.79 KB)

1 |
;***************************************************************************** |
---|---|

2 |
;* x86-optimized AC-3 DSP utils |

3 |
;* Copyright (c) 2011 Justin Ruggles |

4 |
;* |

5 |
;* This file is part of Libav. |

6 |
;* |

7 |
;* Libav is free software; you can redistribute it and/or |

8 |
;* modify it under the terms of the GNU Lesser General Public |

9 |
;* License as published by the Free Software Foundation; either |

10 |
;* version 2.1 of the License, or (at your option) any later version. |

11 |
;* |

12 |
;* Libav is distributed in the hope that it will be useful, |

13 |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |

14 |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |

15 |
;* Lesser General Public License for more details. |

16 |
;* |

17 |
;* You should have received a copy of the GNU Lesser General Public |

18 |
;* License along with Libav; if not, write to the Free Software |

19 |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |

20 |
;****************************************************************************** |

21 | |

22 |
%include "x86inc.asm" |

23 |
%include "x86util.asm" |

24 | |

25 |
SECTION_RODATA

; IEEE-754 single-precision bit pattern of 16777216.0f (2^24).
%define FLOAT_2_POW_24_BITS 0x4B800000

; Four packed copies of 2^24 - the scale factor used by ff_float_to_fixed24()
pf_1_24: times 4 dd FLOAT_2_POW_24_BITS

29 | |

30 |
SECTION .text |

31 | |

32 |
;-----------------------------------------------------------------------------
; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
;
; For every exponent position, replaces the value in block 0 with the unsigned
; byte minimum over blocks 0..num_reuse_blocks. Consecutive blocks are 256
; bytes apart. Returns immediately when num_reuse_blocks is 0.
;
; Exponents are handled mmsize bytes at a time with aligned-style moves (mova),
; so the last iteration may touch up to mmsize-1 bytes beyond nb_coefs.
;-----------------------------------------------------------------------------

%macro AC3_EXPONENT_MIN 1 ; instruction set suffix
cglobal ac3_exponent_min_%1, 3,4,2, exp, reuse_blks, expn, offset
    ; reuse_blks and expn are C 'int' arguments: on x86-64 the upper half of
    ; their registers is unspecified, so only the 32-bit views are operated on.
    ; The 32-bit shl also zero-extends, which makes reuse_blksq safe to use as
    ; a byte offset below.
    shl  reuse_blksd, 8                 ; byte offset of the last block to scan
    jz .end                             ; nothing to merge
    LOOP_ALIGN
.nextexp:
    mov      offsetq, reuse_blksq
    mova          m0, [expq+offsetq]    ; start from the last reused block
    sub      offsetq, 256
    LOOP_ALIGN
.nextblk:
    PMINUB        m0, [expq+offsetq], m1
    sub      offsetq, 256
    jae .nextblk                        ; borrow only after offset 0 was done
    mova      [expq], m0
    add         expq, mmsize
    sub        expnd, mmsize
    jg .nextexp
.end:
    REP_RET
%endmacro

; LOOP_ALIGN is empty for the plain MMX build and 'ALIGN 16' for the others.
%define PMINUB PMINUB_MMX
%define LOOP_ALIGN
INIT_MMX
AC3_EXPONENT_MIN mmx
%ifdef HAVE_MMX2
%define PMINUB PMINUB_MMXEXT
%define LOOP_ALIGN ALIGN 16
AC3_EXPONENT_MIN mmxext
%endif
%ifdef HAVE_SSE
INIT_XMM
AC3_EXPONENT_MIN sse2
%endif
%undef PMINUB
%undef LOOP_ALIGN

73 | |

74 |
;-----------------------------------------------------------------------------
; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
;
; This function uses 2 different methods to calculate a valid result.
; 1) logical 'or' of abs of each element
;        This is used for ssse3 because of the pabsw instruction.
;        It is also used for mmx because of the lack of min/max instructions.
; 2) calculate min/max for the array, then or(abs(min),abs(max))
;        This is used for mmxext and sse2 because they have pminsw/pmaxsw.
;
; Each loop iteration consumes 2*mmsize bytes (mmsize int16 elements); the
; loop body always runs at least once. The result is the low 16 bits of the
; accumulated value.
;-----------------------------------------------------------------------------

%macro AC3_MAX_MSB_ABS_INT16 2 ; instruction set suffix, method (min_max/or_abs)
cglobal ac3_max_msb_abs_int16_%1, 2,2,5, src, len
    pxor        m2, m2                  ; or_abs: OR accumulator / min_max: running min
    pxor        m3, m3                  ; min_max: running max
.loop:
%ifidn %2, min_max
    mova        m0, [srcq]
    mova        m1, [srcq+mmsize]
    pminsw      m2, m0
    pminsw      m2, m1
    pmaxsw      m3, m0
    pmaxsw      m3, m1
%else ; or_abs
%ifidn %1, mmx
    mova        m0, [srcq]
    mova        m1, [srcq+mmsize]
    ABS2        m0, m1, m3, m4
%else ; ssse3
    ; using memory args is faster for ssse3
    pabsw       m0, [srcq]
    pabsw       m1, [srcq+mmsize]
%endif
    por         m2, m0
    por         m2, m1
%endif
    add       srcq, mmsize*2
    sub       lend, mmsize
    ja .loop
%ifidn %2, min_max
    ABS2        m2, m3, m0, m1
    por         m2, m3
%endif
    ; Horizontal OR: fold every 16-bit lane of m2 into lane 0.
%ifidn mmsize, 16
    movhlps     m0, m2
    por         m2, m0
%endif
%ifidn %1, mmx
    ; pshufw is not a baseline MMX instruction (it arrived with SSE / the AMD
    ; MMX extensions), so the plain MMX build folds the lanes with shifts.
    mova        m0, m2
    psrlq       m0, 32
    por         m2, m0                  ; lanes 2,3 folded into lanes 0,1
    mova        m0, m2
    psrlq       m0, 16
    por         m2, m0                  ; lane 1 folded into lane 0
%else
    PSHUFLW     m0, m2, 0xe
    por         m2, m0
    PSHUFLW     m0, m2, 0x1
    por         m2, m0
%endif
    movd       eax, m2
    and        eax, 0xFFFF
    RET
%endmacro

INIT_MMX
%define ABS2 ABS2_MMX
AC3_MAX_MSB_ABS_INT16 mmx, or_abs
%define ABS2 ABS2_MMX2
%define PSHUFLW pshufw
AC3_MAX_MSB_ABS_INT16 mmxext, min_max
INIT_XMM
%define PSHUFLW pshuflw
AC3_MAX_MSB_ABS_INT16 sse2, min_max
%define ABS2 ABS2_SSSE3
AC3_MAX_MSB_ABS_INT16 ssse3, or_abs

141 | |

142 |
;-----------------------------------------------------------------------------
; Shared body for ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
;
; Shifts the array at src in place. One iteration covers four vector
; registers (4*mmsize bytes), i.e. mmsize*32/<element bits> elements, and the
; body always executes at least once. The shift count is taken from the low
; bits of m0, loaded once before the loop.
;-----------------------------------------------------------------------------

%macro AC3_SHIFT 4 ; l/r, 16/32, shift instruction, instruction set
cglobal ac3_%1shift_int%2_%4, 3,3,5, src, len, shift
    movd                  m0, shiftd
.next_chunk:
    ; first half of the chunk
    mova                  m1, [srcq         ]
    mova                  m2, [srcq+mmsize  ]
    %3                    m1, m0
    %3                    m2, m0
    mova  [srcq         ], m1
    mova  [srcq+mmsize  ], m2
    ; second half of the chunk
    mova                  m3, [srcq+mmsize*2]
    mova                  m4, [srcq+mmsize*3]
    %3                    m3, m0
    %3                    m4, m0
    mova  [srcq+mmsize*2], m3
    mova  [srcq+mmsize*3], m4
    add                 srcq, mmsize*4
    sub                 lend, mmsize*32/%2  ; elements handled per iteration
    ja .next_chunk
    REP_RET
%endmacro

;-----------------------------------------------------------------------------
; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
;-----------------------------------------------------------------------------

; 16-bit logical left shift, MMX then SSE2 build
INIT_MMX
AC3_SHIFT l, 16, psllw, mmx
INIT_XMM
AC3_SHIFT l, 16, psllw, sse2

;-----------------------------------------------------------------------------
; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
;-----------------------------------------------------------------------------

; 32-bit arithmetic right shift, MMX then SSE2 build
INIT_MMX
AC3_SHIFT r, 32, psrad, mmx
INIT_XMM
AC3_SHIFT r, 32, psrad, sse2

186 | |

187 |
;-----------------------------------------------------------------------------
; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
;
; Multiplies each float by 2^24 (pf_1_24) and converts it to int32.
;-----------------------------------------------------------------------------

; The 3DNow! version is not bit-identical because pf2id uses truncation rather
; than round-to-nearest.
;
; Converts 8 floats (32 bytes) per iteration; the body always runs at least
; once.
INIT_MMX
cglobal float_to_fixed24_3dnow, 3,3,0, dst, src, len
    movq   m0, [pf_1_24]
.loop:
    movq   m1, [srcq   ]
    movq   m2, [srcq+8 ]
    movq   m3, [srcq+16]
    movq   m4, [srcq+24]
    pfmul  m1, m0
    pfmul  m2, m0
    pfmul  m3, m0
    pfmul  m4, m0
    pf2id  m1, m1
    pf2id  m2, m2
    pf2id  m3, m3
    pf2id  m4, m4
    movq  [dstq   ], m1
    movq  [dstq+8 ], m2
    movq  [dstq+16], m3
    movq  [dstq+24], m4
    add  srcq, 32
    add  dstq, 32
    sub  lend, 8
    ja .loop
    ; The MMX registers alias the x87 stack: release them so that floating
    ; point code running after this function sees a clean FPU state.
    femms
    RET

218 | |

219 |
; SSE version of ff_float_to_fixed24(int32_t *dst, const float *src,
;                                    unsigned int len)
;
; Scales by 2^24 (pf_1_24) in XMM registers, then converts two floats at a
; time into MMX registers with cvtps2pi, since SSE1 has no packed
; float->int32 conversion that targets XMM registers.
; Converts 8 floats (32 bytes) per iteration; the body always runs at least
; once.
INIT_XMM
cglobal float_to_fixed24_sse, 3,3,3, dst, src, len
    movaps     m0, [pf_1_24]
.loop:
    movaps     m1, [srcq   ]
    movaps     m2, [srcq+16]
    mulps      m1, m0
    mulps      m2, m0
    cvtps2pi  mm0, m1                   ; floats 0-1
    movhlps    m1, m1                   ; bring the upper two floats down
    cvtps2pi  mm1, m1                   ; floats 2-3
    cvtps2pi  mm2, m2                   ; floats 4-5
    movhlps    m2, m2
    cvtps2pi  mm3, m2                   ; floats 6-7
    movq  [dstq   ], mm0
    movq  [dstq+ 8], mm1
    movq  [dstq+16], mm2
    movq  [dstq+24], mm3
    add      srcq, 32
    add      dstq, 32
    sub      lend, 8
    ja .loop
    ; mm0-mm3 were written above: leave MMX state so that x87 floating point
    ; code running after this function is not corrupted.
    emms
    RET

242 | |

243 |
; SSE2 version of ff_float_to_fixed24(int32_t *dst, const float *src,
;                                     unsigned int len)
;
; Scales by 2^24 (pf_1_24) and converts with cvtps2dq.
; When m8 is defined (presumably only on x86-64, where 16 XMM registers are
; available) 32 floats are converted per iteration, otherwise 16. The body
; always runs at least once.
INIT_XMM
cglobal float_to_fixed24_sse2, 3,3,9, dst, src, len
    movaps     m0, [pf_1_24]
.loop:
    movaps     m1, [srcq    ]
    movaps     m2, [srcq+16 ]
    movaps     m3, [srcq+32 ]
    movaps     m4, [srcq+48 ]
%ifdef m8
    movaps     m5, [srcq+64 ]
    movaps     m6, [srcq+80 ]
    movaps     m7, [srcq+96 ]
    movaps     m8, [srcq+112]
%endif
    mulps      m1, m0
    mulps      m2, m0
    mulps      m3, m0
    mulps      m4, m0
%ifdef m8
    mulps      m5, m0
    mulps      m6, m0
    mulps      m7, m0
    mulps      m8, m0
%endif
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    cvtps2dq   m4, m4
%ifdef m8
    cvtps2dq   m5, m5
    cvtps2dq   m6, m6
    cvtps2dq   m7, m7
    cvtps2dq   m8, m8
%endif
    movdqa  [dstq    ], m1
    movdqa  [dstq+16 ], m2
    movdqa  [dstq+32 ], m3
    movdqa  [dstq+48 ], m4
    ; len is a 32-bit 'unsigned int': count on the 32-bit view (as the 3DNow!
    ; and SSE versions do) because the upper half of the 64-bit argument
    ; register is not guaranteed to be zero.
%ifdef m8
    movdqa  [dstq+64 ], m5
    movdqa  [dstq+80 ], m6
    movdqa  [dstq+96 ], m7
    movdqa  [dstq+112], m8
    add      srcq, 128
    add      dstq, 128
    sub      lend, 32
%else
    add      srcq, 64
    add      dstq, 64
    sub      lend, 16
%endif
    ja .loop
    REP_RET