## ffmpeg / libavcodec / x86 / ac3dsp.asm @ f1efbca5

History | View | Annotate | Download (5.25 KB)

1 |
;***************************************************************************** |
---|---|

2 |
;* x86-optimized AC-3 DSP utils |

3 |
;* Copyright (c) 2011 Justin Ruggles |

4 |
;* |

5 |
;* This file is part of FFmpeg. |

6 |
;* |

7 |
;* FFmpeg is free software; you can redistribute it and/or |

8 |
;* modify it under the terms of the GNU Lesser General Public |

9 |
;* License as published by the Free Software Foundation; either |

10 |
;* version 2.1 of the License, or (at your option) any later version. |

11 |
;* |

12 |
;* FFmpeg is distributed in the hope that it will be useful, |

13 |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |

14 |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |

15 |
;* Lesser General Public License for more details. |

16 |
;* |

17 |
;* You should have received a copy of the GNU Lesser General Public |

18 |
;* License along with FFmpeg; if not, write to the Free Software |

19 |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |

20 |
;****************************************************************************** |

21 | |

22 |
%include "x86inc.asm" |

23 |
%include "x86util.asm" |

24 | |

25 |
SECTION .text |

26 | |

27 |
;-----------------------------------------------------------------------------
; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
;
; For each coefficient position, stores into block 0 the unsigned byte
; minimum of that position across block 0 and the num_reuse_blocks blocks
; that follow it. Consecutive blocks are 256 bytes apart. Returns without
; touching memory when num_reuse_blocks is 0.
;
; Macro parameter:
;   %1  instruction-set suffix appended to the exported symbol name
;
; The instantiating code must %define:
;   PMINUB      3-operand (dst, src, tmp) unsigned byte minimum
;   LOOP_ALIGN  alignment directive placed before each loop head (may be empty)
;
; NOTE(review): loads/stores use mova, so exp is presumably required to be
; mmsize-aligned -- confirm against x86inc.asm and the callers.
;-----------------------------------------------------------------------------

%macro AC3_EXPONENT_MIN 1
cglobal ac3_exponent_min_%1, 3,4,2, exp, reuse_blks, expn, offset
    ; num_reuse_blocks and nb_coefs are C ints: work on the 32-bit register
    ; views so that undefined upper bits of the 64-bit argument registers are
    ; never consumed. The 32-bit shl also zero-extends into reuse_blksq,
    ; which is used as a byte offset below.
    shl  reuse_blksd, 8                 ; byte offset of the last block to scan
    jz .end                             ; no reuse blocks -> nothing to do
    LOOP_ALIGN
.nextexp:
    mov      offsetq, reuse_blksq
    mova          m0, [expq+offsetq]    ; start from the last block
    sub      offsetq, 256
    LOOP_ALIGN
.nextblk:
    PMINUB        m0, [expq+offsetq], m1
    sub      offsetq, 256
    jae .nextblk                        ; loop until offset steps below block 0
    mova      [expq], m0                ; write the minimum back to block 0
    add         expq, mmsize
    sub        expnd, mmsize
    jg .nextexp                         ; signed: stop once no coefs remain
.end:
    REP_RET
%endmacro

52 | |

53 |
; Instantiate ff_ac3_exponent_min_{mmx,mmxext,sse2}.
; PMINUB / LOOP_ALIGN are consumed by AC3_EXPONENT_MIN and are scoped to this
; section; both are %undef'd at the end.

; mmx: PMINUB_MMX helper (uses the tmp register), no loop alignment.
%define PMINUB PMINUB_MMX
%define LOOP_ALIGN
INIT_MMX
AC3_EXPONENT_MIN mmx

%ifdef HAVE_MMX2
%define PMINUB PMINUB_MMXEXT
%define LOOP_ALIGN ALIGN 16
AC3_EXPONENT_MIN mmxext
%endif

%ifdef HAVE_SSE
; Set the helpers explicitly so the sse2 build gets the same PMINUB and loop
; alignment whether or not the HAVE_MMX2 block above was assembled.
; NOTE(review): the guard is HAVE_SSE although the function is sse2 -- kept
; as in the original; confirm against the build configuration.
%define PMINUB PMINUB_MMXEXT
%define LOOP_ALIGN ALIGN 16
INIT_XMM
AC3_EXPONENT_MIN sse2
%endif

%undef PMINUB
%undef LOOP_ALIGN

68 | |

69 |
;-----------------------------------------------------------------------------
; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
;
; Returns a 16-bit value (zero-extended in eax) that is the bitwise OR of
; per-element magnitude information, computed by one of 2 methods:
; 1) logical 'or' of abs of each element
;    This is used for ssse3 because of the pabsw instruction.
;    It is also used for mmx because of the lack of min/max instructions.
; 2) calculate min/max for the array, then or(abs(min),abs(max))
;    This is used for mmxext and sse2 because they have pminsw/pmaxsw.
;
; Macro parameters:
;   %1  instruction-set suffix appended to the exported symbol name
;   %2  method selector: min_max or or_abs
;
; The instantiating code must %define ABS2 (4-operand two-register abs) and,
; for every variant except mmx, PSHUFLW (low-word shuffle).
;
; Each iteration consumes 2*mmsize bytes (mmsize int16 elements) and the loop
; always runs at least once.
; NOTE(review): mova/pabsw memory operands presumably require src to be
; mmsize-aligned -- confirm against x86inc.asm and the callers.
;-----------------------------------------------------------------------------

%macro AC3_MAX_MSB_ABS_INT16 2
cglobal ac3_max_msb_abs_int16_%1, 2,2,5, src, len
    pxor        m2, m2                  ; m2 = running min (min_max) / running OR (or_abs)
    pxor        m3, m3                  ; m3 = running max (min_max) / scratch (mmx or_abs)
.loop:
%ifidn %2, min_max
    mova        m0, [srcq]
    mova        m1, [srcq+mmsize]
    pminsw      m2, m0
    pminsw      m2, m1
    pmaxsw      m3, m0
    pmaxsw      m3, m1
%else ; or_abs
%ifidn %1, mmx
    mova        m0, [srcq]
    mova        m1, [srcq+mmsize]
    ABS2        m0, m1, m3, m4
%else ; ssse3
    ; using memory args is faster for ssse3
    pabsw       m0, [srcq]
    pabsw       m1, [srcq+mmsize]
%endif
    por         m2, m0
    por         m2, m1
%endif
    add       srcq, mmsize*2
    sub       lend, mmsize
    ja .loop
%ifidn %2, min_max
    ABS2        m2, m3, m0, m1
    por         m2, m3
%endif
    ; Horizontal reduction: OR every 16-bit lane of m2 into lane 0.
%ifidn %1, mmx
    ; pshufw is not part of the baseline MMX instruction set, so the mmx
    ; build folds the 64-bit register with plain shifts instead.
    mova        m0, m2
    psrlq       m0, 32
    por         m2, m0                  ; lanes 0,1 |= lanes 2,3
    mova        m0, m2
    psrlq       m0, 16
    por         m2, m0                  ; lane 0 |= lane 1
%else
%ifidn mmsize, 16
    movhlps     m0, m2
    por         m2, m0                  ; low half |= high half
%endif
    PSHUFLW     m0, m2, 0xe
    por         m2, m0                  ; lanes 0,1 |= lanes 2,3
    PSHUFLW     m0, m2, 0x1
    por         m2, m0                  ; lane 0 |= lane 1
%endif
    movd       eax, m2
    and        eax, 0xFFFF              ; keep lane 0 only
    RET
%endmacro

124 | |

125 |
; Instantiate ff_ac3_max_msb_abs_int16_{mmx,mmxext,sse2,ssse3}.
; ABS2 and PSHUFLW are the helper names referenced inside the macro; they are
; re-pointed before each group of instantiations.

; --- 64-bit (mm) register variants ---
%define PSHUFLW pshufw
%define ABS2    ABS2_MMX
INIT_MMX
AC3_MAX_MSB_ABS_INT16 mmx, or_abs

%define ABS2    ABS2_MMX2
AC3_MAX_MSB_ABS_INT16 mmxext, min_max

; --- 128-bit (xmm) register variants ---
%define PSHUFLW pshuflw
INIT_XMM
AC3_MAX_MSB_ABS_INT16 sse2, min_max     ; still uses ABS2_MMX2

%define ABS2    ABS2_SSSE3
AC3_MAX_MSB_ABS_INT16 ssse3, or_abs

136 | |

137 |
;-----------------------------------------------------------------------------
; AC3_SHIFT: common body for ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
;
; Macro parameters:
;   %1  direction letter used in the symbol name (l or r)
;   %2  element width in bits (16 or 32)
;   %3  packed shift instruction applied to every vector
;   %4  instruction-set suffix appended to the symbol name
;
; Function arguments: src (shifted in place), len (element count),
; shift (bit count, moved into m0 and used as the shift operand).
;
; One iteration handles four vectors (4*mmsize bytes), which is
; mmsize*32/%2 elements; the body always executes at least once.
; NOTE(review): mova is used for all accesses, so src is presumably required
; to be mmsize-aligned -- confirm against x86inc.asm and the callers.
;-----------------------------------------------------------------------------

%macro AC3_SHIFT 4 ; l/r, 16/32, shift instruction, instruction set
cglobal ac3_%1shift_int%2_%4, 3,3,5, src, len, shift
    movd            m0, shiftd          ; shift count lives in m0 for the whole loop
.loop:
    ; load four consecutive vectors
    mova            m1, [srcq         ]
    mova            m2, [srcq+mmsize  ]
    mova            m3, [srcq+mmsize*2]
    mova            m4, [srcq+mmsize*3]
    ; shift each by the same count
    %3              m1, m0
    %3              m2, m0
    %3              m3, m0
    %3              m4, m0
    ; store the results back over the input
    mova  [srcq         ], m1
    mova  [srcq+mmsize  ], m2
    mova  [srcq+mmsize*2], m3
    mova  [srcq+mmsize*3], m4
    add           srcq, mmsize*4
    sub           lend, mmsize*32/%2    ; elements consumed this iteration
    ja .loop
.end:
    REP_RET
%endmacro

163 | |

164 |
;-----------------------------------------------------------------------------
; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
;
; Generated from AC3_SHIFT with 16-bit elements and psllw as the shift
; instruction; emitted once for mm registers and once for xmm registers.
;-----------------------------------------------------------------------------

INIT_MMX
AC3_SHIFT l, 16, psllw, mmx             ; ac3_lshift_int16_mmx

INIT_XMM
AC3_SHIFT l, 16, psllw, sse2            ; ac3_lshift_int16_sse2

172 | |

173 |
;-----------------------------------------------------------------------------
; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
;
; Generated from AC3_SHIFT with 32-bit elements and psrad as the shift
; instruction; emitted once for mm registers and once for xmm registers.
;-----------------------------------------------------------------------------

INIT_MMX
AC3_SHIFT r, 32, psrad, mmx             ; ac3_rshift_int32_mmx

INIT_XMM
AC3_SHIFT r, 32, psrad, sse2            ; ac3_rshift_int32_sse2