ffmpeg / libavcodec / x86 / ac3dsp.asm @ 888fa31e
History | View | Annotate | Download (7.79 KB)
1 | dda3f0ef | Justin Ruggles | ;***************************************************************************** |
---|---|---|---|
2 | ;* x86-optimized AC-3 DSP utils |
||
3 | ;* Copyright (c) 2011 Justin Ruggles |
||
4 | ;* |
||
5 | 2912e87a | Mans Rullgard | ;* This file is part of Libav. |
6 | dda3f0ef | Justin Ruggles | ;* |
7 | 2912e87a | Mans Rullgard | ;* Libav is free software; you can redistribute it and/or |
8 | dda3f0ef | Justin Ruggles | ;* modify it under the terms of the GNU Lesser General Public |
9 | ;* License as published by the Free Software Foundation; either |
||
10 | ;* version 2.1 of the License, or (at your option) any later version. |
||
11 | ;* |
||
12 | 2912e87a | Mans Rullgard | ;* Libav is distributed in the hope that it will be useful, |
13 | dda3f0ef | Justin Ruggles | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
15 | ;* Lesser General Public License for more details. |
||
16 | ;* |
||
17 | ;* You should have received a copy of the GNU Lesser General Public |
||
18 | 2912e87a | Mans Rullgard | ;* License along with Libav; if not, write to the Free Software |
19 | 888fa31e | Diego Biurrun | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
20 | dda3f0ef | Justin Ruggles | ;****************************************************************************** |
21 | |||
22 | %include "x86inc.asm" |
||
23 | %include "x86util.asm" |
||
24 | |||
SECTION_RODATA

; 16777216.0f (2^24) given as its raw single-precision bit pattern, repeated
; four times so that it fills one 16-byte vector.
; Used as the scale factor in ff_float_to_fixed24().
pf_1_24: dd 0x4B800000, 0x4B800000, 0x4B800000, 0x4B800000

SECTION .text
31 | |||
;-----------------------------------------------------------------------------
; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
;
; For every group of mmsize bytes in the first block, takes the unsigned byte
; minimum of that group and of the groups at the same position in the
; following num_reuse_blocks blocks (consecutive blocks are 256 bytes apart)
; and writes the result back to the first block.
; Returns immediately when num_reuse_blocks is 0.
;
; Macro argument:
;   %1 - instruction-set suffix appended to the function name
;
; PMINUB and LOOP_ALIGN must be %defined before the macro is invoked.
; PMINUB is called with a third operand (m1) in addition to destination and
; source.
;-----------------------------------------------------------------------------

%macro AC3_EXPONENT_MIN 1
cglobal ac3_exponent_min_%1, 3,4,2, exp, reuse_blks, expn, offset
    ; Both int arguments are accessed through their 32-bit register names.
    ; On x86-64 the upper half of an int argument register is undefined, and
    ; a 32-bit write zero-extends, so reuse_blksq is clean after the shift.
    shl  reuse_blksd, 8                 ; byte offset of the last block
    jz .end                             ; nothing to merge
    LOOP_ALIGN
.nextexp:
    mov      offsetq, reuse_blksq       ; start at the last block
    mova          m0, [expq+offsetq]
    sub      offsetq, 256
    LOOP_ALIGN
.nextblk:
    PMINUB        m0, [expq+offsetq], m1
    sub      offsetq, 256               ; step back one block
    jae .nextblk                        ; leave once offset 0 has been merged
    mova      [expq], m0
    add         expq, mmsize
    sub        expnd, mmsize
    jg .nextexp
.end:
    REP_RET
%endmacro
||
57 | |||
; MMX build: PMINUB maps to PMINUB_MMX and the loop-alignment hook is empty.
%define LOOP_ALIGN
%define PMINUB PMINUB_MMX
INIT_MMX
AC3_EXPONENT_MIN mmx

; MMXEXT build: PMINUB maps to PMINUB_MMXEXT and loop entries are aligned
; to 16 bytes. Still assembled with the MMX register set selected above.
%ifdef HAVE_MMX2
%define LOOP_ALIGN ALIGN 16
%define PMINUB PMINUB_MMXEXT
AC3_EXPONENT_MIN mmxext
%endif

; SSE2 build: same macro body on XMM registers. PMINUB and LOOP_ALIGN keep
; whichever definitions are in effect at this point.
%ifdef HAVE_SSE
INIT_XMM
AC3_EXPONENT_MIN sse2
%endif

%undef LOOP_ALIGN
%undef PMINUB
||
73 | fbb6b49d | Justin Ruggles | |
;-----------------------------------------------------------------------------
; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
;
; This function uses 2 different methods to calculate a valid result.
; 1) logical 'or' of abs of each element
;        This is used for ssse3 because of the pabsw instruction.
;        It is also used for mmx because of the lack of min/max instructions.
; 2) calculate min/max for the array, then or(abs(min),abs(max))
;        This is used for mmxext and sse2 because they have pminsw/pmaxsw.
;
; Either way the per-lane results are OR-ed together and the low 16 bits are
; returned in eax. Each loop iteration consumes two vectors, i.e. mmsize
; int16 elements.
;
; Macro arguments:
;   %1 - instruction-set suffix appended to the function name
;   %2 - method selector: or_abs or min_max
;
; ABS2 must be %defined before invocation. PSHUFLW must be %defined for every
; variant except mmx.
;-----------------------------------------------------------------------------

%macro AC3_MAX_MSB_ABS_INT16 2
cglobal ac3_max_msb_abs_int16_%1, 2,2,5, src, len
    pxor        m2, m2                  ; or_abs: OR accumulator, min_max: min
    pxor        m3, m3                  ; min_max: max
.loop:
%ifidn %2, min_max
    mova        m0, [srcq]
    mova        m1, [srcq+mmsize]
    pminsw      m2, m0
    pminsw      m2, m1
    pmaxsw      m3, m0
    pmaxsw      m3, m1
%else ; or_abs
%ifidn %1, mmx
    mova        m0, [srcq]
    mova        m1, [srcq+mmsize]
    ABS2        m0, m1, m3, m4
%else ; ssse3
    ; using memory args is faster for ssse3
    pabsw       m0, [srcq]
    pabsw       m1, [srcq+mmsize]
%endif
    por         m2, m0
    por         m2, m1
%endif
    add       srcq, mmsize*2
    sub       lend, mmsize
    ja .loop
%ifidn %2, min_max
    ABS2        m2, m3, m0, m1
    por         m2, m3
%endif
    ; Horizontal OR: fold every word of m2 into its lowest word.
%ifidn mmsize, 16
    movhlps     m0, m2
    por         m2, m0
%endif
%ifidn %1, mmx
    ; pshufw is an SSE/MMXEXT instruction and must not appear in the plain
    ; MMX function, so the fold is done with 64-bit logical shifts instead.
    mova        m0, m2
    psrlq       m0, 32                  ; words 2,3 -> words 0,1
    por         m2, m0
    mova        m0, m2
    psrlq       m0, 16                  ; word 1 -> word 0
    por         m2, m0
%else
    PSHUFLW     m0, m2, 0xe
    por         m2, m0
    PSHUFLW     m0, m2, 0x1
    por         m2, m0
%endif
    movd       eax, m2
    and        eax, 0xFFFF
    RET
%endmacro

INIT_MMX
%define ABS2 ABS2_MMX
AC3_MAX_MSB_ABS_INT16 mmx, or_abs
%define ABS2 ABS2_MMX2
%define PSHUFLW pshufw
AC3_MAX_MSB_ABS_INT16 mmxext, min_max
INIT_XMM
%define PSHUFLW pshuflw
AC3_MAX_MSB_ABS_INT16 sse2, min_max
%define ABS2 ABS2_SSSE3
AC3_MAX_MSB_ABS_INT16 ssse3, or_abs
||
141 | f1efbca5 | Justin Ruggles | |
;-----------------------------------------------------------------------------
; Common body of ff_ac3_lshift_int16() and ff_ac3_rshift_int32().
;
; Macro arguments:
;   %1 - direction letter used in the function name (l or r)
;   %2 - element width in bits (16 or 32)
;   %3 - packed shift instruction applied to each vector
;   %4 - instruction-set suffix appended to the function name
;
; The array is shifted in place, four vector registers per iteration.
;-----------------------------------------------------------------------------

%macro AC3_SHIFT 4
cglobal ac3_%1shift_int%2_%4, 3,3,5, src, len, shift
    movd      m0, shiftd                ; shift count as a vector operand
.next_chunk:
    ; load four consecutive vectors
    mova      m1, [srcq]
    mova      m2, [srcq+mmsize]
    mova      m3, [srcq+mmsize*2]
    mova      m4, [srcq+mmsize*3]
    ; shift them all by the same count
    %3        m1, m0
    %3        m2, m0
    %3        m3, m0
    %3        m4, m0
    ; write them back to where they came from
    mova      [srcq],          m1
    mova      [srcq+mmsize],   m2
    mova      [srcq+mmsize*2], m3
    mova      [srcq+mmsize*3], m4
    add     srcq, mmsize*4
    ; elements per iteration = bytes per iteration * 8 / element width
    sub     lend, (mmsize*4*8)/%2
    ja .next_chunk
    REP_RET
%endmacro
||
168 | |||
;-----------------------------------------------------------------------------
; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
;
; Built from AC3_SHIFT with psllw on 16-bit elements, for MMX and SSE2.
;-----------------------------------------------------------------------------

INIT_MMX
AC3_SHIFT l, 16, psllw, mmx
INIT_XMM
AC3_SHIFT l, 16, psllw, sse2

;-----------------------------------------------------------------------------
; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
;
; Built from AC3_SHIFT with psrad on 32-bit elements, for MMX and SSE2.
;-----------------------------------------------------------------------------

INIT_MMX
AC3_SHIFT r, 32, psrad, mmx
INIT_XMM
AC3_SHIFT r, 32, psrad, sse2
||
186 | 0f999cfd | Justin Ruggles | |
;-----------------------------------------------------------------------------
; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
;-----------------------------------------------------------------------------

; The 3DNow! version is not bit-identical because pf2id uses truncation rather
; than round-to-nearest.
;
; Each iteration scales 8 floats by 2^24 (pf_1_24) and stores them as int32.
; The MMX/3DNow! registers alias the x87 stack, so femms is issued before
; returning to leave the FPU usable by the caller.
INIT_MMX
cglobal float_to_fixed24_3dnow, 3,3,0, dst, src, len
    movq   m0, [pf_1_24]
.loop:
    movq   m1, [srcq   ]
    movq   m2, [srcq+8 ]
    movq   m3, [srcq+16]
    movq   m4, [srcq+24]
    pfmul  m1, m0
    pfmul  m2, m0
    pfmul  m3, m0
    pfmul  m4, m0
    pf2id  m1, m1
    pf2id  m2, m2
    pf2id  m3, m3
    pf2id  m4, m4
    movq  [dstq   ], m1
    movq  [dstq+8 ], m2
    movq  [dstq+16], m3
    movq  [dstq+24], m4
    add  srcq, 32
    add  dstq, 32
    sub  lend, 8
    ja .loop
    femms                               ; clear MMX state before returning
    RET
||
218 | |||
; SSE version of ff_float_to_fixed24(dst, src, len).
;
; Each iteration scales 8 floats by 2^24 (pf_1_24) in XMM registers.
; cvtps2pi converts only the low two floats of its source and writes them to
; an MMX register, so each XMM vector is converted in two halves (movhlps
; brings the high half down). Because mm0-mm3 are written, emms is issued
; before returning to leave the x87 FPU usable by the caller.
INIT_XMM
cglobal float_to_fixed24_sse, 3,3,3, dst, src, len
    movaps     m0, [pf_1_24]
.loop:
    movaps     m1, [srcq   ]
    movaps     m2, [srcq+16]
    mulps      m1, m0
    mulps      m2, m0
    cvtps2pi  mm0, m1
    movhlps    m1, m1
    cvtps2pi  mm1, m1
    cvtps2pi  mm2, m2
    movhlps    m2, m2
    cvtps2pi  mm3, m2
    movq  [dstq   ], mm0
    movq  [dstq+ 8], mm1
    movq  [dstq+16], mm2
    movq  [dstq+24], mm3
    add      srcq, 32
    add      dstq, 32
    sub      lend, 8
    ja .loop
    emms                                ; clear MMX state before returning
    RET
||
242 | |||
; SSE2 version of ff_float_to_fixed24(dst, src, len).
;
; Scales floats by 2^24 (pf_1_24) and converts them with cvtps2dq.
; When register m8 is defined, 32 floats are processed per iteration using
; m1-m8; otherwise 16 floats are processed per iteration using m1-m4.
;
; len is an unsigned int, so the counter is updated through its 32-bit name:
; on x86-64 the upper half of the argument register is undefined.
INIT_XMM
cglobal float_to_fixed24_sse2, 3,3,9, dst, src, len
    movaps     m0, [pf_1_24]
.loop:
    movaps     m1, [srcq    ]
    movaps     m2, [srcq+16 ]
    movaps     m3, [srcq+32 ]
    movaps     m4, [srcq+48 ]
%ifdef m8
    movaps     m5, [srcq+64 ]
    movaps     m6, [srcq+80 ]
    movaps     m7, [srcq+96 ]
    movaps     m8, [srcq+112]
%endif
    mulps      m1, m0
    mulps      m2, m0
    mulps      m3, m0
    mulps      m4, m0
%ifdef m8
    mulps      m5, m0
    mulps      m6, m0
    mulps      m7, m0
    mulps      m8, m0
%endif
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    cvtps2dq   m4, m4
%ifdef m8
    cvtps2dq   m5, m5
    cvtps2dq   m6, m6
    cvtps2dq   m7, m7
    cvtps2dq   m8, m8
%endif
    movdqa  [dstq    ], m1
    movdqa  [dstq+16 ], m2
    movdqa  [dstq+32 ], m3
    movdqa  [dstq+48 ], m4
%ifdef m8
    movdqa  [dstq+64 ], m5
    movdqa  [dstq+80 ], m6
    movdqa  [dstq+96 ], m7
    movdqa  [dstq+112], m8
    add      srcq, 128
    add      dstq, 128
    sub      lend, 32
%else
    add      srcq, 64
    add      dstq, 64
    sub      lend, 16
%endif
    ja .loop
    REP_RET