Statistics
| Branch: | Revision:

ffmpeg / libavcodec / x86 / ac3dsp.asm @ 888fa31e

History | View | Annotate | Download (7.79 KB)

1
;*****************************************************************************
2
;* x86-optimized AC-3 DSP utils
3
;* Copyright (c) 2011 Justin Ruggles
4
;*
5
;* This file is part of Libav.
6
;*
7
;* Libav is free software; you can redistribute it and/or
8
;* modify it under the terms of the GNU Lesser General Public
9
;* License as published by the Free Software Foundation; either
10
;* version 2.1 of the License, or (at your option) any later version.
11
;*
12
;* Libav is distributed in the hope that it will be useful,
13
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
;* Lesser General Public License for more details.
16
;*
17
;* You should have received a copy of the GNU Lesser General Public
18
;* License along with Libav; if not, write to the Free Software
19
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
;******************************************************************************
21

    
22
%include "x86inc.asm"
23
%include "x86util.asm"
24

    
25
SECTION_RODATA
26

    
27
; 16777216.0f - used in ff_float_to_fixed24()
28
; Four packed dwords holding the single-precision bit pattern 0x4B800000
; (2^24). The float_to_fixed24 variants in this file load it once before
; their loop (movq for 3DNow!, movaps for SSE/SSE2) and multiply every input
; by it.
; NOTE(review): the movaps loads need this label to be 16-byte aligned;
; presumably SECTION_RODATA guarantees that - confirm in x86inc.asm.
pf_1_24: times 4 dd 0x4B800000
29

    
30
SECTION .text
31

    
32
;-----------------------------------------------------------------------------
33
; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
34
;-----------------------------------------------------------------------------
35

    
36
; AC3_EXPONENT_MIN <suffix>
;
; Emits ac3_exponent_min_<suffix>. For each byte position in block 0, the
; function stores the unsigned minimum of that byte and of the bytes at the
; same position in the following num_reuse_blocks blocks. Consecutive blocks
; are 256 bytes apart and the result overwrites block 0. Nothing is done when
; num_reuse_blocks is 0.
;
; Expand only after INIT_MMX/INIT_XMM and with PMINUB and LOOP_ALIGN defined;
; the body relies on mmsize, mova, m0 and m1.
;
; Register roles:
;   exp        - data pointer, advanced by mmsize per outer iteration
;   reuse_blks - byte offset of the last block (num_reuse_blocks * 256)
;   expn       - coefficients still to process
;   offset     - byte offset of the block currently being merged
;   m0         - running minimum; m1 - extra operand handed to PMINUB
%macro AC3_EXPONENT_MIN 1
cglobal ac3_exponent_min_%1, 3,4,2, exp, reuse_blks, expn, offset
    ; num_reuse_blocks is a 32-bit int. Shifting the dword form zero-extends
    ; into the full register, so stale upper bits on x86-64 cannot leak into
    ; the address computation below. ZF still reports a zero block count.
    shl   reuse_blksd, 8
    jz .done
    LOOP_ALIGN
.column:
    ; start from the last block and walk back towards block 0
    mov       offsetq, reuse_blksq
    mova           m0, [expq+offsetq]
    sub       offsetq, 256
    LOOP_ALIGN
.block:
    PMINUB         m0, [expq+offsetq], m1
    sub       offsetq, 256
    jae .block                      ; borrow => block 0 was just merged
    mova       [expq], m0
    add          expq, mmsize
    ; nb_coefs is a 32-bit int as well: count it down in its dword form so
    ; the signed test only looks at defined bits.
    sub         expnd, mmsize
    jg .column
.done:
    REP_RET
%endmacro
57

    
58
; Instantiate ac3_exponent_min once per instruction set.

; mmx: PMINUB goes through the PMINUB_MMX helper and the loop heads are left
; unaligned (LOOP_ALIGN expands to nothing).
INIT_MMX
%define LOOP_ALIGN
%define PMINUB PMINUB_MMX
AC3_EXPONENT_MIN mmx

; mmxext: switch to PMINUB_MMXEXT and align both loop heads to 16 bytes.
%ifdef HAVE_MMX2
%define LOOP_ALIGN ALIGN 16
%define PMINUB PMINUB_MMXEXT
AC3_EXPONENT_MIN mmxext
%endif

; sse2: same body with XMM registers. PMINUB and LOOP_ALIGN keep whatever the
; previous step left defined.
; NOTE(review): this variant is gated on HAVE_SSE rather than an SSE2-specific
; symbol - confirm that is intended.
%ifdef HAVE_SSE
INIT_XMM
AC3_EXPONENT_MIN sse2
%endif

; keep the helper names from leaking into the rest of the file
%undef LOOP_ALIGN
%undef PMINUB
73

    
74
;-----------------------------------------------------------------------------
75
; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
76
;
77
; This function uses 2 different methods to calculate a valid result.
78
; 1) logical 'or' of abs of each element
79
;        This is used for ssse3 because of the pabsw instruction.
80
;        It is also used for mmx because of the lack of min/max instructions.
81
; 2) calculate min/max for the array, then or(abs(min),abs(max))
82
;        This is used for mmxext and sse2 because they have pminsw/pmaxsw.
83
;-----------------------------------------------------------------------------
84

    
85
; AC3_MAX_MSB_ABS_INT16 <suffix>, <min_max | or_abs>
;
; Emits ac3_max_msb_abs_int16_<suffix>. The function returns, in the low 16
; bits of eax, a value combining the magnitudes of all int16 elements in src;
; the second macro argument selects how it is built (see the header comment
; above).
;
; Each iteration consumes two registers' worth of data (mmsize*2 bytes, i.e.
; mmsize int16 elements) with aligned loads, so len is counted down by mmsize.
; Expand only after INIT_MMX/INIT_XMM and with ABS2 and PSHUFLW defined.
;
; Register roles:
;   m2 - accumulator: OR of absolute values (or_abs) / per-lane minimum (min_max)
;   m3 - per-lane maximum (min_max); scratch for ABS2 in the mmx or_abs path
;   m0, m1 - the two vectors loaded per iteration, later reused as scratch
%macro AC3_MAX_MSB_ABS_INT16 2
cglobal ac3_max_msb_abs_int16_%1, 2,2,5, src, len
    ; both accumulators start at zero
    pxor        m3, m3
    pxor        m2, m2
.next_chunk:
%ifidn %2, min_max
    ; track the smallest and largest value seen in every lane
    mova        m0, [srcq]
    mova        m1, [srcq+mmsize]
    pminsw      m2, m0
    pmaxsw      m3, m0
    pminsw      m2, m1
    pmaxsw      m3, m1
%else ; or_abs
%ifidn %1, mmx
    ; no pabsw here: load first, then take |x| through the ABS2 helper
    mova        m0, [srcq]
    mova        m1, [srcq+mmsize]
    ABS2        m0, m1, m3, m4
%else ; ssse3
    ; using memory args is faster for ssse3
    pabsw       m0, [srcq]
    pabsw       m1, [srcq+mmsize]
%endif
    ; fold both vectors of absolute values into the accumulator
    por         m2, m0
    por         m2, m1
%endif
    add       srcq, mmsize*2
    sub       lend, mmsize
    ja .next_chunk
%ifidn %2, min_max
    ; combine |min| and |max| per lane
    ABS2        m2, m3, m0, m1
    por         m2, m3
%endif
%ifidn mmsize, 16
    ; fold the upper 64 bits onto the lower 64 bits
    movhlps     m0, m2
    por         m2, m0
%endif
    ; fold words 2 and 3 onto words 0 and 1, then word 1 onto word 0
    PSHUFLW     m0, m2, 0xe
    por         m2, m0
    PSHUFLW     m0, m2, 0x1
    por         m2, m0
    ; only the lowest word carries the result
    movd       eax, m2
    and        eax, 0xFFFF
    RET
%endmacro
129

    
130
; Instantiate ac3_max_msb_abs_int16 for each instruction set. ABS2 and PSHUFLW
; are redefined as needed before every expansion and stay defined afterwards.

; mmx: OR-of-absolute-values method, 64-bit registers.
; NOTE(review): PSHUFLW expands to pshufw here, which is not a base-MMX
; instruction - confirm this variant is only selected on CPUs that also
; provide MMXEXT/SSE.
INIT_MMX
%define PSHUFLW pshufw
%define ABS2 ABS2_MMX
AC3_MAX_MSB_ABS_INT16 mmx, or_abs

; mmxext: min/max method, still 64-bit registers and pshufw.
%define ABS2 ABS2_MMX2
AC3_MAX_MSB_ABS_INT16 mmxext, min_max

; sse2: min/max method on XMM registers; ABS2 stays ABS2_MMX2.
INIT_XMM
%define PSHUFLW pshuflw
AC3_MAX_MSB_ABS_INT16 sse2, min_max

; ssse3: OR-of-absolute-values method on XMM registers (pabsw in the loop).
%define ABS2 ABS2_SSSE3
AC3_MAX_MSB_ABS_INT16 ssse3, or_abs
141

    
142
;-----------------------------------------------------------------------------
143
; macro used for ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
144
;-----------------------------------------------------------------------------
145

    
146
; AC3_SHIFT <l|r>, <16|32>, <shift instruction>, <suffix>
;
; Emits ac3_<l|r>shift_int<16|32>_<suffix>(src, len, shift), which shifts every
; element of src in place by the same count using the given packed-shift
; instruction.
;
; One iteration handles four registers (mmsize*4 bytes) with aligned loads and
; stores. That is mmsize*32/<element bits> elements, which is what len is
; decremented by. Expand only after INIT_MMX/INIT_XMM.
;
; Register roles: m0 = shift count, m1-m4 = data being shifted.
%macro AC3_SHIFT 4 ; l/r, 16/32, shift instruction, instruction set
cglobal ac3_%1shift_int%2_%4, 3,3,5, src, len, shift
    ; the packed shifts take their count from a vector register
    movd      m0, shiftd
.next_group:
    ; load four vectors
    mova      m1, [srcq         ]
    mova      m2, [srcq+mmsize  ]
    mova      m3, [srcq+mmsize*2]
    mova      m4, [srcq+mmsize*3]
    ; shift them all by the same count
    %3        m1, m0
    %3        m2, m0
    %3        m3, m0
    %3        m4, m0
    ; write the results back over the input
    mova  [srcq         ], m1
    mova  [srcq+mmsize  ], m2
    mova  [srcq+mmsize*2], m3
    mova  [srcq+mmsize*3], m4
    add     srcq, mmsize*4
    sub     lend, mmsize*32/%2
    ja .next_group
.end:
    REP_RET
%endmacro
168

    
169
;-----------------------------------------------------------------------------
170
; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
171
;-----------------------------------------------------------------------------
172

    
173
; 16-bit left shift (psllw): ac3_lshift_int16_mmx on 64-bit MMX registers
INIT_MMX
AC3_SHIFT l, 16, psllw, mmx

; ... and ac3_lshift_int16_sse2 on 128-bit XMM registers
INIT_XMM
AC3_SHIFT l, 16, psllw, sse2
177

    
178
;-----------------------------------------------------------------------------
179
; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
180
;-----------------------------------------------------------------------------
181

    
182
; 32-bit arithmetic right shift (psrad): ac3_rshift_int32_mmx on 64-bit MMX
; registers
INIT_MMX
AC3_SHIFT r, 32, psrad, mmx

; ... and ac3_rshift_int32_sse2 on 128-bit XMM registers
INIT_XMM
AC3_SHIFT r, 32, psrad, sse2
186

    
187
;-----------------------------------------------------------------------------
188
; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
189
;-----------------------------------------------------------------------------
190

    
191
; The 3DNow! version is not bit-identical because pf2id uses truncation rather
192
; than round-to-nearest.
193
; float_to_fixed24_3dnow(dst, src, len)
;
; Multiplies each float in src by the pf_1_24 constant and stores the result,
; converted to int32 with pf2id, in dst. Eight floats (32 bytes) are handled
; per iteration, two per MMX register, and len is counted down by 8.
;
; Register roles: m0 = scale factor, m1-m4 = data.
INIT_MMX
cglobal float_to_fixed24_3dnow, 3,3,0, dst, src, len
    movq   m0, [pf_1_24]
.loop:
    movq   m1, [srcq   ]
    movq   m2, [srcq+8 ]
    movq   m3, [srcq+16]
    movq   m4, [srcq+24]
    pfmul  m1, m0
    pfmul  m2, m0
    pfmul  m3, m0
    pfmul  m4, m0
    pf2id  m1, m1
    pf2id  m2, m2
    pf2id  m3, m3
    pf2id  m4, m4
    movq  [dstq   ], m1
    movq  [dstq+8 ], m2
    movq  [dstq+16], m3
    movq  [dstq+24], m4
    add  srcq, 32
    add  dstq, 32
    sub  lend, 8
    ja .loop
    ; The MMX registers alias the x87 stack. Release them before returning so
    ; floating-point code in the caller does not find the x87 tag word still
    ; marked in use.
    femms
    ; the return no longer directly follows a branch, so plain RET is enough
    RET
218

    
219
; float_to_fixed24_sse(dst, src, len)
;
; Multiplies each float in src by the pf_1_24 constant and stores the result,
; converted to int32, in dst. Without SSE2 the conversion goes through
; cvtps2pi, which turns the low two floats of an XMM register into two int32
; in an MMX register; movhlps brings the high pair down for the second
; conversion. Eight floats (32 bytes) are handled per iteration with aligned
; loads, and len is counted down by 8.
;
; Register roles: m0 = scale factor, m1/m2 = scaled floats,
;                 mm0-mm3 = converted integers.
INIT_XMM
cglobal float_to_fixed24_sse, 3,3,3, dst, src, len
    movaps     m0, [pf_1_24]
.loop:
    movaps     m1, [srcq   ]
    movaps     m2, [srcq+16]
    mulps      m1, m0
    mulps      m2, m0
    cvtps2pi  mm0, m1               ; floats 0-1
    movhlps    m1, m1
    cvtps2pi  mm1, m1               ; floats 2-3
    cvtps2pi  mm2, m2               ; floats 4-5
    movhlps    m2, m2
    cvtps2pi  mm3, m2               ; floats 6-7
    movq  [dstq   ], mm0
    movq  [dstq+ 8], mm1
    movq  [dstq+16], mm2
    movq  [dstq+24], mm3
    add      srcq, 32
    add      dstq, 32
    sub      lend, 8
    ja .loop
    ; cvtps2pi wrote to MMX registers, which alias the x87 stack. Release them
    ; before returning so floating-point code in the caller does not find the
    ; x87 tag word still marked in use.
    emms
    ; the return no longer directly follows a branch, so plain RET is enough
    RET
242

    
243
; float_to_fixed24_sse2(dst, src, len)
;
; Multiplies each float in src by the pf_1_24 constant and stores the result,
; converted to int32 with cvtps2dq, in dst. Loads and stores are aligned.
;
; When m8 is defined the loop is unrolled over eight XMM registers (32 floats,
; 128 bytes per iteration); otherwise over four (16 floats, 64 bytes).
;
; Register roles: m0 = scale factor, m1-m4 (and m5-m8 when available) = data.
INIT_XMM
cglobal float_to_fixed24_sse2, 3,3,9, dst, src, len
    movaps     m0, [pf_1_24]
.loop:
    movaps     m1, [srcq    ]
    movaps     m2, [srcq+16 ]
    movaps     m3, [srcq+32 ]
    movaps     m4, [srcq+48 ]
%ifdef m8
    movaps     m5, [srcq+64 ]
    movaps     m6, [srcq+80 ]
    movaps     m7, [srcq+96 ]
    movaps     m8, [srcq+112]
%endif
    mulps      m1, m0
    mulps      m2, m0
    mulps      m3, m0
    mulps      m4, m0
%ifdef m8
    mulps      m5, m0
    mulps      m6, m0
    mulps      m7, m0
    mulps      m8, m0
%endif
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    cvtps2dq   m4, m4
%ifdef m8
    cvtps2dq   m5, m5
    cvtps2dq   m6, m6
    cvtps2dq   m7, m7
    cvtps2dq   m8, m8
%endif
    movdqa  [dstq    ], m1
    movdqa  [dstq+16 ], m2
    movdqa  [dstq+32 ], m3
    movdqa  [dstq+48 ], m4
    ; len is a 32-bit unsigned int: count it down in its dword form, like the
    ; 3DNow! and SSE versions do, so the loop test never depends on the
    ; undefined upper half of the argument register on x86-64.
%ifdef m8
    movdqa  [dstq+64 ], m5
    movdqa  [dstq+80 ], m6
    movdqa  [dstq+96 ], m7
    movdqa  [dstq+112], m8
    add      srcq, 128
    add      dstq, 128
    sub      lend, 32
%else
    add      srcq, 64
    add      dstq, 64
    sub      lend, 16
%endif
    ja .loop
    REP_RET