ffmpeg / libavcodec / x86 / fmtconvert_mmx.c @ c73d99e6
/*
 * Format Conversion Utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/fmtconvert.h"

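/*
 * int32_to_float_fmul_scalar: convert len 32-bit integers to floats and
 * scale each one by mul.  As a rough scalar reference sketch (not the code
 * actually run here):
 *
 *     for (i = 0; i < len; i++)
 *         dst[i] = src[i] * mul;
 *
 * The SIMD loops below index both buffers from the end with a negative
 * offset that counts up to zero, convert 8 ints (32 bytes) per iteration,
 * and store with movaps, so they assume len is a multiple of 8 and dst is
 * 16-byte aligned.
 */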
static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
{
    x86_reg i = -4*len;
    __asm__ volatile(
        "movss  %3, %%xmm4 \n"
        "shufps $0, %%xmm4, %%xmm4 \n"
        "1: \n"
        "cvtpi2ps   (%2,%0), %%xmm0 \n"
        "cvtpi2ps  8(%2,%0), %%xmm1 \n"
        "cvtpi2ps 16(%2,%0), %%xmm2 \n"
        "cvtpi2ps 24(%2,%0), %%xmm3 \n"
        "movlhps  %%xmm1,    %%xmm0 \n"
        "movlhps  %%xmm3,    %%xmm2 \n"
        "mulps    %%xmm4,    %%xmm0 \n"
        "mulps    %%xmm4,    %%xmm2 \n"
        "movaps   %%xmm0,   (%1,%0) \n"
        "movaps   %%xmm2, 16(%1,%0) \n"
        "add $32, %0 \n"
        "jl 1b \n"
        :"+r"(i)
        :"r"(dst+len), "r"(src+len), "m"(mul)
    );
}

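/*
 * SSE2 variant: cvtdq2ps converts four packed ints at once in an XMM
 * register, replacing the cvtpi2ps/movlhps pairs above, which can only
 * convert two ints at a time from an MMX-sized memory operand.
 */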
static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
{
    x86_reg i = -4*len;
    __asm__ volatile(
        "movss  %3, %%xmm4 \n"
        "shufps $0, %%xmm4, %%xmm4 \n"
        "1: \n"
        "cvtdq2ps   (%2,%0), %%xmm0 \n"
        "cvtdq2ps 16(%2,%0), %%xmm1 \n"
        "mulps    %%xmm4,    %%xmm0 \n"
        "mulps    %%xmm4,    %%xmm1 \n"
        "movaps   %%xmm0,   (%1,%0) \n"
        "movaps   %%xmm1, 16(%1,%0) \n"
        "add $32, %0 \n"
        "jl 1b \n"
        :"+r"(i)
        :"r"(dst+len), "r"(src+len), "m"(mul)
    );
}

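/*
 * float_to_int16: convert len floats to signed 16-bit samples.  packssdw
 * saturates, so values outside [-32768, 32767] clamp instead of wrapping;
 * a rough scalar sketch of the same operation is
 * dst[i] = av_clip_int16(lrintf(src[i])).  All three versions below emit
 * 8 samples per iteration and assume len is a multiple of 8.
 */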
static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
    x86_reg reglen = len;
    // not bit-exact: pf2id uses different rounding than C and SSE
    __asm__ volatile(
        "add        %0          , %0        \n\t"
        "lea        (%2,%0,2)   , %2        \n\t"
        "add        %0          , %1        \n\t"
        "neg        %0                      \n\t"
        "1:                                 \n\t"
        "pf2id      (%2,%0,2)   , %%mm0     \n\t"
        "pf2id      8(%2,%0,2)  , %%mm1     \n\t"
        "pf2id      16(%2,%0,2) , %%mm2     \n\t"
        "pf2id      24(%2,%0,2) , %%mm3     \n\t"
        "packssdw   %%mm1       , %%mm0     \n\t"
        "packssdw   %%mm3       , %%mm2     \n\t"
        "movq       %%mm0       , (%1,%0)   \n\t"
        "movq       %%mm2       , 8(%1,%0)  \n\t"
        "add        $16         , %0        \n\t"
        "js 1b                              \n\t"
        "femms                              \n\t"
        :"+r"(reglen), "+r"(dst), "+r"(src)
    );
}

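/*
 * SSE version: same flow as above, but cvtps2pi rounds according to MXCSR
 * (round-to-nearest-even by default) instead of pf2id's truncation, and
 * emms rather than femms clears the MMX register state.
 */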
static void float_to_int16_sse(int16_t *dst, const float *src, long len){
    x86_reg reglen = len;
    __asm__ volatile(
        "add        %0          , %0        \n\t"
        "lea        (%2,%0,2)   , %2        \n\t"
        "add        %0          , %1        \n\t"
        "neg        %0                      \n\t"
        "1:                                 \n\t"
        "cvtps2pi   (%2,%0,2)   , %%mm0     \n\t"
        "cvtps2pi   8(%2,%0,2)  , %%mm1     \n\t"
        "cvtps2pi   16(%2,%0,2) , %%mm2     \n\t"
        "cvtps2pi   24(%2,%0,2) , %%mm3     \n\t"
        "packssdw   %%mm1       , %%mm0     \n\t"
        "packssdw   %%mm3       , %%mm2     \n\t"
        "movq       %%mm0       , (%1,%0)   \n\t"
        "movq       %%mm2       , 8(%1,%0)  \n\t"
        "add        $16         , %0        \n\t"
        "js 1b                              \n\t"
        "emms                               \n\t"
        :"+r"(reglen), "+r"(dst), "+r"(src)
    );
}

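/*
 * SSE2 version: stays entirely in XMM registers, so no emms is needed;
 * the movdqa store additionally requires dst to be 16-byte aligned.
 */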
static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
    x86_reg reglen = len;
    __asm__ volatile(
        "add        %0          , %0        \n\t"
        "lea        (%2,%0,2)   , %2        \n\t"
        "add        %0          , %1        \n\t"
        "neg        %0                      \n\t"
        "1:                                 \n\t"
        "cvtps2dq   (%2,%0,2)   , %%xmm0    \n\t"
        "cvtps2dq   16(%2,%0,2) , %%xmm1    \n\t"
        "packssdw   %%xmm1      , %%xmm0    \n\t"
        "movdqa     %%xmm0      , (%1,%0)   \n\t"
        "add        $16         , %0        \n\t"
        "js 1b                              \n\t"
        :"+r"(reglen), "+r"(dst), "+r"(src)
    );
}

void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);

#if !HAVE_YASM
#define ff_float_to_int16_interleave6_sse(a,b,c)   float_to_int16_interleave_misc_sse(a,b,c,6)
#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
#define ff_float_to_int16_interleave6_3dn2(a,b,c)  float_to_int16_interleave_misc_3dnow(a,b,c,6)
#endif
#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse

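/*
 * Interleaving: float_to_int16_interleave_##cpu writes channel-interleaved
 * s16 output (L R L R ... for stereo).  Mono forwards to the plain
 * converter, stereo runs the asm body passed into the macro, 6 channels go
 * to the yasm routines declared above (or the misc fallback when yasm is
 * unavailable), and any other count converts one channel at a time into a
 * temporary buffer and scatters it into place.
 */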
#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2 */\
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
    DECLARE_ALIGNED(16, int16_t, tmp)[len];\
    int i,j,c;\
    for(c=0; c<channels; c++){\
        float_to_int16_##cpu(tmp, src[c], len);\
        for(i=0, j=c; i<len; i++, j+=channels)\
            dst[j] = tmp[i];\
    }\
}\
\
static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
    if(channels==1)\
        float_to_int16_##cpu(dst, src[0], len);\
    else if(channels==2){\
        x86_reg reglen = len;\
        const float *src0 = src[0];\
        const float *src1 = src[1];\
        __asm__ volatile(\
            "shl $2, %0 \n"\
            "add %0, %1 \n"\
            "add %0, %2 \n"\
            "add %0, %3 \n"\
            "neg %0 \n"\
            body\
            :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
        );\
    }else if(channels==6){\
        ff_float_to_int16_interleave6_##cpu(dst, src, len);\
    }else\
        float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
}

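/*
 * The stereo bodies below share one pattern: convert four samples from each
 * channel, pack them to words (left channel in one register, right in
 * another), then interleave with punpcklwd/punpckhwd so the stores emit
 * L0 R0 L1 R1 ... in order.
 */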
FLOAT_TO_INT16_INTERLEAVE(3dnow,
    "1:                         \n"
    "pf2id     (%2,%0), %%mm0   \n"
    "pf2id    8(%2,%0), %%mm1   \n"
    "pf2id     (%3,%0), %%mm2   \n"
    "pf2id    8(%3,%0), %%mm3   \n"
    "packssdw   %%mm1,  %%mm0   \n"
    "packssdw   %%mm3,  %%mm2   \n"
    "movq       %%mm0,  %%mm1   \n"
    "punpcklwd  %%mm2,  %%mm0   \n"
    "punpckhwd  %%mm2,  %%mm1   \n"
    "movq       %%mm0,  (%1,%0) \n"
    "movq       %%mm1, 8(%1,%0) \n"
    "add $16, %0                \n"
    "js 1b                      \n"
    "femms                      \n"
)

FLOAT_TO_INT16_INTERLEAVE(sse,
    "1:                         \n"
    "cvtps2pi  (%2,%0), %%mm0   \n"
    "cvtps2pi 8(%2,%0), %%mm1   \n"
    "cvtps2pi  (%3,%0), %%mm2   \n"
    "cvtps2pi 8(%3,%0), %%mm3   \n"
    "packssdw   %%mm1,  %%mm0   \n"
    "packssdw   %%mm3,  %%mm2   \n"
    "movq       %%mm0,  %%mm1   \n"
    "punpcklwd  %%mm2,  %%mm0   \n"
    "punpckhwd  %%mm2,  %%mm1   \n"
    "movq       %%mm0,  (%1,%0) \n"
    "movq       %%mm1, 8(%1,%0) \n"
    "add $16, %0                \n"
    "js 1b                      \n"
    "emms                       \n"
)

FLOAT_TO_INT16_INTERLEAVE(sse2,
    "1:                         \n"
    "cvtps2dq  (%2,%0), %%xmm0  \n"
    "cvtps2dq  (%3,%0), %%xmm1  \n"
    "packssdw   %%xmm1, %%xmm0  \n"
    "movhlps    %%xmm0, %%xmm1  \n"
    "punpcklwd  %%xmm1, %%xmm0  \n"
    "movdqa     %%xmm0, (%1,%0) \n"
    "add $16, %0                \n"
    "js 1b                      \n"
)

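/*
 * 3DNow!ext has no separate per-sample converter; this wrapper only routes
 * the 6-channel case to the faster ff_float_to_int16_interleave6_3dn2 and
 * falls back to the plain 3DNow! version for everything else.
 */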
static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
    if(channels==6)
        ff_float_to_int16_interleave6_3dn2(dst, src, len);
    else
        float_to_int16_interleave_3dnow(dst, src, len, channels);
}

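/*
 * Dispatch: the checks run in order 3DNow!, 3DNow!ext, SSE, SSE2, and later
 * assignments override earlier ones, so the best instruction set the CPU
 * reports wins.  The 3DNow! paths are skipped under CODEC_FLAG_BITEXACT
 * because pf2id's truncation diverges from the C reference.
 */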
void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX) {
        if (mm_flags & AV_CPU_FLAG_3DNOW) {
            if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
                c->float_to_int16            = float_to_int16_3dnow;
                c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
            }
        }
        if (mm_flags & AV_CPU_FLAG_3DNOWEXT) {
            if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
                c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
            }
        }
        if (mm_flags & AV_CPU_FLAG_SSE) {
            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
            c->float_to_int16             = float_to_int16_sse;
            c->float_to_int16_interleave  = float_to_int16_interleave_sse;
        }
        if (mm_flags & AV_CPU_FLAG_SSE2) {
            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
            c->float_to_int16             = float_to_int16_sse2;
            c->float_to_int16_interleave  = float_to_int16_interleave_sse2;
        }
    }
}