ffmpeg / libavcodec / x86 / fmtconvert_mmx.c @ fe2ff6d2

/*
 * Format Conversion Utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/fmtconvert.h"

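/*
 * int32 -> float, scaled by a constant. A plain-C sketch of what both
 * variants below compute:
 *
 *     for (i = 0; i < len; i++)
 *         dst[i] = src[i] * mul;
 *
 * The loops run a negative byte offset from -4*len up to 0 against
 * end-of-buffer pointers, so a single add/jl pair both advances the
 * index and tests for termination. cvtpi2ps converts two int32 values
 * into the low half of an XMM register; movlhps merges two such halves
 * so each store writes 16 aligned bytes.
 */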
static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
{
    x86_reg i = -4*len;
    __asm__ volatile(
        "movss  %3, %%xmm4 \n"
        "shufps $0, %%xmm4, %%xmm4 \n"
        "1: \n"
        "cvtpi2ps   (%2,%0), %%xmm0 \n"
        "cvtpi2ps  8(%2,%0), %%xmm1 \n"
        "cvtpi2ps 16(%2,%0), %%xmm2 \n"
        "cvtpi2ps 24(%2,%0), %%xmm3 \n"
        "movlhps  %%xmm1,    %%xmm0 \n"
        "movlhps  %%xmm3,    %%xmm2 \n"
        "mulps    %%xmm4,    %%xmm0 \n"
        "mulps    %%xmm4,    %%xmm2 \n"
        "movaps   %%xmm0,   (%1,%0) \n"
        "movaps   %%xmm2, 16(%1,%0) \n"
        "add $32, %0 \n"
        "jl 1b \n"
        :"+r"(i)
        :"r"(dst+len), "r"(src+len), "m"(mul)
    );
}

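/* The SSE2 variant uses cvtdq2ps, which converts four int32 at once,
 * so no movlhps merging step is needed. */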
static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
{
    x86_reg i = -4*len;
    __asm__ volatile(
        "movss  %3, %%xmm4 \n"
        "shufps $0, %%xmm4, %%xmm4 \n"
        "1: \n"
        "cvtdq2ps   (%2,%0), %%xmm0 \n"
        "cvtdq2ps 16(%2,%0), %%xmm1 \n"
        "mulps    %%xmm4,    %%xmm0 \n"
        "mulps    %%xmm4,    %%xmm1 \n"
        "movaps   %%xmm0,   (%1,%0) \n"
        "movaps   %%xmm1, 16(%1,%0) \n"
        "add $32, %0 \n"
        "jl 1b \n"
        :"+r"(i)
        :"r"(dst+len), "r"(src+len), "m"(mul)
    );
}

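/*
 * float -> int16 with saturation. Roughly equivalent C (a sketch of the
 * generic conversion these routines accelerate):
 *
 *     for (i = 0; i < len; i++)
 *         dst[i] = av_clip_int16(lrintf(src[i]));
 *
 * The prologue turns len into a byte offset: dst advances by 2*len
 * bytes, src by 4*len, and the negated counter runs up to zero.
 * packssdw provides the clipping to the int16 range.
 */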
static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
    x86_reg reglen = len;
    // not bit-exact: pf2id uses different rounding than C and SSE
    __asm__ volatile(
        "add        %0          , %0        \n\t"
        "lea         (%2,%0,2)  , %2        \n\t"
        "add        %0          , %1        \n\t"
        "neg        %0                      \n\t"
        "1:                                 \n\t"
        "pf2id       (%2,%0,2)  , %%mm0     \n\t"
        "pf2id      8(%2,%0,2)  , %%mm1     \n\t"
        "pf2id     16(%2,%0,2)  , %%mm2     \n\t"
        "pf2id     24(%2,%0,2)  , %%mm3     \n\t"
        "packssdw   %%mm1       , %%mm0     \n\t"
        "packssdw   %%mm3       , %%mm2     \n\t"
        "movq       %%mm0       ,  (%1,%0)  \n\t"
        "movq       %%mm2       , 8(%1,%0)  \n\t"
        "add        $16         , %0        \n\t"
        " js 1b                             \n\t"
        "femms                              \n\t"
        :"+r"(reglen), "+r"(dst), "+r"(src)
    );
}

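/* Same loop with cvtps2pi, which rounds to nearest like the C code but
 * targets MMX registers, hence the closing emms. */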
static void float_to_int16_sse(int16_t *dst, const float *src, long len){
    x86_reg reglen = len;
    __asm__ volatile(
        "add        %0          , %0        \n\t"
        "lea         (%2,%0,2)  , %2        \n\t"
        "add        %0          , %1        \n\t"
        "neg        %0                      \n\t"
        "1:                                 \n\t"
        "cvtps2pi    (%2,%0,2)  , %%mm0     \n\t"
        "cvtps2pi   8(%2,%0,2)  , %%mm1     \n\t"
        "cvtps2pi  16(%2,%0,2)  , %%mm2     \n\t"
        "cvtps2pi  24(%2,%0,2)  , %%mm3     \n\t"
        "packssdw   %%mm1       , %%mm0     \n\t"
        "packssdw   %%mm3       , %%mm2     \n\t"
        "movq       %%mm0       ,  (%1,%0)  \n\t"
        "movq       %%mm2       , 8(%1,%0)  \n\t"
        "add        $16         , %0        \n\t"
        " js 1b                             \n\t"
        "emms                               \n\t"
        :"+r"(reglen), "+r"(dst), "+r"(src)
    );
}

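/* SSE2 stays entirely in XMM registers: cvtps2dq converts four floats
 * per instruction and no emms is needed. */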
static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
    x86_reg reglen = len;
    __asm__ volatile(
        "add        %0          , %0        \n\t"
        "lea         (%2,%0,2)  , %2        \n\t"
        "add        %0          , %1        \n\t"
        "neg        %0                      \n\t"
        "1:                                 \n\t"
        "cvtps2dq    (%2,%0,2)  , %%xmm0    \n\t"
        "cvtps2dq  16(%2,%0,2)  , %%xmm1    \n\t"
        "packssdw   %%xmm1      , %%xmm0    \n\t"
        "movdqa     %%xmm0      ,  (%1,%0)  \n\t"
        "add        $16         , %0        \n\t"
        " js 1b                             \n\t"
        :"+r"(reglen), "+r"(dst), "+r"(src)
    );
}

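/* 6-channel interleave routines, implemented in external assembly and
 * only available when the build has yasm. */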
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);

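/* Without yasm, route the 6-channel case to the generic strided-copy
 * fallback defined by the macro below. */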
#if !HAVE_YASM
#define ff_float_to_int16_interleave6_sse(a,b,c)   float_to_int16_interleave_misc_sse(a,b,c,6)
#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
#define ff_float_to_int16_interleave6_3dn2(a,b,c)  float_to_int16_interleave_misc_3dnow(a,b,c,6)
#endif
#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse

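/* Instantiates float_to_int16_interleave_##cpu: mono is a straight
 * conversion, stereo runs the asm "body" passed in, 6 channels call the
 * external interleave routine, and any other layout converts each
 * channel into a temporary buffer and interleaves with scalar stores. */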
#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2 */\
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
    DECLARE_ALIGNED(16, int16_t, tmp)[len];\
    int i,j,c;\
    for(c=0; c<channels; c++){\
        float_to_int16_##cpu(tmp, src[c], len);\
        for(i=0, j=c; i<len; i++, j+=channels)\
            dst[j] = tmp[i];\
    }\
}\
\
static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
    if(channels==1)\
        float_to_int16_##cpu(dst, src[0], len);\
    else if(channels==2){\
        x86_reg reglen = len; \
        const float *src0 = src[0];\
        const float *src1 = src[1];\
        __asm__ volatile(\
            "shl $2, %0 \n"\
            "add %0, %1 \n"\
            "add %0, %2 \n"\
            "add %0, %3 \n"\
            "neg %0 \n"\
            body\
            :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
        );\
    }else if(channels==6){\
        ff_float_to_int16_interleave6_##cpu(dst, src, len);\
    }else\
        float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
}

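/* Stereo bodies: convert four samples from each channel, pack them to
 * int16, then punpcklwd/punpckhwd zip the two channels into L R L R
 * order before storing. */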
FLOAT_TO_INT16_INTERLEAVE(3dnow,
    "1:                         \n"
    "pf2id     (%2,%0), %%mm0   \n"
    "pf2id    8(%2,%0), %%mm1   \n"
    "pf2id     (%3,%0), %%mm2   \n"
    "pf2id    8(%3,%0), %%mm3   \n"
    "packssdw    %%mm1, %%mm0   \n"
    "packssdw    %%mm3, %%mm2   \n"
    "movq        %%mm0, %%mm1   \n"
    "punpcklwd   %%mm2, %%mm0   \n"
    "punpckhwd   %%mm2, %%mm1   \n"
    "movq        %%mm0,  (%1,%0)\n"
    "movq        %%mm1, 8(%1,%0)\n"
    "add $16, %0                \n"
    "js 1b                      \n"
    "femms                      \n"
)

FLOAT_TO_INT16_INTERLEAVE(sse,
    "1:                         \n"
    "cvtps2pi  (%2,%0), %%mm0   \n"
    "cvtps2pi 8(%2,%0), %%mm1   \n"
    "cvtps2pi  (%3,%0), %%mm2   \n"
    "cvtps2pi 8(%3,%0), %%mm3   \n"
    "packssdw    %%mm1, %%mm0   \n"
    "packssdw    %%mm3, %%mm2   \n"
    "movq        %%mm0, %%mm1   \n"
    "punpcklwd   %%mm2, %%mm0   \n"
    "punpckhwd   %%mm2, %%mm1   \n"
    "movq        %%mm0,  (%1,%0)\n"
    "movq        %%mm1, 8(%1,%0)\n"
    "add $16, %0                \n"
    "js 1b                      \n"
    "emms                       \n"
)

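/* The SSE2 body packs both channels into one XMM register; movhlps +
 * punpcklwd perform the same zip without touching MMX state. */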
FLOAT_TO_INT16_INTERLEAVE(sse2,
    "1:                         \n"
    "cvtps2dq  (%2,%0), %%xmm0  \n"
    "cvtps2dq  (%3,%0), %%xmm1  \n"
    "packssdw   %%xmm1, %%xmm0  \n"
    "movhlps    %%xmm0, %%xmm1  \n"
    "punpcklwd  %%xmm1, %%xmm0  \n"
    "movdqa     %%xmm0, (%1,%0) \n"
    "add $16, %0                \n"
    "js 1b                      \n"
)

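/* 3DNow!ext only adds its own 6-channel interleave; all other channel
 * counts reuse the plain 3DNow! version. */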
static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
    if(channels==6)
        ff_float_to_int16_interleave6_3dn2(dst, src, len);
    else
        float_to_int16_interleave_3dnow(dst, src, len, channels);
}

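/* Runtime dispatch. The checks run from oldest to newest ISA and later
 * assignments override earlier ones, so the best version the CPU
 * supports wins. The 3DNow! paths are skipped for bit-exact output
 * because pf2id rounds differently (see above). */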
void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX) {

        if(mm_flags & AV_CPU_FLAG_3DNOW){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->float_to_int16 = float_to_int16_3dnow;
                c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
            }
        }
        if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
            }
        }
        if(mm_flags & AV_CPU_FLAG_SSE){
            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
            c->float_to_int16 = float_to_int16_sse;
            c->float_to_int16_interleave = float_to_int16_interleave_sse;
        }
        if(mm_flags & AV_CPU_FLAG_SSE2){
            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
            c->float_to_int16 = float_to_int16_sse2;
            c->float_to_int16_interleave = float_to_int16_interleave_sse2;
        }
    }
}