ffmpeg / libavcodec / x86 / lpc_mmx.c @ 77a78e9b
/*
 * MMX optimized LPC DSP utils
 * Copyright (c) 2007 Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/x86_cpu.h"
#include "libavutil/cpu.h"
#include "libavcodec/lpc.h"

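/* Multiplies the int32 input block by a quadratic (Welch-style) window
 * while converting it to doubles, walking the block from both ends toward
 * the middle and reusing each pair of window values for the mirrored pair
 * of samples. A scalar sketch of the even-length path (illustration only,
 * not part of the original file; assumes n2 = len/2 is even so the two
 * halves meet without overlapping):
 *
 *     double c = 2.0 / (len - 1.0), w = c - 1.0;      // xmm7, low lane
 *     for (int k = 0; k < len / 2; k += 2, w -= 2.0) {
 *         double w0 = 1.0 - w * w;                    // low lane of xmm0
 *         double w1 = 1.0 - (w - 1.0) * (w - 1.0);    // high lane of xmm0
 *         w_data[k]           = data[k]           * w0;
 *         w_data[k + 1]       = data[k + 1]       * w1;
 *         w_data[len - 2 - k] = data[len - 2 - k] * w1;  // pshufd swaps
 *         w_data[len - 1 - k] = data[len - 1 - k] * w0;  // the two lanes
 *     }
 */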
static void apply_welch_window_sse2(const int32_t *data, int len, double *w_data)
{
    double c = 2.0 / (len-1.0);
    int n2 = len>>1;
    x86_reg i = -n2*sizeof(int32_t);
    x86_reg j =  n2*sizeof(int32_t);
    __asm__ volatile(
        "movsd   %4,     %%xmm7                \n\t"
        "movapd  "MANGLE(ff_pd_1)", %%xmm6     \n\t"
        "movapd  "MANGLE(ff_pd_2)", %%xmm5     \n\t"
        "movlhps %%xmm7, %%xmm7                \n\t"
        "subpd   %%xmm5, %%xmm7                \n\t"
        "addsd   %%xmm6, %%xmm7                \n\t"
        /* xmm7 = {c-1, c-2}: window arguments for the first sample pair */
        "test    $1,     %5                    \n\t"
        "jz      2f                            \n\t"
/* one pair of samples from each end of the block per iteration; the
   descending half is misaligned when len is odd, hence the MOVPD/offset
   parameters selecting movupd/-1 (odd) or movapd/-2 (even) */
#define WELCH(MOVPD, offset)\
        "1:                                    \n\t"\
        "movapd   %%xmm7,  %%xmm1              \n\t"\
        "mulpd    %%xmm1,  %%xmm1              \n\t"\
        "movapd   %%xmm6,  %%xmm0              \n\t"\
        "subpd    %%xmm1,  %%xmm0              \n\t"\
        "pshufd   $0x4e,   %%xmm0, %%xmm1      \n\t"\
        "cvtpi2pd (%3,%0), %%xmm2              \n\t"\
        "cvtpi2pd "#offset"*4(%3,%1), %%xmm3   \n\t"\
        "mulpd    %%xmm0,  %%xmm2              \n\t"\
        "mulpd    %%xmm1,  %%xmm3              \n\t"\
        "movapd   %%xmm2, (%2,%0,2)            \n\t"\
        MOVPD"    %%xmm3, "#offset"*8(%2,%1,2) \n\t"\
        "subpd    %%xmm5,  %%xmm7              \n\t"\
        "sub      $8,      %1                  \n\t"\
        "add      $8,      %0                  \n\t"\
        "jl 1b                                 \n\t"\

        WELCH("movupd", -1)
        "jmp 3f                                \n\t"
        "2:                                    \n\t"
        WELCH("movapd", -2)
        "3:                                    \n\t"
        :"+&r"(i), "+&r"(j)
        :"r"(w_data+n2), "r"(data+n2), "m"(c), "r"(len)
         XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
                                    "%xmm5", "%xmm6", "%xmm7")
    );
#undef WELCH
}

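/* Computes the autocorrelation sums autoc[0..lag] of the windowed samples
 * (lag+1 outputs in total: the final pass handles three lags, all others
 * two). Each accumulator is seeded with {1.0, 0.0} via ff_pd_1, so every
 * output carries a +1.0 bias, presumably matching the scalar reference.
 * A sketch of one generic pass (illustration only, not part of the
 * original file; assumes reads below data[0] hit zero padding):
 *
 *     double sum0 = 1.0, sum1 = 1.0;
 *     for (int k = 0; k < len; k++) {
 *         sum0 += data[k] * data[k - j];      // lag j
 *         sum1 += data[k] * data[k - j - 1];  // lag j+1
 *     }
 *     autoc[j]     = sum0;
 *     autoc[j + 1] = sum1;
 */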
static void lpc_compute_autocorr_sse2(const double *data, int len, int lag,
                                      double *autoc)
{
    int j;

    /* bump data up to 16-byte alignment; with zero padding around the
       block (presumably guaranteed by the caller), the sums below are
       invariant under this one-element shift */
    if((x86_reg)data & 15)
        data++;

    for(j=0; j<lag; j+=2){
        x86_reg i = -len*sizeof(double);
        if(j == lag-2) {
            /* final even iteration: accumulate lags j, j+1 and j+2 at once */
            __asm__ volatile(
                "movsd  "MANGLE(ff_pd_1)", %%xmm0 \n\t"
                "movsd  "MANGLE(ff_pd_1)", %%xmm1 \n\t"
                "movsd  "MANGLE(ff_pd_1)", %%xmm2 \n\t"
                "1:                               \n\t"
                "movapd    (%2,%0), %%xmm3        \n\t"
                "movupd  -8(%3,%0), %%xmm4        \n\t"
                "movapd    (%3,%0), %%xmm5        \n\t"
                "mulpd     %%xmm3,  %%xmm4        \n\t"
                "mulpd     %%xmm3,  %%xmm5        \n\t"
                "mulpd  -16(%3,%0), %%xmm3        \n\t"
                "addpd     %%xmm4,  %%xmm1        \n\t"
                "addpd     %%xmm5,  %%xmm0        \n\t"
                "addpd     %%xmm3,  %%xmm2        \n\t"
                "add       $16,     %0            \n\t"
                "jl 1b                            \n\t"
                /* fold each two-lane accumulator into a scalar sum */
                "movhlps   %%xmm0,  %%xmm3        \n\t"
                "movhlps   %%xmm1,  %%xmm4        \n\t"
                "movhlps   %%xmm2,  %%xmm5        \n\t"
                "addsd     %%xmm3,  %%xmm0        \n\t"
                "addsd     %%xmm4,  %%xmm1        \n\t"
                "addsd     %%xmm5,  %%xmm2        \n\t"
                "movsd     %%xmm0,   (%1)         \n\t"
                "movsd     %%xmm1,  8(%1)         \n\t"
                "movsd     %%xmm2, 16(%1)         \n\t"
                :"+&r"(i)
                :"r"(autoc+j), "r"(data+len), "r"(data+len-j)
                :"memory"
            );
        } else {
            /* generic iteration: accumulate lags j and j+1 */
            __asm__ volatile(
                "movsd  "MANGLE(ff_pd_1)", %%xmm0 \n\t"
                "movsd  "MANGLE(ff_pd_1)", %%xmm1 \n\t"
                "1:                               \n\t"
                "movapd    (%3,%0), %%xmm3        \n\t"
                "movupd  -8(%4,%0), %%xmm4        \n\t"
                "mulpd     %%xmm3,  %%xmm4        \n\t"
                "mulpd    (%4,%0),  %%xmm3        \n\t"
                "addpd     %%xmm4,  %%xmm1        \n\t"
                "addpd     %%xmm3,  %%xmm0        \n\t"
                "add       $16,     %0            \n\t"
                "jl 1b                            \n\t"
                "movhlps   %%xmm0,  %%xmm3        \n\t"
                "movhlps   %%xmm1,  %%xmm4        \n\t"
                "addsd     %%xmm3,  %%xmm0        \n\t"
                "addsd     %%xmm4,  %%xmm1        \n\t"
                "movsd     %%xmm0,  %1            \n\t"
                "movsd     %%xmm1,  %2            \n\t"
                :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
                :"r"(data+len), "r"(data+len-j)
            );
        }
    }
}

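/* AV_CPU_FLAG_SSE2SLOW marks CPUs where SSE2 is available but slow; the
 * routines are presumably still worth enabling there because the
 * double-precision math has no MMX fallback. */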
av_cold void ff_lpc_init_x86(LPCContext *c)
{
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & (AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW)) {
        c->lpc_apply_welch_window = apply_welch_window_sse2;
        c->lpc_compute_autocorr   = lpc_compute_autocorr_sse2;
    }
}