ffmpeg / libavcodec / x86 / dsputil_mmx.h @ 12802ec0
History | View | Annotate | Download (7.86 KB)
1 |
/*
 * MMX optimized DSP utils
 * Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
|
21 |
|
22 |
#ifndef AVCODEC_X86_DSPUTIL_MMX_H
|
23 |
#define AVCODEC_X86_DSPUTIL_MMX_H
|
24 |
|
25 |
#include <stdint.h> |
26 |
#include "libavcodec/dsputil.h" |
27 |
#include "libavutil/x86_cpu.h" |
28 |
|
29 |
/* 128-bit value stored as two consecutive 64-bit halves (a first, then b). */
typedef struct { uint64_t a, b; } xmm_reg;

/*
 * Constants shared by the x86 SIMD code. They are only declared here; the
 * definitions live in another translation unit.
 * NOTE(review): the prefixes look like pw = packed 16-bit words, pb = packed
 * bytes, pd = packed doubles, with the suffix giving the repeated value --
 * confirm against the definitions before relying on that.
 */
extern const uint64_t ff_bone;
extern const uint64_t ff_wtwo;

extern const uint64_t ff_pdw_80000000[2];

/* uint64_t entries are 8 bytes wide, xmm_reg entries are 16 bytes wide. */
extern const xmm_reg  ff_pw_3;
extern const xmm_reg  ff_pw_4;
extern const xmm_reg  ff_pw_5;
extern const xmm_reg  ff_pw_8;
extern const uint64_t ff_pw_15;
extern const xmm_reg  ff_pw_16;
extern const xmm_reg  ff_pw_18;
extern const uint64_t ff_pw_20;
extern const xmm_reg  ff_pw_27;
extern const xmm_reg  ff_pw_28;
extern const xmm_reg  ff_pw_32;
extern const uint64_t ff_pw_42;
extern const uint64_t ff_pw_53;
extern const xmm_reg  ff_pw_63;
extern const xmm_reg  ff_pw_64;
extern const uint64_t ff_pw_96;
extern const uint64_t ff_pw_128;
extern const uint64_t ff_pw_255;

extern const xmm_reg  ff_pb_1;
extern const xmm_reg  ff_pb_3;
extern const uint64_t ff_pb_7;
extern const uint64_t ff_pb_1F;
extern const uint64_t ff_pb_3F;
extern const uint64_t ff_pb_81;
extern const xmm_reg  ff_pb_A1;
extern const xmm_reg  ff_pb_F8;
extern const uint64_t ff_pb_FC;
extern const xmm_reg  ff_pb_FE;

extern const double ff_pd_1[2];
extern const double ff_pd_2[2];
68 |
|
69 |
/*
 * Expands to an inline-asm string fragment of four "movq" loads:
 * a, b, c, d are loaded from in + 0*stride, 1*stride, 2*stride, 3*stride.
 * Every argument is stringified as-is, so `stride` must be an assembly-time
 * constant and `in` an operand that may follow a "disp+" prefix (e.g. "(%0)").
 */
#define LOAD4(stride,in,a,b,c,d)\
    "movq 0*"#stride"+"#in", "#a"\n\t"\
    "movq 1*"#stride"+"#in", "#b"\n\t"\
    "movq 2*"#stride"+"#in", "#c"\n\t"\
    "movq 3*"#stride"+"#in", "#d"\n\t"
74 |
|
75 |
/*
 * Expands to an inline-asm string fragment of four "movq" stores:
 * a, b, c, d are written to out + 0*stride, 1*stride, 2*stride, 3*stride.
 * Every argument is stringified as-is, so `stride` must be an assembly-time
 * constant and `out` an operand that may follow a "disp+" prefix (e.g. "(%0)").
 */
#define STORE4(stride,out,a,b,c,d)\
    "movq "#a", 0*"#stride"+"#out"\n\t"\
    "movq "#b", 1*"#stride"+"#out"\n\t"\
    "movq "#c", 2*"#stride"+"#out"\n\t"\
    "movq "#d", 3*"#stride"+"#out"\n\t"
80 |
|
81 |
/*
 * In-place sum/difference of two registers of packed 16-bit words.
 * in/out: a = a + b, b = b - a (using the value a had on entry).
 * The difference is formed as 2*b - (a + b), so no temporary is needed.
 */
#define SUMSUB_BA( a, b ) \
    "paddw "#b", "#a" \n\t" /* a = a + b         */\
    "paddw "#b", "#b" \n\t" /* b = 2*b           */\
    "psubw "#a", "#b" \n\t" /* b = 2*b - (a + b) */
86 |
|
87 |
/*
 * Interleave step used by the transpose macros.
 *   a, b : source registers (b may be whatever punpck* accepts as a source)
 *   t    : destination for the high-half interleave; overwritten
 *   n    : punpck suffix selecting the element size (bw, wd, dq, qdq)
 *   m    : mov suffix for the register copy (q for MMX, dqa for SSE2)
 * Result: a = low-half interleave of (a, b), t = high-half interleave of
 * (a, b); b is left untouched.
 * The last line deliberately has no trailing backslash, so the macro cannot
 * swallow the line that follows it.
 */
#define SBUTTERFLY(a,b,t,n,m)\
    "mov" #m " " #a ", " #t " \n\t" /* abcd */\
    "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
    "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */
91 |
|
92 |
/*
 * Transpose a 4x4 matrix of 16-bit words held in four 64-bit registers.
 * in:  a = abcd, b = efgh, c = ijkl, d = mnop; t is scratch
 * out: the transposed rows end up in a, d, t, c (in that order):
 *      a = aeim, d = bfjn, t = cgko, c = dhlp; b holds an intermediate.
 * Built from four SBUTTERFLY steps.
 */
#define TRANSPOSE4(a,b,c,d,t)\
    SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
    SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
    SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
    SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
|
97 |
|
98 |
/**
 * Transpose a 4x4 block of bytes with MMX.
 *
 * Reads four bytes from each of four source rows and writes four bytes to
 * each of four destination rows, so that destination row i holds source
 * column i.
 *
 * @param dst        address of the first destination row
 * @param src        address of the first source row
 * @param dst_stride byte distance between consecutive destination rows
 * @param src_stride byte distance between consecutive source rows
 *
 * The asm uses mm0-mm3 and issues no emms; the local copies of dst and src
 * are each advanced by one stride inside the asm block.
 */
static inline void transpose4x4(uint8_t *dst, uint8_t *src, x86_reg dst_stride, x86_reg src_stride){
    __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
        "movd (%1), %%mm0 \n\t"         /* mm0 = source row 0               */
        "add %3, %1 \n\t"               /* src now points at row 1          */
        "movd (%1), %%mm1 \n\t"         /* mm1 = source row 1               */
        "movd (%1,%3,1), %%mm2 \n\t"    /* mm2 = source row 2               */
        "movd (%1,%3,2), %%mm3 \n\t"    /* mm3 = source row 3               */
        "punpcklbw %%mm1, %%mm0 \n\t"   /* mm0 = rows 0/1 byte-interleaved  */
        "punpcklbw %%mm3, %%mm2 \n\t"   /* mm2 = rows 2/3 byte-interleaved  */
        "movq %%mm0, %%mm1 \n\t"
        "punpcklwd %%mm2, %%mm0 \n\t"   /* mm0 = column 0 | column 1        */
        "punpckhwd %%mm2, %%mm1 \n\t"   /* mm1 = column 2 | column 3        */
        "movd %%mm0, (%0) \n\t"         /* dst row 0 = column 0             */
        "add %2, %0 \n\t"               /* dst now points at row 1          */
        "punpckhdq %%mm0, %%mm0 \n\t"   /* bring column 1 to the low dword  */
        "movd %%mm0, (%0) \n\t"         /* dst row 1 = column 1             */
        "movd %%mm1, (%0,%2,1) \n\t"    /* dst row 2 = column 2             */
        "punpckhdq %%mm1, %%mm1 \n\t"   /* bring column 3 to the low dword  */
        "movd %%mm1, (%0,%2,2) \n\t"    /* dst row 3 = column 3             */

        : "+&r" (dst),
          "+&r" (src)
        : "r" (dst_stride),
          "r" (src_stride)
        : "memory"
    );
}
125 |
|
126 |
/*
 * Transpose an 8x4 block of bytes: eight inputs contribute their four low
 * bytes each, producing four registers of eight bytes.
 * in:  a..h hold the rows (only the low 4 bytes of each are used);
 *      e,f,g,h can be memory; t is scratch
 * out: a,d,t,c (rows 0..3 of the result); b holds an intermediate.
 */
#define TRANSPOSE8x4(a,b,c,d,e,f,g,h,t)\
    "punpcklbw " #e ", " #a " \n\t" /* a0 e0 a1 e1 a2 e2 a3 e3 */\
    "punpcklbw " #f ", " #b " \n\t" /* b0 f0 b1 f1 b2 f2 b3 f3 */\
    "punpcklbw " #g ", " #c " \n\t" /* c0 g0 c1 g1 c2 g2 c3 g3 */\
    "punpcklbw " #h ", " #d " \n\t" /* d0 h0 d1 h1 d2 h2 d3 h3 */\
    SBUTTERFLY(a, b, t, bw, q) /* a= a0 b0 e0 f0 a1 b1 e1 f1 */\
                               /* t= a2 b2 e2 f2 a3 b3 e3 f3 */\
    SBUTTERFLY(c, d, b, bw, q) /* c= c0 d0 g0 h0 c1 d1 g1 h1 */\
                               /* b= c2 d2 g2 h2 c3 d3 g3 h3 */\
    SBUTTERFLY(a, c, d, wd, q) /* a= a0 b0 c0 d0 e0 f0 g0 h0 */\
                               /* d= a1 b1 c1 d1 e1 f1 g1 h1 */\
    SBUTTERFLY(t, b, c, wd, q) /* t= a2 b2 c2 d2 e2 f2 g2 h2 */\
                               /* c= a3 b3 c3 d3 e3 f3 g3 h3 */
|
141 |
|
142 |
/*
 * TRANSPOSE8(a,b,c,d,e,f,g,h,t): SSE2 transpose of eight registers of 16-bit
 * words, built from SBUTTERFLY steps at word, dword and qword granularity.
 *
 * x86-64 variant: uses %%xmm8 as the extra scratch register (it is
 * overwritten) and never references `t`.
 * 32-bit variant: no ninth register is used, so values are spilled to `t`
 * and to "16"+`t`; `t` must therefore be a memory operand that accepts a
 * displacement prefix and provides 32 bytes. movdqa requires 16-byte
 * alignment for memory operands.
 */
#if ARCH_X86_64
// permutes 01234567 -> 05736421
#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
    SBUTTERFLY(a,b,%%xmm8,wd,dqa)\
    SBUTTERFLY(c,d,b,wd,dqa)\
    SBUTTERFLY(e,f,d,wd,dqa)\
    SBUTTERFLY(g,h,f,wd,dqa)\
    SBUTTERFLY(a,c,h,dq,dqa)\
    SBUTTERFLY(%%xmm8,b,c,dq,dqa)\
    SBUTTERFLY(e,g,b,dq,dqa)\
    SBUTTERFLY(d,f,g,dq,dqa)\
    SBUTTERFLY(a,e,f,qdq,dqa)\
    SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\
    SBUTTERFLY(h,b,d,qdq,dqa)\
    SBUTTERFLY(c,g,b,qdq,dqa)\
    "movdqa %%xmm8, "#g" \n\t"
#else
/* NOTE(review): presumably yields the same output permutation as the x86-64
 * variant -- not verified here. */
#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
    "movdqa "#h", "#t" \n\t"         /* spill h to t                    */\
    SBUTTERFLY(a,b,h,wd,dqa)\
    "movdqa "#h", 16"#t" \n\t"       /* spill the high interleave       */\
    "movdqa "#t", "#h" \n\t"         /* restore the original h          */\
    SBUTTERFLY(c,d,b,wd,dqa)\
    SBUTTERFLY(e,f,d,wd,dqa)\
    SBUTTERFLY(g,h,f,wd,dqa)\
    SBUTTERFLY(a,c,h,dq,dqa)\
    "movdqa "#h", "#t" \n\t"\
    "movdqa 16"#t", "#h" \n\t"\
    SBUTTERFLY(h,b,c,dq,dqa)\
    SBUTTERFLY(e,g,b,dq,dqa)\
    SBUTTERFLY(d,f,g,dq,dqa)\
    SBUTTERFLY(a,e,f,qdq,dqa)\
    SBUTTERFLY(h,d,e,qdq,dqa)\
    "movdqa "#h", 16"#t" \n\t"\
    "movdqa "#t", "#h" \n\t"\
    SBUTTERFLY(h,b,d,qdq,dqa)\
    SBUTTERFLY(c,g,b,qdq,dqa)\
    "movdqa 16"#t", "#g" \n\t"
#endif
|
181 |
|
182 |
/*
 * Set every 16-bit word of register `regd` to 1.
 * `regd` is the bare register name (e.g. mm7); the macro adds the "%%".
 * pcmpeqd of a register with itself sets all bits, and the logical right
 * shift by 15 leaves only the lowest bit of each word.
 * Expands to an asm statement without a trailing semicolon.
 */
#define MOVQ_WONE(regd) \
    __asm__ volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)
186 |
|
187 |
void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx);
|
188 |
void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx);
|
189 |
|
190 |
void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); |
191 |
void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); |
192 |
void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); |
193 |
|
194 |
/* CAVS qpel mc00 routines (put/avg, 8 and 16 variants), defined elsewhere.
 * NOTE(review): mc00 presumably denotes the zero-offset (full-pel) case --
 * confirm against the implementations. */
void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);

/* VC-1 mspel mc00 routines (put/avg), defined elsewhere; unlike the CAVS
 * ones they take a const source and an extra `rnd` argument. */
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd);
void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd);
201 |
|
202 |
/* IDCT entry points operating on a DCTELEM block through a non-const
 * pointer; defined elsewhere. */
void ff_mmx_idct(DCTELEM *block);
void ff_mmxext_idct(DCTELEM *block);
|
204 |
|
205 |
|
206 |
/* Deinterlace one line from five input lines into a separate `dst` buffer;
 * defined elsewhere.
 * NOTE(review): lum_m4..lum_m1 and lum presumably are consecutive lines and
 * `size` the number of bytes to process -- confirm against the
 * implementation. */
void ff_deinterlace_line_mmx(uint8_t *dst,
                             const uint8_t *lum_m4, const uint8_t *lum_m3,
                             const uint8_t *lum_m2, const uint8_t *lum_m1,
                             const uint8_t *lum,
                             int size);

/* Variant with the same five line arguments but no `dst` parameter.
 * NOTE(review): all five pointers are const-qualified even though the name
 * says "inplace" -- check which buffer the implementation writes to. */
void ff_deinterlace_line_inplace_mmx(const uint8_t *lum_m4,
                                     const uint8_t *lum_m3,
                                     const uint8_t *lum_m2,
                                     const uint8_t *lum_m1,
                                     const uint8_t *lum, int size);
217 |
|
218 |
#endif /* AVCODEC_X86_DSPUTIL_MMX_H */ |