ffmpeg / libavcodec / i386 / vp3dsp_sse2.c @ b550bfaa
History | View | Annotate | Download (35.3 KB)
1 |
/*
|
---|---|
2 |
* Copyright (C) 2004 the ffmpeg project
|
3 |
*
|
4 |
* This file is part of FFmpeg.
|
5 |
*
|
6 |
* FFmpeg is free software; you can redistribute it and/or
|
7 |
* modify it under the terms of the GNU Lesser General Public
|
8 |
* License as published by the Free Software Foundation; either
|
9 |
* version 2.1 of the License, or (at your option) any later version.
|
10 |
*
|
11 |
* FFmpeg is distributed in the hope that it will be useful,
|
12 |
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14 |
* Lesser General Public License for more details.
|
15 |
*
|
16 |
* You should have received a copy of the GNU Lesser General Public
|
17 |
* License along with FFmpeg; if not, write to the Free Software
|
18 |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
19 |
*/
|
20 |
|
21 |
/**
|
22 |
* @file vp3dsp_sse2.c
|
23 |
* SSE2-optimized functions cribbed from the original VP3 source code.
|
24 |
*/
|
25 |
|
26 |
#include "dsputil.h" |
27 |
#include "mmx.h" |
28 |
|
29 |
/* Bit masks used by SSE2_Dequantize() to select individual 16-bit
 * coefficients while reshuffling them into the order the IDCT expects.
 * Each row of 8 words is one 128-bit mask; the trailing comment shows the
 * mask with the highest word on the left (same convention as the register
 * diagrams in SSE2_Dequantize). */
static DECLARE_ALIGNED_16(const unsigned short, SSE2_dequant_const[]) =
{
    0,65535,65535,0,0,0,0,0,    // 0x0000 0000 0000 0000 0000 FFFF FFFF 0000
    0,0,0,0,65535,65535,0,0,    // 0x0000 0000 FFFF FFFF 0000 0000 0000 0000
    65535,65535,65535,0,0,0,0,0,// 0x0000 0000 0000 0000 0000 FFFF FFFF FFFF
    0,0,0,65535,0,0,0,0,        // 0x0000 0000 0000 0000 FFFF 0000 0000 0000
    0,0,0,65535,65535,0,0,0,    // 0x0000 0000 0000 FFFF FFFF 0000 0000 0000
    65535,0,0,0,0,65535,0,0,    // 0x0000 0000 FFFF 0000 0000 0000 0000 FFFF
    0,0,65535,65535, 0,0,0,0    // 0x0000 0000 0000 0000 FFFF FFFF 0000 0000
};
39 |
|
40 |
/* The constant 8 replicated into all eight 16-bit lanes of an XMM register.
 * Added (paddsw) to each result lane in SSE2_Column_IDCT just before the
 * final arithmetic right shift by 4, i.e. it implements round-to-nearest
 * for the ">> 4" descaling step. */
static DECLARE_ALIGNED_16(const unsigned int, eight_data[]) =
{
    0x00080008,
    0x00080008,
    0x00080008,
    0x00080008
};
47 |
|
48 |
/* IDCT cosine constants, one 128-bit row per constant C(1)..C(7), each
 * 16-bit value replicated across all eight lanes so a whole row/column can
 * be scaled with a single pmulhw.
 * Numerically these match round(cos(i*pi/16) * 65536) for i = 1..7
 * (e.g. 46341 == sqrt(2)/2 * 65536); pmulhw keeps the high 16 bits, which
 * is why the instruction comments read "c * x - x" for constants >= 0.5. */
static DECLARE_ALIGNED_16(const unsigned short, SSE2_idct_data[7 * 8]) =
{
    64277,64277,64277,64277,64277,64277,64277,64277,
    60547,60547,60547,60547,60547,60547,60547,60547,
    54491,54491,54491,54491,54491,54491,54491,54491,
    46341,46341,46341,46341,46341,46341,46341,46341,
    36410,36410,36410,36410,36410,36410,36410,36410,
    25080,25080,25080,25080,25080,25080,25080,25080,
    12785,12785,12785,12785,12785,12785,12785,12785
};
58 |
|
59 |
|
60 |
/* 1-D 8-point IDCT applied to all eight columns at once (one 16-bit
 * coefficient per XMM lane).  Reads rows via I(i), writes results via O(i),
 * and uses *Eight (+8) followed by psraw 4 to round and descale the output.
 * Instruction order is hand-scheduled to pair loads with arithmetic; do not
 * reorder.  I(1)/I(2) are reused as spill slots for the intermediate values
 * C. and D. because all eight registers are live. */
#define SSE2_Column_IDCT() { \
    \
    movdqu_m2r(*I(3), xmm2);     /* xmm2 = i3 */ \
    movdqu_m2r(*C(3), xmm6);     /* xmm6 = c3 */ \
    \
    movdqu_r2r(xmm2, xmm4);      /* xmm4 = i3 */ \
    movdqu_m2r(*I(5), xmm7);     /* xmm7 = i5 */ \
    \
    pmulhw_r2r(xmm6, xmm4);      /* xmm4 = c3 * i3 - i3 */ \
    movdqu_m2r(*C(5), xmm1);     /* xmm1 = c5 */ \
    \
    pmulhw_r2r(xmm7, xmm6);      /* xmm6 = c3 * i5 - i5 */ \
    movdqu_r2r(xmm1, xmm5);      /* xmm5 = c5 */ \
    \
    pmulhw_r2r(xmm2, xmm1);      /* xmm1 = c5 * i3 - i3 */ \
    movdqu_m2r(*I(1), xmm3);     /* xmm3 = i1 */ \
    \
    pmulhw_r2r(xmm7, xmm5);      /* xmm5 = c5 * i5 - i5 */ \
    movdqu_m2r(*C(1), xmm0);     /* xmm0 = c1 */ \
    \
    /* all registers are in use */ \
    \
    paddw_r2r(xmm2, xmm4);       /* xmm4 = c3 * i3 */ \
    paddw_r2r(xmm7, xmm6);       /* xmm6 = c3 * i5 */ \
    \
    paddw_r2r(xmm1, xmm2);       /* xmm2 = c5 * i3 */ \
    movdqu_m2r(*I(7), xmm1);     /* xmm1 = i7 */ \
    \
    paddw_r2r(xmm5, xmm7);       /* xmm7 = c5 * i5 */ \
    movdqu_r2r(xmm0, xmm5);      /* xmm5 = c1 */ \
    \
    pmulhw_r2r(xmm3, xmm0);      /* xmm0 = c1 * i1 - i1 */ \
    paddsw_r2r(xmm7, xmm4);      /* xmm4 = c3 * i3 + c5 * i5 = C */ \
    \
    pmulhw_r2r(xmm1, xmm5);      /* xmm5 = c1 * i7 - i7 */ \
    movdqu_m2r(*C(7), xmm7);     /* xmm7 = c7 */ \
    \
    psubsw_r2r(xmm2, xmm6);      /* xmm6 = c3 * i5 - c5 * i3 = D */ \
    paddw_r2r(xmm3, xmm0);       /* xmm0 = c1 * i1 */ \
    \
    pmulhw_r2r(xmm7, xmm3);      /* xmm3 = c7 * i1 */ \
    movdqu_m2r(*I(2), xmm2);     /* xmm2 = i2 */ \
    \
    pmulhw_r2r(xmm1, xmm7);      /* xmm7 = c7 * i7 */ \
    paddw_r2r(xmm1, xmm5);       /* xmm5 = c1 * i7 */ \
    \
    movdqu_r2r(xmm2, xmm1);      /* xmm1 = i2 */ \
    pmulhw_m2r(*C(2), xmm2);     /* xmm2 = i2 * c2 - i2 */ \
    \
    psubsw_r2r(xmm5, xmm3);      /* xmm3 = c7 * i1 - c1 * i7 = B */ \
    movdqu_m2r(*I(6), xmm5);     /* xmm5 = i6 */ \
    \
    paddsw_r2r(xmm7, xmm0);      /* xmm0 = c1 * i1 + c7 * i7 = A */ \
    movdqu_r2r(xmm5, xmm7);      /* xmm7 = i6 */ \
    \
    psubsw_r2r(xmm4, xmm0);      /* xmm0 = A - C */ \
    pmulhw_m2r(*C(2), xmm5);     /* xmm5 = c2 * i6 - i6 */ \
    \
    paddw_r2r(xmm1, xmm2);       /* xmm2 = i2 * c2 */ \
    pmulhw_m2r(*C(6), xmm1);     /* xmm1 = c6 * i2 */ \
    \
    paddsw_r2r(xmm4, xmm4);      /* xmm4 = C + C */ \
    paddsw_r2r(xmm0, xmm4);      /* xmm4 = A + C = C. */ \
    \
    psubsw_r2r(xmm6, xmm3);      /* xmm3 = B - D */ \
    paddw_r2r(xmm7, xmm5);       /* xmm5 = c2 * i6 */ \
    \
    paddsw_r2r(xmm6, xmm6);      /* xmm6 = D + D */ \
    pmulhw_m2r(*C(6), xmm7);     /* xmm7 = c6 * i6 */ \
    \
    paddsw_r2r(xmm3, xmm6);      /* xmm6 = B + D = D. */ \
    movdqu_r2m(xmm4, *I(1));     /* Save C. at I(1) */ \
    \
    psubsw_r2r(xmm5, xmm1);      /* xmm1 = c6 * i2 - c2 * i6 = H */ \
    movdqu_m2r(*C(4), xmm4);     /* xmm4 = c4 */ \
    \
    movdqu_r2r(xmm3, xmm5);      /* xmm5 = B - D */ \
    pmulhw_r2r(xmm4, xmm3);      /* xmm3 = ( c4 - 1 ) * ( B - D ) */ \
    \
    paddsw_r2r(xmm2, xmm7);      /* xmm7 = c2 * i2 + c6 * i6 = G */ \
    movdqu_r2m(xmm6, *I(2));     /* Save D. at I(2) */ \
    \
    movdqu_r2r(xmm0, xmm2);      /* xmm2 = A - C */ \
    movdqu_m2r(*I(0), xmm6);     /* xmm6 = i0 */ \
    \
    pmulhw_r2r(xmm4, xmm0);      /* xmm0 = ( c4 - 1 ) * ( A - C ) = A. */ \
    paddw_r2r(xmm3, xmm5);       /* xmm5 = c4 * ( B - D ) = B. */ \
    \
    movdqu_m2r(*I(4), xmm3);     /* xmm3 = i4 */ \
    psubsw_r2r(xmm1, xmm5);      /* xmm5 = B. - H = B.. */ \
    \
    paddw_r2r(xmm0, xmm2);       /* xmm2 = c4 * ( A - C ) = A. */ \
    psubsw_r2r(xmm3, xmm6);      /* xmm6 = i0 - i4 */ \
    \
    movdqu_r2r(xmm6, xmm0);      /* xmm0 = i0 - i4 */ \
    pmulhw_r2r(xmm4, xmm6);      /* xmm6 = ( c4 - 1 ) * ( i0 - i4 ) = F */ \
    \
    paddsw_r2r(xmm3, xmm3);      /* xmm3 = i4 + i4 */ \
    paddsw_r2r(xmm1, xmm1);      /* xmm1 = H + H */ \
    \
    paddsw_r2r(xmm0, xmm3);      /* xmm3 = i0 + i4 */ \
    paddsw_r2r(xmm5, xmm1);      /* xmm1 = B. + H = H. */ \
    \
    pmulhw_r2r(xmm3, xmm4);      /* xmm4 = ( c4 - 1 ) * ( i0 + i4 ) */ \
    paddw_r2r(xmm0, xmm6);       /* xmm6 = c4 * ( i0 - i4 ) */ \
    \
    psubsw_r2r(xmm2, xmm6);      /* xmm6 = F - A. = F. */ \
    paddsw_r2r(xmm2, xmm2);      /* xmm2 = A. + A. */ \
    \
    movdqu_m2r(*I(1), xmm0);     /* Load C. from I(1) */ \
    paddsw_r2r(xmm6, xmm2);      /* xmm2 = F + A. = A.. */ \
    \
    paddw_r2r(xmm3, xmm4);       /* xmm4 = c4 * ( i0 + i4 ) = E */ \
    psubsw_r2r(xmm1, xmm2);      /* xmm2 = A.. - H. = R2 */ \
    \
    paddsw_m2r(*Eight, xmm2);    /* Adjust R2 and R1 before shifting */ \
    paddsw_r2r(xmm1, xmm1);      /* xmm1 = H. + H. */ \
    \
    paddsw_r2r(xmm2, xmm1);      /* xmm1 = A.. + H. = R1 */ \
    psraw_i2r(4, xmm2);          /* xmm2 = op2 */ \
    \
    psubsw_r2r(xmm7, xmm4);      /* xmm4 = E - G = E. */ \
    psraw_i2r(4, xmm1);          /* xmm1 = op1 */ \
    \
    movdqu_m2r(*I(2), xmm3);     /* Load D. from I(2) */ \
    paddsw_r2r(xmm7, xmm7);      /* xmm7 = G + G */ \
    \
    movdqu_r2m(xmm2, *O(2));     /* Write out op2 */ \
    paddsw_r2r(xmm4, xmm7);      /* xmm7 = E + G = G. */ \
    \
    movdqu_r2m(xmm1, *O(1));     /* Write out op1 */ \
    psubsw_r2r(xmm3, xmm4);      /* xmm4 = E. - D. = R4 */ \
    \
    paddsw_m2r(*Eight, xmm4);    /* Adjust R4 and R3 before shifting */ \
    paddsw_r2r(xmm3, xmm3);      /* xmm3 = D. + D. */ \
    \
    paddsw_r2r(xmm4, xmm3);      /* xmm3 = E. + D. = R3 */ \
    psraw_i2r(4, xmm4);          /* xmm4 = op4 */ \
    \
    psubsw_r2r(xmm5, xmm6);      /* xmm6 = F. - B.. = R6 */ \
    psraw_i2r(4, xmm3);          /* xmm3 = op3 */ \
    \
    paddsw_m2r(*Eight, xmm6);    /* Adjust R6 and R5 before shifting */ \
    paddsw_r2r(xmm5, xmm5);      /* xmm5 = B.. + B.. */ \
    \
    paddsw_r2r(xmm6, xmm5);      /* xmm5 = F. + B.. = R5 */ \
    psraw_i2r(4, xmm6);          /* xmm6 = op6 */ \
    \
    movdqu_r2m(xmm4, *O(4));     /* Write out op4 */ \
    psraw_i2r(4, xmm5);          /* xmm5 = op5 */ \
    \
    movdqu_r2m(xmm3, *O(3));     /* Write out op3 */ \
    psubsw_r2r(xmm0, xmm7);      /* xmm7 = G. - C. = R7 */ \
    \
    paddsw_m2r(*Eight, xmm7);    /* Adjust R7 and R0 before shifting */ \
    paddsw_r2r(xmm0, xmm0);      /* xmm0 = C. + C. */ \
    \
    paddsw_r2r(xmm7, xmm0);      /* xmm0 = G. + C. = R0 */ \
    psraw_i2r(4, xmm7);          /* xmm7 = op7 */ \
    \
    movdqu_r2m(xmm6, *O(6));     /* Write out op6 */ \
    psraw_i2r(4, xmm0);          /* xmm0 = op0 */ \
    \
    movdqu_r2m(xmm5, *O(5));     /* Write out op5 */ \
    movdqu_r2m(xmm7, *O(7));     /* Write out op7 */ \
    \
    movdqu_r2m(xmm0, *O(0));     /* Write out op0 */ \
    \
} /* End of SSE2_Column_IDCT macro */
|
229 |
|
230 |
|
231 |
/* 1-D 8-point IDCT applied to all eight rows at once.  Identical butterfly
 * structure to SSE2_Column_IDCT, but results are written back in place via
 * I(i) and there is NO +8 rounding or >>4 shift: the data stays at full
 * scale until the column pass descales it.  Instruction order is
 * hand-scheduled; do not reorder.  I(1)/I(2) double as spill slots for the
 * intermediates C. and D. */
#define SSE2_Row_IDCT() { \
    \
    movdqu_m2r(*I(3), xmm2);     /* xmm2 = i3 */ \
    movdqu_m2r(*C(3), xmm6);     /* xmm6 = c3 */ \
    \
    movdqu_r2r(xmm2, xmm4);      /* xmm4 = i3 */ \
    movdqu_m2r(*I(5), xmm7);     /* xmm7 = i5 */ \
    \
    pmulhw_r2r(xmm6, xmm4);      /* xmm4 = c3 * i3 - i3 */ \
    movdqu_m2r(*C(5), xmm1);     /* xmm1 = c5 */ \
    \
    pmulhw_r2r(xmm7, xmm6);      /* xmm6 = c3 * i5 - i5 */ \
    movdqu_r2r(xmm1, xmm5);      /* xmm5 = c5 */ \
    \
    pmulhw_r2r(xmm2, xmm1);      /* xmm1 = c5 * i3 - i3 */ \
    movdqu_m2r(*I(1), xmm3);     /* xmm3 = i1 */ \
    \
    pmulhw_r2r(xmm7, xmm5);      /* xmm5 = c5 * i5 - i5 */ \
    movdqu_m2r(*C(1), xmm0);     /* xmm0 = c1 */ \
    \
    /* all registers are in use */ \
    \
    paddw_r2r(xmm2, xmm4);       /* xmm4 = c3 * i3 */ \
    paddw_r2r(xmm7, xmm6);       /* xmm6 = c3 * i5 */ \
    \
    paddw_r2r(xmm1, xmm2);       /* xmm2 = c5 * i3 */ \
    movdqu_m2r(*I(7), xmm1);     /* xmm1 = i7 */ \
    \
    paddw_r2r(xmm5, xmm7);       /* xmm7 = c5 * i5 */ \
    movdqu_r2r(xmm0, xmm5);      /* xmm5 = c1 */ \
    \
    pmulhw_r2r(xmm3, xmm0);      /* xmm0 = c1 * i1 - i1 */ \
    paddsw_r2r(xmm7, xmm4);      /* xmm4 = c3 * i3 + c5 * i5 = C */ \
    \
    pmulhw_r2r(xmm1, xmm5);      /* xmm5 = c1 * i7 - i7 */ \
    movdqu_m2r(*C(7), xmm7);     /* xmm7 = c7 */ \
    \
    psubsw_r2r(xmm2, xmm6);      /* xmm6 = c3 * i5 - c5 * i3 = D */ \
    paddw_r2r(xmm3, xmm0);       /* xmm0 = c1 * i1 */ \
    \
    pmulhw_r2r(xmm7, xmm3);      /* xmm3 = c7 * i1 */ \
    movdqu_m2r(*I(2), xmm2);     /* xmm2 = i2 */ \
    \
    pmulhw_r2r(xmm1, xmm7);      /* xmm7 = c7 * i7 */ \
    paddw_r2r(xmm1, xmm5);       /* xmm5 = c1 * i7 */ \
    \
    movdqu_r2r(xmm2, xmm1);      /* xmm1 = i2 */ \
    pmulhw_m2r(*C(2), xmm2);     /* xmm2 = i2 * c2 - i2 */ \
    \
    psubsw_r2r(xmm5, xmm3);      /* xmm3 = c7 * i1 - c1 * i7 = B */ \
    movdqu_m2r(*I(6), xmm5);     /* xmm5 = i6 */ \
    \
    paddsw_r2r(xmm7, xmm0);      /* xmm0 = c1 * i1 + c7 * i7 = A */ \
    movdqu_r2r(xmm5, xmm7);      /* xmm7 = i6 */ \
    \
    psubsw_r2r(xmm4, xmm0);      /* xmm0 = A - C */ \
    pmulhw_m2r(*C(2), xmm5);     /* xmm5 = c2 * i6 - i6 */ \
    \
    paddw_r2r(xmm1, xmm2);       /* xmm2 = i2 * c2 */ \
    pmulhw_m2r(*C(6), xmm1);     /* xmm1 = c6 * i2 */ \
    \
    paddsw_r2r(xmm4, xmm4);      /* xmm4 = C + C */ \
    paddsw_r2r(xmm0, xmm4);      /* xmm4 = A + C = C. */ \
    \
    psubsw_r2r(xmm6, xmm3);      /* xmm3 = B - D */ \
    paddw_r2r(xmm7, xmm5);       /* xmm5 = c2 * i6 */ \
    \
    paddsw_r2r(xmm6, xmm6);      /* xmm6 = D + D */ \
    pmulhw_m2r(*C(6), xmm7);     /* xmm7 = c6 * i6 */ \
    \
    paddsw_r2r(xmm3, xmm6);      /* xmm6 = B + D = D. */ \
    movdqu_r2m(xmm4, *I(1));     /* Save C. at I(1) */ \
    \
    psubsw_r2r(xmm5, xmm1);      /* xmm1 = c6 * i2 - c2 * i6 = H */ \
    movdqu_m2r(*C(4), xmm4);     /* xmm4 = c4 */ \
    \
    movdqu_r2r(xmm3, xmm5);      /* xmm5 = B - D */ \
    pmulhw_r2r(xmm4, xmm3);      /* xmm3 = ( c4 - 1 ) * ( B - D ) */ \
    \
    paddsw_r2r(xmm2, xmm7);      /* xmm7 = c2 * i2 + c6 * i6 = G */ \
    movdqu_r2m(xmm6, *I(2));     /* Save D. at I(2) */ \
    \
    movdqu_r2r(xmm0, xmm2);      /* xmm2 = A - C */ \
    movdqu_m2r(*I(0), xmm6);     /* xmm6 = i0 */ \
    \
    pmulhw_r2r(xmm4, xmm0);      /* xmm0 = ( c4 - 1 ) * ( A - C ) = A. */ \
    paddw_r2r(xmm3, xmm5);       /* xmm5 = c4 * ( B - D ) = B. */ \
    \
    movdqu_m2r(*I(4), xmm3);     /* xmm3 = i4 */ \
    psubsw_r2r(xmm1, xmm5);      /* xmm5 = B. - H = B.. */ \
    \
    paddw_r2r(xmm0, xmm2);       /* xmm2 = c4 * ( A - C ) = A. */ \
    psubsw_r2r(xmm3, xmm6);      /* xmm6 = i0 - i4 */ \
    \
    movdqu_r2r(xmm6, xmm0);      /* xmm0 = i0 - i4 */ \
    pmulhw_r2r(xmm4, xmm6);      /* xmm6 = ( c4 - 1 ) * ( i0 - i4 ) = F */ \
    \
    paddsw_r2r(xmm3, xmm3);      /* xmm3 = i4 + i4 */ \
    paddsw_r2r(xmm1, xmm1);      /* xmm1 = H + H */ \
    \
    paddsw_r2r(xmm0, xmm3);      /* xmm3 = i0 + i4 */ \
    paddsw_r2r(xmm5, xmm1);      /* xmm1 = B. + H = H. */ \
    \
    pmulhw_r2r(xmm3, xmm4);      /* xmm4 = ( c4 - 1 ) * ( i0 + i4 ) */ \
    paddw_r2r(xmm0, xmm6);       /* xmm6 = c4 * ( i0 - i4 ) */ \
    \
    psubsw_r2r(xmm2, xmm6);      /* xmm6 = F - A. = F. */ \
    paddsw_r2r(xmm2, xmm2);      /* xmm2 = A. + A. */ \
    \
    movdqu_m2r(*I(1), xmm0);     /* Load C. from I(1) */ \
    paddsw_r2r(xmm6, xmm2);      /* xmm2 = F + A. = A.. */ \
    \
    paddw_r2r(xmm3, xmm4);       /* xmm4 = c4 * ( i0 + i4 ) = E */ \
    psubsw_r2r(xmm1, xmm2);      /* xmm2 = A.. - H. = R2 */ \
    \
    paddsw_r2r(xmm1, xmm1);      /* xmm1 = H. + H. */ \
    paddsw_r2r(xmm2, xmm1);      /* xmm1 = A.. + H. = R1 */ \
    \
    psubsw_r2r(xmm7, xmm4);      /* xmm4 = E - G = E. */ \
    \
    movdqu_m2r(*I(2), xmm3);     /* Load D. from I(2) */ \
    paddsw_r2r(xmm7, xmm7);      /* xmm7 = G + G */ \
    \
    movdqu_r2m(xmm2, *I(2));     /* Write out op2 */ \
    paddsw_r2r(xmm4, xmm7);      /* xmm7 = E + G = G. */ \
    \
    movdqu_r2m(xmm1, *I(1));     /* Write out op1 */ \
    psubsw_r2r(xmm3, xmm4);      /* xmm4 = E. - D. = R4 */ \
    \
    paddsw_r2r(xmm3, xmm3);      /* xmm3 = D. + D. */ \
    \
    paddsw_r2r(xmm4, xmm3);      /* xmm3 = E. + D. = R3 */ \
    \
    psubsw_r2r(xmm5, xmm6);      /* xmm6 = F. - B.. = R6 */ \
    \
    paddsw_r2r(xmm5, xmm5);      /* xmm5 = B.. + B.. */ \
    \
    paddsw_r2r(xmm6, xmm5);      /* xmm5 = F. + B.. = R5 */ \
    \
    movdqu_r2m(xmm4, *I(4));     /* Write out op4 */ \
    \
    movdqu_r2m(xmm3, *I(3));     /* Write out op3 */ \
    psubsw_r2r(xmm0, xmm7);      /* xmm7 = G. - C. = R7 */ \
    \
    paddsw_r2r(xmm0, xmm0);      /* xmm0 = C. + C. */ \
    \
    paddsw_r2r(xmm7, xmm0);      /* xmm0 = G. + C. = R0 */ \
    \
    movdqu_r2m(xmm6, *I(6));     /* Write out op6 */ \
    \
    movdqu_r2m(xmm5, *I(5));     /* Write out op5 */ \
    movdqu_r2m(xmm7, *I(7));     /* Write out op7 */ \
    \
    movdqu_r2m(xmm0, *I(0));     /* Write out op0 */ \
    \
} /* End of SSE2_Row_IDCT macro */
|
387 |
|
388 |
|
389 |
/* In-place 8x8 transpose of 16-bit elements stored at I(0)..I(7), used
 * between the row and column IDCT passes.  Rows are labelled a..h, element
 * index 0..7; each register diagram shows word 7 on the left.  Built from
 * the classic punpcklwd/punpckhwd -> punpckldq/punpckhdq ->
 * punpcklqdq/punpckhqdq merge tree; I(6) is used as a temporary spill slot.
 * (Several register-tracking comments below were corrected from the
 * original, which had copy-paste typos such as "h3g3g3e3" for "h3g3f3e3".) */
#define SSE2_Transpose() { \
    \
    movdqu_m2r(*I(4), xmm4);     /* xmm4=e7e6e5e4e3e2e1e0 */ \
    movdqu_m2r(*I(5), xmm0);     /* xmm0=f7f6f5f4f3f2f1f0 */ \
    \
    movdqu_r2r(xmm4, xmm5);      /* make a copy */ \
    punpcklwd_r2r(xmm0, xmm4);   /* xmm4=f3e3f2e2f1e1f0e0 */ \
    \
    punpckhwd_r2r(xmm0, xmm5);   /* xmm5=f7e7f6e6f5e5f4e4 */ \
    movdqu_m2r(*I(6), xmm6);     /* xmm6=g7g6g5g4g3g2g1g0 */ \
    \
    movdqu_m2r(*I(7), xmm0);     /* xmm0=h7h6h5h4h3h2h1h0 */ \
    movdqu_r2r(xmm6, xmm7);      /* make a copy */ \
    \
    punpcklwd_r2r(xmm0, xmm6);   /* xmm6=h3g3h2g2h1g1h0g0 */ \
    punpckhwd_r2r(xmm0, xmm7);   /* xmm7=h7g7h6g6h5g5h4g4 */ \
    \
    movdqu_r2r(xmm4, xmm3);      /* make a copy */ \
    punpckldq_r2r(xmm6, xmm4);   /* xmm4=h1g1f1e1h0g0f0e0 */ \
    \
    punpckhdq_r2r(xmm6, xmm3);   /* xmm3=h3g3f3e3h2g2f2e2 */ \
    movdqu_r2m(xmm3, *I(6));     /* save h3g3f3e3h2g2f2e2 */ \
    /* Free xmm6 */ \
    movdqu_r2r(xmm5, xmm6);      /* make a copy */ \
    punpckldq_r2r(xmm7, xmm5);   /* xmm5=h5g5f5e5h4g4f4e4 */ \
    \
    punpckhdq_r2r(xmm7, xmm6);   /* xmm6=h7g7f7e7h6g6f6e6 */ \
    movdqu_m2r(*I(0), xmm0);     /* xmm0=a7a6a5a4a3a2a1a0 */ \
    /* Free xmm7 */ \
    movdqu_m2r(*I(1), xmm1);     /* xmm1=b7b6b5b4b3b2b1b0 */ \
    movdqu_r2r(xmm0, xmm7);      /* make a copy */ \
    \
    punpcklwd_r2r(xmm1, xmm0);   /* xmm0=b3a3b2a2b1a1b0a0 */ \
    punpckhwd_r2r(xmm1, xmm7);   /* xmm7=b7a7b6a6b5a5b4a4 */ \
    /* Free xmm1 */ \
    movdqu_m2r(*I(2), xmm2);     /* xmm2=c7c6c5c4c3c2c1c0 */ \
    movdqu_m2r(*I(3), xmm3);     /* xmm3=d7d6d5d4d3d2d1d0 */ \
    \
    movdqu_r2r(xmm2, xmm1);      /* make a copy */ \
    punpcklwd_r2r(xmm3, xmm2);   /* xmm2=d3c3d2c2d1c1d0c0 */ \
    \
    punpckhwd_r2r(xmm3, xmm1);   /* xmm1=d7c7d6c6d5c5d4c4 */ \
    movdqu_r2r(xmm0, xmm3);      /* make a copy */ \
    \
    punpckldq_r2r(xmm2, xmm0);   /* xmm0=d1c1b1a1d0c0b0a0 */ \
    punpckhdq_r2r(xmm2, xmm3);   /* xmm3=d3c3b3a3d2c2b2a2 */ \
    /* Free xmm2 */ \
    movdqu_r2r(xmm7, xmm2);      /* make a copy */ \
    punpckldq_r2r(xmm1, xmm2);   /* xmm2=d5c5b5a5d4c4b4a4 */ \
    \
    punpckhdq_r2r(xmm1, xmm7);   /* xmm7=d7c7b7a7d6c6b6a6 */ \
    movdqu_r2r(xmm0, xmm1);      /* make a copy */ \
    \
    punpcklqdq_r2r(xmm4, xmm0);  /* xmm0=h0g0f0e0d0c0b0a0 */ \
    punpckhqdq_r2r(xmm4, xmm1);  /* xmm1=h1g1f1e1d1c1b1a1 */ \
    \
    movdqu_r2m(xmm0, *I(0));     /* save I(0) */ \
    movdqu_r2m(xmm1, *I(1));     /* save I(1) */ \
    \
    movdqu_m2r(*I(6), xmm0);     /* load h3g3f3e3h2g2f2e2 */ \
    movdqu_r2r(xmm3, xmm1);      /* make a copy */ \
    \
    punpcklqdq_r2r(xmm0, xmm1);  /* xmm1=h2g2f2e2d2c2b2a2 */ \
    punpckhqdq_r2r(xmm0, xmm3);  /* xmm3=h3g3f3e3d3c3b3a3 */ \
    \
    movdqu_r2r(xmm2, xmm4);      /* make a copy */ \
    punpcklqdq_r2r(xmm5, xmm4);  /* xmm4=h4g4f4e4d4c4b4a4 */ \
    \
    punpckhqdq_r2r(xmm5, xmm2);  /* xmm2=h5g5f5e5d5c5b5a5 */ \
    movdqu_r2m(xmm1, *I(2));     /* save I(2) */ \
    \
    movdqu_r2m(xmm3, *I(3));     /* save I(3) */ \
    movdqu_r2m(xmm4, *I(4));     /* save I(4) */ \
    \
    movdqu_r2m(xmm2, *I(5));     /* save I(5) */ \
    movdqu_r2r(xmm7, xmm5);      /* make a copy */ \
    \
    punpcklqdq_r2r(xmm6, xmm5);  /* xmm5=h6g6f6e6d6c6b6a6 */ \
    punpckhqdq_r2r(xmm6, xmm7);  /* xmm7=h7g7f7e7d7c7b7a7 */ \
    \
    movdqu_r2m(xmm5, *I(6));     /* save I(6) */ \
    movdqu_r2m(xmm7, *I(7));     /* save I(7) */ \
    \
} /* End of Transpose Macro */
|
473 |
|
474 |
|
475 |
/* Dequantize and reorder an 8x8 block of coefficients in place.
 * eax = coefficient block, ebx = dequant matrix, ecx = SSE2_dequant_const
 * masks.  Each coefficient row is multiplied (pmullw) by the matrix row,
 * then individual words are extracted with the ecx masks, shifted and ORed
 * into the permuted output rows (a zigzag-style reordering; the two-digit
 * labels are row/column of the source coefficient, word 7 leftmost).
 * NOTE(review): this macro is currently disabled at the call site in
 * ff_vp3_idct_sse2 and references an undeclared `dequant_const_bytes`
 * through the `ecx` define — it cannot be re-enabled as-is.
 * (A few register labels in the comments were corrected from the original:
 * the "04 01" extract targets xmm6, its OR targets xmm0, "72 20" read
 * "72 70", and the row written at eax+80 carries 63, not 64.) */
#define SSE2_Dequantize() { \
    movdqu_m2r(*(eax), xmm0); \
    \
    pmullw_m2r(*(ebx), xmm0);       /* xmm0 = 07 06 05 04 03 02 01 00 */ \
    movdqu_m2r(*(eax + 16), xmm1); \
    \
    pmullw_m2r(*(ebx + 16), xmm1);  /* xmm1 = 17 16 15 14 13 12 11 10 */ \
    pshuflw_r2r(xmm0, xmm3, 0x078); /* xmm3 = 07 06 05 04 01 03 02 00 */ \
    \
    movdqu_r2r(xmm1, xmm2);         /* xmm2 = 17 16 15 14 13 12 11 10 */ \
    movdqu_m2r(*(ecx), xmm7);       /* xmm7 = -- -- -- -- -- FF FF -- */ \
    \
    movdqu_m2r(*(eax + 32), xmm4); \
    movdqu_m2r(*(eax + 64), xmm5); \
    \
    pmullw_m2r(*(ebx + 32), xmm4);  /* xmm4 = 27 26 25 24 23 22 21 20 */ \
    pmullw_m2r(*(ebx + 64), xmm5);  /* xmm5 = 47 46 45 44 43 42 41 40 */ \
    \
    movdqu_m2r(*(ecx + 16), xmm6);  /* xmm6 = -- -- FF FF -- -- -- -- */ \
    pand_r2r(xmm2, xmm7);           /* xmm7 = -- -- -- -- -- 12 11 -- */ \
    \
    pand_r2r(xmm4, xmm6);           /* xmm6 = -- -- 25 24 -- -- -- -- */ \
    pxor_r2r(xmm7, xmm2);           /* xmm2 = 17 16 15 14 13 -- -- 10 */ \
    \
    pxor_r2r(xmm6, xmm4);           /* xmm4 = 27 26 -- -- 23 22 21 20 */ \
    pslldq_i2r(4, xmm7);            /* xmm7 = -- -- -- 12 11 -- -- -- */ \
    \
    pslldq_i2r(2, xmm6);            /* xmm6 = -- 25 24 -- -- -- -- -- */ \
    por_r2r(xmm6, xmm7);            /* xmm7 = -- 25 24 12 11 -- -- -- */ \
    \
    movdqu_m2r(*(ecx + 32), xmm0);  /* xmm0 = -- -- -- -- -- FF FF FF */ \
    movdqu_m2r(*(ecx + 48), xmm6);  /* xmm6 = -- -- -- -- FF -- -- -- */ \
    \
    pand_r2r(xmm3, xmm0);           /* xmm0 = -- -- -- -- -- 03 02 00 */ \
    pand_r2r(xmm5, xmm6);           /* xmm6 = -- -- -- -- 43 -- -- -- */ \
    \
    pxor_r2r(xmm0, xmm3);           /* xmm3 = 07 06 05 04 01 -- -- -- */ \
    pxor_r2r(xmm6, xmm5);           /* xmm5 = 47 46 45 44 -- 42 41 40 */ \
    \
    por_r2r(xmm7, xmm0);            /* xmm0 = -- 25 24 12 11 03 02 00 */ \
    pslldq_i2r(8, xmm6);            /* xmm6 = 43 -- -- -- -- -- -- -- */ \
    \
    por_r2r(xmm6, xmm0);            /* xmm0 = 43 25 24 12 11 03 02 00 */ \
    /* 02345 in use */ \
    \
    movdqu_m2r(*(ecx + 64), xmm1);  /* xmm1 = -- -- -- FF FF -- -- -- */ \
    pshuflw_r2r(xmm5, xmm5, 0x0B4); /* xmm5 = 47 46 45 44 42 -- 41 40 */ \
    \
    movdqu_r2r(xmm1, xmm7);         /* xmm7 = -- -- -- FF FF -- -- -- */ \
    movdqu_r2r(xmm1, xmm6);         /* xmm6 = -- -- -- FF FF -- -- -- */ \
    \
    movdqu_r2m(xmm0, *(eax));       /* write 43 25 24 12 11 03 02 00 */ \
    pshufhw_r2r(xmm4, xmm4, 0x0C2); /* xmm4 = 27 -- -- 26 23 22 21 20 */ \
    \
    pand_r2r(xmm4, xmm7);           /* xmm7 = -- -- -- 26 23 -- -- -- */ \
    pand_r2r(xmm5, xmm1);           /* xmm1 = -- -- -- 44 42 -- -- -- */ \
    \
    pxor_r2r(xmm7, xmm4);           /* xmm4 = 27 -- -- -- -- 22 21 20 */ \
    pxor_r2r(xmm1, xmm5);           /* xmm5 = 47 46 45 -- -- -- 41 40 */ \
    \
    pshuflw_r2r(xmm2, xmm2, 0x0C6); /* xmm2 = 17 16 15 14 13 10 -- -- */ \
    movdqu_r2r(xmm6, xmm0);         /* xmm0 = -- -- -- FF FF -- -- -- */ \
    \
    pslldq_i2r(2, xmm7);            /* xmm7 = -- -- 26 23 -- -- -- -- */ \
    pslldq_i2r(6, xmm1);            /* xmm1 = 44 42 -- -- -- -- -- -- */ \
    \
    psrldq_i2r(2, xmm0);            /* xmm0 = -- -- -- -- FF FF -- -- */ \
    pand_r2r(xmm3, xmm6);           /* xmm6 = -- -- -- 04 01 -- -- -- */ \
    \
    pand_r2r(xmm2, xmm0);           /* xmm0 = -- -- -- -- 13 10 -- -- */ \
    pxor_r2r(xmm6, xmm3);           /* xmm3 = 07 06 05 -- -- -- -- -- */ \
    \
    pxor_r2r(xmm0, xmm2);           /* xmm2 = 17 16 15 14 -- -- -- -- */ \
    psrldq_i2r(6, xmm6);            /* xmm6 = -- -- -- -- -- -- 04 01 */ \
    \
    por_r2r(xmm7, xmm1);            /* xmm1 = 44 42 26 23 -- -- -- -- */ \
    por_r2r(xmm6, xmm0);            /* xmm0 = -- -- -- -- 13 10 04 01 */ \
    /* 12345 in use */ \
    por_r2r(xmm0, xmm1);            /* xmm1 = 44 42 26 23 13 10 04 01 */ \
    pshuflw_r2r(xmm4, xmm4, 0x093); /* xmm4 = 27 -- -- -- 22 21 20 -- */ \
    \
    pshufhw_r2r(xmm4, xmm4, 0x093); /* xmm4 = -- -- -- 27 22 21 20 -- */ \
    movdqu_r2m(xmm1, *(eax + 16));  /* write 44 42 26 23 13 10 04 01 */ \
    \
    pshufhw_r2r(xmm3, xmm3, 0x0D2); /* xmm3 = 07 05 -- 06 -- -- -- -- */ \
    movdqu_m2r(*(ecx + 64), xmm0);  /* xmm0 = -- -- -- FF FF -- -- -- */ \
    \
    pand_r2r(xmm3, xmm0);           /* xmm0 = -- -- -- 06 -- -- -- -- */ \
    psrldq_i2r(12, xmm3);           /* xmm3 = -- -- -- -- -- -- 07 05 */ \
    \
    psrldq_i2r(8, xmm0);            /* xmm0 = -- -- -- -- -- -- -- 06 */ \
    \
    movdqu_m2r(*(ecx + 64), xmm6);  /* xmm6 = -- -- -- FF FF -- -- -- */ \
    movdqu_m2r(*(ecx + 96), xmm7);  /* xmm7 = -- -- -- -- FF FF -- -- */ \
    \
    pand_r2r(xmm4, xmm6);           /* xmm6 = -- -- -- 27 22 -- -- -- */ \
    pxor_r2r(xmm6, xmm4);           /* xmm4 = -- -- -- -- -- 21 20 -- */ \
    \
    por_r2r(xmm6, xmm3);            /* xmm3 = -- -- -- 27 22 -- 07 05 */ \
    pand_r2r(xmm4, xmm7);           /* xmm7 = -- -- -- -- -- 21 -- -- */ \
    \
    por_r2r(xmm7, xmm0);            /* xmm0 = -- -- -- -- -- 21 -- 06 */ \
    pxor_r2r(xmm7, xmm4);           /* xmm4 = -- -- -- -- -- -- 20 -- */ \
    \
    movdqu_m2r(*(ecx + 16), xmm6);  /* xmm6 = -- -- FF FF -- -- -- -- */ \
    movdqu_m2r(*(ecx + 64), xmm1);  /* xmm1 = -- -- -- FF FF -- -- -- */ \
    \
    pand_r2r(xmm2, xmm6);           /* xmm6 = -- -- 15 14 -- -- -- -- */ \
    pand_r2r(xmm6, xmm1);           /* xmm1 = -- -- -- 14 -- -- -- -- */ \
    \
    pxor_r2r(xmm6, xmm2);           /* xmm2 = 17 16 -- -- -- -- -- -- */ \
    pxor_r2r(xmm1, xmm6);           /* xmm6 = -- -- 15 -- -- -- -- -- */ \
    \
    psrldq_i2r(4, xmm1);            /* xmm1 = -- -- -- -- -- 14 -- -- */ \
    \
    psrldq_i2r(8, xmm6);            /* xmm6 = -- -- -- -- -- -- 15 -- */ \
    por_r2r(xmm1, xmm3);            /* xmm3 = -- -- -- 27 22 14 07 05 */ \
    \
    por_r2r(xmm6, xmm0);            /* xmm0 = -- -- -- -- -- 21 15 06 */ \
    pshufhw_r2r(xmm5, xmm5, 0x0E1); /* xmm5 = 47 46 -- 45 -- -- 41 40 */ \
    \
    movdqu_m2r(*(ecx + 64), xmm1);  /* xmm1 = -- -- -- FF FF -- -- -- */ \
    pshuflw_r2r(xmm5, xmm5, 0x072); /* xmm5 = 47 46 -- 45 41 -- 40 -- */ \
    \
    movdqu_r2r(xmm1, xmm6);         /* xmm6 = -- -- -- FF FF -- -- -- */ \
    pand_r2r(xmm5, xmm1);           /* xmm1 = -- -- -- 45 41 -- -- -- */ \
    \
    pxor_r2r(xmm1, xmm5);           /* xmm5 = 47 46 -- -- -- -- 40 -- */ \
    pslldq_i2r(4, xmm1);            /* xmm1 = -- 45 41 -- -- -- -- -- */ \
    \
    pshufd_r2r(xmm5, xmm5, 0x09C);  /* xmm5 = -- -- -- -- 47 46 40 -- */ \
    por_r2r(xmm1, xmm3);            /* xmm3 = -- 45 41 27 22 14 07 05 */ \
    \
    movdqu_m2r(*(eax + 96), xmm1);  /* xmm1 = 67 66 65 64 63 62 61 60 */ \
    pmullw_m2r(*(ebx + 96), xmm1); \
    \
    movdqu_m2r(*(ecx), xmm7);       /* xmm7 = -- -- -- -- -- FF FF -- */ \
    \
    psrldq_i2r(8, xmm6);            /* xmm6 = -- -- -- -- -- -- -- FF */ \
    pand_r2r(xmm5, xmm7);           /* xmm7 = -- -- -- -- -- 46 40 -- */ \
    \
    pand_r2r(xmm1, xmm6);           /* xmm6 = -- -- -- -- -- -- -- 60 */ \
    pxor_r2r(xmm7, xmm5);           /* xmm5 = -- -- -- -- 47 -- -- -- */ \
    \
    pxor_r2r(xmm6, xmm1);           /* xmm1 = 67 66 65 64 63 62 61 -- */ \
    pslldq_i2r(2, xmm5);            /* xmm5 = -- -- -- 47 -- -- -- -- */ \
    \
    pslldq_i2r(14, xmm6);           /* xmm6 = 60 -- -- -- -- -- -- -- */ \
    por_r2r(xmm5, xmm4);            /* xmm4 = -- -- -- 47 -- -- 20 -- */ \
    \
    por_r2r(xmm6, xmm3);            /* xmm3 = 60 45 41 27 22 14 07 05 */ \
    pslldq_i2r(6, xmm7);            /* xmm7 = -- -- 46 40 -- -- -- -- */ \
    \
    movdqu_r2m(xmm3, *(eax+32));    /* write 60 45 41 27 22 14 07 05 */ \
    por_r2r(xmm7, xmm0);            /* xmm0 = -- -- 46 40 -- 21 15 06 */ \
    /* 0, 1, 2, 4 in use */ \
    movdqu_m2r(*(eax + 48), xmm3);  /* xmm3 = 37 36 35 34 33 32 31 30 */ \
    movdqu_m2r(*(eax + 80), xmm5);  /* xmm5 = 57 56 55 54 53 52 51 50 */ \
    \
    pmullw_m2r(*(ebx + 48), xmm3); \
    pmullw_m2r(*(ebx + 80), xmm5); \
    \
    movdqu_m2r(*(ecx + 64), xmm6);  /* xmm6 = -- -- -- FF FF -- -- -- */ \
    movdqu_m2r(*(ecx + 64), xmm7);  /* xmm7 = -- -- -- FF FF -- -- -- */ \
    \
    psrldq_i2r(8, xmm6);            /* xmm6 = -- -- -- -- -- -- -- FF */ \
    pslldq_i2r(8, xmm7);            /* xmm7 = FF -- -- -- -- -- -- -- */ \
    \
    pand_r2r(xmm3, xmm6);           /* xmm6 = -- -- -- -- -- -- -- 30 */ \
    pand_r2r(xmm5, xmm7);           /* xmm7 = 57 -- -- -- -- -- -- -- */ \
    \
    pxor_r2r(xmm6, xmm3);           /* xmm3 = 37 36 35 34 33 32 31 -- */ \
    pxor_r2r(xmm7, xmm5);           /* xmm5 = -- 56 55 54 53 52 51 50 */ \
    \
    pslldq_i2r(6, xmm6);            /* xmm6 = -- -- -- -- 30 -- -- -- */ \
    psrldq_i2r(2, xmm7);            /* xmm7 = -- 57 -- -- -- -- -- -- */ \
    \
    por_r2r(xmm7, xmm6);            /* xmm6 = -- 57 -- -- 30 -- -- -- */ \
    movdqu_m2r(*(ecx), xmm7);       /* xmm7 = -- -- -- -- -- FF FF -- */ \
    \
    por_r2r(xmm6, xmm0);            /* xmm0 = -- 57 46 40 30 21 15 06 */ \
    psrldq_i2r(2, xmm7);            /* xmm7 = -- -- -- -- -- -- FF FF */ \
    \
    movdqu_r2r(xmm2, xmm6);         /* xmm6 = 17 16 -- -- -- -- -- -- */ \
    pand_r2r(xmm1, xmm7);           /* xmm7 = -- -- -- -- -- -- 61 -- */ \
    \
    pslldq_i2r(2, xmm6);            /* xmm6 = 16 -- -- -- -- -- -- -- */ \
    psrldq_i2r(14, xmm2);           /* xmm2 = -- -- -- -- -- -- -- 17 */ \
    \
    pxor_r2r(xmm7, xmm1);           /* xmm1 = 67 66 65 64 63 62 -- -- */ \
    pslldq_i2r(12, xmm7);           /* xmm7 = 61 -- -- -- -- -- -- -- */ \
    \
    psrldq_i2r(14, xmm6);           /* xmm6 = -- -- -- -- -- -- -- 16 */ \
    por_r2r(xmm6, xmm4);            /* xmm4 = -- -- -- 47 -- -- 20 16 */ \
    \
    por_r2r(xmm7, xmm0);            /* xmm0 = 61 57 46 40 30 21 15 06 */ \
    movdqu_m2r(*(ecx), xmm6);       /* xmm6 = -- -- -- -- -- FF FF -- */ \
    \
    psrldq_i2r(2, xmm6);            /* xmm6 = -- -- -- -- -- -- FF FF */ \
    movdqu_r2m(xmm0, *(eax+48));    /* write 61 57 46 40 30 21 15 06 */ \
    /* 1, 2, 3, 4, 5 in use */ \
    movdqu_m2r(*(ecx), xmm0);       /* xmm0 = -- -- -- -- -- FF FF -- */ \
    pand_r2r(xmm3, xmm6);           /* xmm6 = -- -- -- -- -- -- 31 -- */ \
    \
    movdqu_r2r(xmm3, xmm7);         /* xmm7 = 37 36 35 34 33 32 31 -- */ \
    pxor_r2r(xmm6, xmm3);           /* xmm3 = 37 36 35 34 33 32 -- -- */ \
    \
    pslldq_i2r(2, xmm3);            /* xmm3 = 36 35 34 33 32 -- -- -- */ \
    pand_r2r(xmm1, xmm0);           /* xmm0 = -- -- -- -- -- 62 -- -- */ \
    \
    psrldq_i2r(14, xmm7);           /* xmm7 = -- -- -- -- -- -- -- 37 */ \
    pxor_r2r(xmm0, xmm1);           /* xmm1 = 67 66 65 64 63 -- -- -- */ \
    \
    por_r2r(xmm7, xmm6);            /* xmm6 = -- -- -- -- -- -- 31 37 */ \
    movdqu_m2r(*(ecx + 64), xmm7);  /* xmm7 = -- -- -- FF FF -- -- -- */ \
    \
    pshuflw_r2r(xmm6, xmm6, 0x01E); /* xmm6 = -- -- -- -- 37 31 -- -- */ \
    pslldq_i2r(6, xmm7);            /* xmm7 = FF FF -- -- -- -- -- -- */ \
    \
    por_r2r(xmm6, xmm4);            /* xmm4 = -- -- -- 47 37 31 20 16 */ \
    pand_r2r(xmm5, xmm7);           /* xmm7 = -- 56 -- -- -- -- -- -- */ \
    \
    pslldq_i2r(8, xmm0);            /* xmm0 = -- 62 -- -- -- -- -- -- */ \
    pxor_r2r(xmm7, xmm5);           /* xmm5 = -- -- 55 54 53 52 51 50 */ \
    \
    psrldq_i2r(2, xmm7);            /* xmm7 = -- -- 56 -- -- -- -- -- */ \
    \
    pshufhw_r2r(xmm3, xmm3, 0x087); /* xmm3 = 35 33 34 36 32 -- -- -- */ \
    por_r2r(xmm7, xmm0);            /* xmm0 = -- 62 56 -- -- -- -- -- */ \
    \
    movdqu_m2r(*(eax + 112), xmm7); /* xmm7 = 77 76 75 74 73 72 71 70 */ \
    pmullw_m2r(*(ebx + 112), xmm7); \
    \
    movdqu_m2r(*(ecx + 64), xmm6);  /* xmm6 = -- -- -- FF FF -- -- -- */ \
    por_r2r(xmm0, xmm4);            /* xmm4 = -- 62 56 47 37 31 20 16 */ \
    \
    pshuflw_r2r(xmm7, xmm7, 0x0E1); /* xmm7 = 77 76 75 74 73 72 70 71 */ \
    psrldq_i2r(8, xmm6);            /* xmm6 = -- -- -- -- -- -- -- FF */ \
    \
    movdqu_m2r(*(ecx + 64), xmm0);  /* xmm0 = -- -- -- FF FF -- -- -- */ \
    pand_r2r(xmm7, xmm6);           /* xmm6 = -- -- -- -- -- -- -- 71 */ \
    \
    pand_r2r(xmm3, xmm0);           /* xmm0 = -- -- -- 36 32 -- -- -- */ \
    pxor_r2r(xmm6, xmm7);           /* xmm7 = 77 76 75 74 73 72 70 -- */ \
    \
    pxor_r2r(xmm0, xmm3);           /* xmm3 = 35 33 34 -- -- -- -- -- */ \
    pslldq_i2r(14, xmm6);           /* xmm6 = 71 -- -- -- -- -- -- -- */ \
    \
    psrldq_i2r(4, xmm0);            /* xmm0 = -- -- -- -- -- 36 32 -- */ \
    por_r2r(xmm6, xmm4);            /* xmm4 = 71 62 56 47 37 31 20 16 */ \
    \
    por_r2r(xmm0, xmm2);            /* xmm2 = -- -- -- -- -- 36 32 17 */ \
    movdqu_r2m(xmm4, *(eax + 64));  /* write 71 62 56 47 37 31 20 16 */ \
    /* 1, 2, 3, 5, 7 in use */ \
    movdqu_m2r(*(ecx + 80), xmm6);  /* xmm6 = -- -- FF -- -- -- -- FF */ \
    pshufhw_r2r(xmm7, xmm7, 0x0D2); /* xmm7 = 77 75 74 76 73 72 70 -- */ \
    \
    movdqu_m2r(*(ecx), xmm4);       /* xmm4 = -- -- -- -- -- FF FF -- */ \
    movdqu_m2r(*(ecx+48), xmm0);    /* xmm0 = -- -- -- -- FF -- -- -- */ \
    \
    pand_r2r(xmm5, xmm6);           /* xmm6 = -- -- 55 -- -- -- -- 50 */ \
    pand_r2r(xmm7, xmm4);           /* xmm4 = -- -- -- -- -- 72 70 -- */ \
    \
    pand_r2r(xmm1, xmm0);           /* xmm0 = -- -- -- -- 63 -- -- -- */ \
    pxor_r2r(xmm6, xmm5);           /* xmm5 = -- -- -- 54 53 52 51 -- */ \
    \
    pxor_r2r(xmm4, xmm7);           /* xmm7 = 77 75 74 76 73 -- -- -- */ \
    pxor_r2r(xmm0, xmm1);           /* xmm1 = 67 66 65 64 -- -- -- -- */ \
    \
    pshuflw_r2r(xmm6, xmm6, 0x02B); /* xmm6 = -- -- 55 -- 50 -- -- -- */ \
    pslldq_i2r(10, xmm4);           /* xmm4 = 72 70 -- -- -- -- -- -- */ \
    \
    pshufhw_r2r(xmm6, xmm6, 0x0B1); /* xmm6 = -- -- -- 55 50 -- -- -- */ \
    pslldq_i2r(4, xmm0);            /* xmm0 = -- -- 63 -- -- -- -- -- */ \
    \
    por_r2r(xmm4, xmm6);            /* xmm6 = 72 70 -- 55 50 -- -- -- */ \
    por_r2r(xmm0, xmm2);            /* xmm2 = -- -- 63 -- -- 36 32 17 */ \
    \
    por_r2r(xmm6, xmm2);            /* xmm2 = 72 70 63 55 50 36 32 17 */ \
    pshufhw_r2r(xmm1, xmm1, 0x0C9); /* xmm1 = 67 64 66 65 -- -- -- -- */ \
    \
    movdqu_r2r(xmm3, xmm6);         /* xmm6 = 35 33 34 -- -- -- -- -- */ \
    movdqu_r2m(xmm2, *(eax+80));    /* write 72 70 63 55 50 36 32 17 */ \
    \
    psrldq_i2r(12, xmm6);           /* xmm6 = -- -- -- -- -- -- 35 33 */ \
    pslldq_i2r(4, xmm3);            /* xmm3 = 34 -- -- -- -- -- -- -- */ \
    \
    pshuflw_r2r(xmm5, xmm5, 0x04E); /* xmm5 = -- -- -- 54 51 -- 53 52 */ \
    movdqu_r2r(xmm7, xmm4);         /* xmm4 = 77 75 74 76 73 -- -- -- */ \
    \
    movdqu_r2r(xmm5, xmm2);         /* xmm2 = -- -- -- 54 51 -- 53 52 */ \
    psrldq_i2r(10, xmm7);           /* xmm7 = -- -- -- -- -- 77 75 74 */ \
    \
    pslldq_i2r(6, xmm4);            /* xmm4 = 76 73 -- -- -- -- -- -- */ \
    pslldq_i2r(12, xmm2);           /* xmm2 = 53 52 -- -- -- -- -- -- */ \
    \
    movdqu_r2r(xmm1, xmm0);         /* xmm0 = 67 64 66 65 -- -- -- -- */ \
    psrldq_i2r(12, xmm1);           /* xmm1 = -- -- -- -- -- -- 67 64 */ \
    \
    psrldq_i2r(6, xmm5);            /* xmm5 = -- -- -- -- -- -- 54 51 */ \
    psrldq_i2r(14, xmm3);           /* xmm3 = -- -- -- -- -- -- -- 34 */ \
    \
    pslldq_i2r(10, xmm7);           /* xmm7 = 77 75 74 -- -- -- -- -- */ \
    por_r2r(xmm6, xmm4);            /* xmm4 = 76 73 -- -- -- -- 35 33 */ \
    \
    psrldq_i2r(10, xmm2);           /* xmm2 = -- -- -- -- -- 53 52 -- */ \
    pslldq_i2r(4, xmm0);            /* xmm0 = 66 65 -- -- -- -- -- -- */ \
    \
    pslldq_i2r(8, xmm1);            /* xmm1 = -- -- 67 64 -- -- -- -- */ \
    por_r2r(xmm7, xmm3);            /* xmm3 = 77 75 74 -- -- -- -- 34 */ \
    \
    psrldq_i2r(6, xmm0);            /* xmm0 = -- -- -- 66 65 -- -- -- */ \
    pslldq_i2r(4, xmm5);            /* xmm5 = -- -- -- -- 54 51 -- -- */ \
    \
    por_r2r(xmm1, xmm4);            /* xmm4 = 76 73 67 64 -- -- 35 33 */ \
    por_r2r(xmm2, xmm3);            /* xmm3 = 77 75 74 -- -- 53 52 34 */ \
    \
    por_r2r(xmm5, xmm4);            /* xmm4 = 76 73 67 64 54 51 35 33 */ \
    por_r2r(xmm0, xmm3);            /* xmm3 = 77 75 74 66 65 53 52 34 */ \
    \
    movdqu_r2m(xmm4, *(eax+96));    /* write 76 73 67 64 54 51 35 33 */ \
    movdqu_r2m(xmm3, *(eax+112));   /* write 77 75 74 66 65 53 52 34 */ \
    \
} /* end of SSE2_Dequantize Macro */
|
799 |
|
800 |
|
801 |
/**
 * Perform the full in-place 2-D VP3 IDCT on a block of 64 int16_t
 * coefficients: a row pass (no descale), a transpose, then a column pass
 * that rounds (+8) and shifts (>>4) the final values.
 *
 * @param input_data 8x8 block of coefficients; overwritten with the result.
 *
 * Fix vs. original: the I()/O()/C() helper macros now parenthesize their
 * argument.  The originals expanded e.g. I(i) to (eax + 16 * i), which
 * mis-expands for any expression argument such as I(x + 1); all current
 * call sites pass literals, so behavior is unchanged.
 */
void ff_vp3_idct_sse2(int16_t *input_data)
{
    unsigned char *input_bytes = (unsigned char *)input_data;
    unsigned char *output_data_bytes = (unsigned char *)input_data;
    unsigned char *idct_data_bytes = (unsigned char *)SSE2_idct_data;
    unsigned char *Eight = (unsigned char *)eight_data;

/* Register-style aliases expected by the SSE2_* macros. */
#define eax input_bytes
//#define ebx dequant_matrix_bytes
#define ecx dequant_const_bytes      /* only used by the disabled SSE2_Dequantize() */
#define edx idct_data_bytes

/* Row i of the coefficient block, output block, and cosine table (C is
 * 1-based).  Arguments are parenthesized so expression arguments expand
 * correctly. */
#define I(i) (eax + 16 * (i))
#define O(i) (ebx + 16 * (i))
#define C(i) (edx + 16 * ((i) - 1))

    // SSE2_Dequantize();

#undef ebx
#define ebx output_data_bytes

    SSE2_Row_IDCT();

    SSE2_Transpose();

    SSE2_Column_IDCT();
}