ffmpeg / libavcodec / i386 / idct_mmx.c @ 5509bffa
History | View | Annotate | Download (21.5 KB)
1 |
/*
|
---|---|
2 |
* Note: For libavcodec, this code can also be used under the LGPL license
|
3 |
*/
|
4 |
/*
|
5 |
* idct_mmx.c
|
6 |
* Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
|
7 |
*
|
8 |
* This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
|
9 |
*
|
10 |
* mpeg2dec is free software; you can redistribute it and/or modify
|
11 |
* it under the terms of the GNU General Public License as published by
|
12 |
* the Free Software Foundation; either version 2 of the License, or
|
13 |
* (at your option) any later version.
|
14 |
*
|
15 |
* mpeg2dec is distributed in the hope that it will be useful,
|
16 |
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
17 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
18 |
* GNU General Public License for more details.
|
19 |
*
|
20 |
* You should have received a copy of the GNU General Public License
|
21 |
* along with this program; if not, write to the Free Software
|
22 |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
23 |
*/
|
24 |
|
25 |
#include "common.h" |
26 |
#include "../dsputil.h" |
27 |
|
28 |
#include "mmx.h" |
29 |
|
30 |
/* Request the given byte alignment for an object (GCC attribute syntax). */
#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))

/* Arithmetic right shift applied to the 32-bit sums of the row pass. */
#define ROW_SHIFT 11
/* Arithmetic right shift applied to the 16-bit results of the column pass. */
#define COL_SHIFT 6

/*
 * round(bias):   (bias + 0.5) scaled by 2^ROW_SHIFT and truncated to int,
 *                i.e. a rounding term expressed in the row pass fixed-point
 *                format.
 * rounder(bias): brace initializer holding that term twice, one copy per
 *                32-bit half of a 64-bit MMX register.
 */
#define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))
#define rounder(bias) {round (bias), round (bias)}
|
37 |
|
38 |
#if 0
/*
 * C row IDCT - it is only here to document the MMXEXT and MMX versions.
 *
 * Transforms the eight coefficients at row[offset .. offset+7] in place.
 *
 *   row     - coefficient block
 *   offset  - index of the first coefficient of the row to transform
 *   table   - cosine constants, read here as table[k] == Ck for k = 1..7
 *             (the SIMD tables built by mmxext_table/mmx_table hold the same
 *             constants in an interleaved layout instead)
 *   rounder - rounding term added to every even-part sum before the shift
 */
static inline void idct_row (int16_t * row, int offset,
                             const int16_t * table, const int32_t * rounder)
{
    int C1, C2, C3, C4, C5, C6, C7;
    int a0, a1, a2, a3, b0, b1, b2, b3;

    row += offset;

    C1 = table[1];
    C2 = table[2];
    C3 = table[3];
    C4 = table[4];
    C5 = table[5];
    C6 = table[6];
    C7 = table[7];

    /* even part: built from x0, x2, x4, x6 and carries the rounder */
    a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder;
    a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder;
    a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder;
    a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder;

    /* odd part: built from x1, x3, x5, x7 */
    b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
    b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
    b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
    b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];

    /* butterfly: outputs k and 7-k are the sum and difference of ak, bk */
    row[0] = (a0 + b0) >> ROW_SHIFT;
    row[1] = (a1 + b1) >> ROW_SHIFT;
    row[2] = (a2 + b2) >> ROW_SHIFT;
    row[3] = (a3 + b3) >> ROW_SHIFT;
    row[4] = (a3 - b3) >> ROW_SHIFT;
    row[5] = (a2 - b2) >> ROW_SHIFT;
    row[6] = (a1 - b1) >> ROW_SHIFT;
    row[7] = (a0 - b0) >> ROW_SHIFT;
}
#endif
|
76 |
|
77 |
|
78 |
/* MMXEXT row IDCT */
|
79 |
|
80 |
/*
 * Builds the 32-entry coefficient table consumed by the MMXEXT row pass
 * from the seven cosine constants c1..c7.  Entries are grouped four per
 * 64-bit load; the row code reads the groups at table+0, +4, +8, ... +28.
 * Every argument is parenthesized so that expression arguments negate
 * correctly.
 */
#define mmxext_table(c1,c2,c3,c4,c5,c6,c7)      {  (c4),  (c2), -(c4), -(c2),   \
                                                   (c4),  (c6),  (c4),  (c6),   \
                                                   (c1),  (c3), -(c1), -(c5),   \
                                                   (c5),  (c7),  (c3), -(c7),   \
                                                   (c4), -(c6),  (c4), -(c6),   \
                                                  -(c4),  (c2),  (c4), -(c2),   \
                                                   (c5), -(c1),  (c3), -(c1),   \
                                                   (c7),  (c3),  (c7), -(c5) }
88 |
|
89 |
static inline void mmxext_row_head (int16_t * row, int offset, const int16_t * table) |
90 |
{ |
91 |
movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0
|
92 |
|
93 |
movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 |
94 |
movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0
|
95 |
|
96 |
movq_m2r (*table, mm3); // mm3 = -C2 -C4 C2 C4
|
97 |
movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1
|
98 |
|
99 |
movq_m2r (*(table+4), mm4); // mm4 = C6 C4 C6 C4 |
100 |
pmaddwd_r2r (mm0, mm3); // mm3 = -C4*x4-C2*x6 C4*x0+C2*x2
|
101 |
|
102 |
pshufw_r2r (mm2, mm2, 0x4e); // mm2 = x2 x0 x6 x4 |
103 |
} |
104 |
|
105 |
static inline void mmxext_row (const int16_t * table, const int32_t * rounder) |
106 |
{ |
107 |
movq_m2r (*(table+8), mm1); // mm1 = -C5 -C1 C3 C1 |
108 |
pmaddwd_r2r (mm2, mm4); // mm4 = C4*x0+C6*x2 C4*x4+C6*x6
|
109 |
|
110 |
pmaddwd_m2r (*(table+16), mm0); // mm0 = C4*x4-C6*x6 C4*x0-C6*x2 |
111 |
pshufw_r2r (mm6, mm6, 0x4e); // mm6 = x3 x1 x7 x5 |
112 |
|
113 |
movq_m2r (*(table+12), mm7); // mm7 = -C7 C3 C7 C5 |
114 |
pmaddwd_r2r (mm5, mm1); // mm1 = -C1*x5-C5*x7 C1*x1+C3*x3
|
115 |
|
116 |
paddd_m2r (*rounder, mm3); // mm3 += rounder
|
117 |
pmaddwd_r2r (mm6, mm7); // mm7 = C3*x1-C7*x3 C5*x5+C7*x7
|
118 |
|
119 |
pmaddwd_m2r (*(table+20), mm2); // mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 |
120 |
paddd_r2r (mm4, mm3); // mm3 = a1 a0 + rounder
|
121 |
|
122 |
pmaddwd_m2r (*(table+24), mm5); // mm5 = C3*x5-C1*x7 C5*x1-C1*x3 |
123 |
movq_r2r (mm3, mm4); // mm4 = a1 a0 + rounder
|
124 |
|
125 |
pmaddwd_m2r (*(table+28), mm6); // mm6 = C7*x1-C5*x3 C7*x5+C3*x7 |
126 |
paddd_r2r (mm7, mm1); // mm1 = b1 b0
|
127 |
|
128 |
paddd_m2r (*rounder, mm0); // mm0 += rounder
|
129 |
psubd_r2r (mm1, mm3); // mm3 = a1-b1 a0-b0 + rounder
|
130 |
|
131 |
psrad_i2r (ROW_SHIFT, mm3); // mm3 = y6 y7
|
132 |
paddd_r2r (mm4, mm1); // mm1 = a1+b1 a0+b0 + rounder
|
133 |
|
134 |
paddd_r2r (mm2, mm0); // mm0 = a3 a2 + rounder
|
135 |
psrad_i2r (ROW_SHIFT, mm1); // mm1 = y1 y0
|
136 |
|
137 |
paddd_r2r (mm6, mm5); // mm5 = b3 b2
|
138 |
movq_r2r (mm0, mm4); // mm4 = a3 a2 + rounder
|
139 |
|
140 |
paddd_r2r (mm5, mm0); // mm0 = a3+b3 a2+b2 + rounder
|
141 |
psubd_r2r (mm5, mm4); // mm4 = a3-b3 a2-b2 + rounder
|
142 |
} |
143 |
|
144 |
static inline void mmxext_row_tail (int16_t * row, int store) |
145 |
{ |
146 |
psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2
|
147 |
|
148 |
psrad_i2r (ROW_SHIFT, mm4); // mm4 = y4 y5
|
149 |
|
150 |
packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0
|
151 |
|
152 |
packssdw_r2r (mm3, mm4); // mm4 = y6 y7 y4 y5
|
153 |
|
154 |
movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0
|
155 |
pshufw_r2r (mm4, mm4, 0xb1); // mm4 = y7 y6 y5 y4 |
156 |
|
157 |
/* slot */
|
158 |
|
159 |
movq_r2m (mm4, *(row+store+4)); // save y7 y6 y5 y4 |
160 |
} |
161 |
|
162 |
static inline void mmxext_row_mid (int16_t * row, int store, |
163 |
int offset, const int16_t * table) |
164 |
{ |
165 |
movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0
|
166 |
psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2
|
167 |
|
168 |
movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 |
169 |
psrad_i2r (ROW_SHIFT, mm4); // mm4 = y4 y5
|
170 |
|
171 |
packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0
|
172 |
movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1
|
173 |
|
174 |
packssdw_r2r (mm3, mm4); // mm4 = y6 y7 y4 y5
|
175 |
movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0
|
176 |
|
177 |
movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0
|
178 |
pshufw_r2r (mm4, mm4, 0xb1); // mm4 = y7 y6 y5 y4 |
179 |
|
180 |
movq_m2r (*table, mm3); // mm3 = -C2 -C4 C2 C4
|
181 |
movq_r2m (mm4, *(row+store+4)); // save y7 y6 y5 y4 |
182 |
|
183 |
pmaddwd_r2r (mm0, mm3); // mm3 = -C4*x4-C2*x6 C4*x0+C2*x2
|
184 |
|
185 |
movq_m2r (*(table+4), mm4); // mm4 = C6 C4 C6 C4 |
186 |
pshufw_r2r (mm2, mm2, 0x4e); // mm2 = x2 x0 x6 x4 |
187 |
} |
188 |
|
189 |
|
190 |
/* MMX row IDCT */
|
191 |
|
192 |
/*
 * Builds the 32-entry coefficient table consumed by the plain MMX row
 * pass from the seven cosine constants c1..c7.  Entries are grouped four
 * per 64-bit load; the row code reads the groups at table+0, +4, +8, ...
 * +28.  Every argument is parenthesized so that expression arguments
 * negate correctly.
 */
#define mmx_table(c1,c2,c3,c4,c5,c6,c7) {  (c4),  (c2),  (c4),  (c6),   \
                                           (c4),  (c6), -(c4), -(c2),   \
                                           (c1),  (c3),  (c3), -(c7),   \
                                           (c5),  (c7), -(c1), -(c5),   \
                                           (c4), -(c6),  (c4), -(c2),   \
                                          -(c4),  (c2),  (c4), -(c6),   \
                                           (c5), -(c1),  (c7), -(c5),   \
                                           (c7),  (c3),  (c3), -(c1) }
200 |
|
201 |
static inline void mmx_row_head (int16_t * row, int offset, const int16_t * table) |
202 |
{ |
203 |
movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0
|
204 |
|
205 |
movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 |
206 |
movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0
|
207 |
|
208 |
movq_m2r (*table, mm3); // mm3 = C6 C4 C2 C4
|
209 |
movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1
|
210 |
|
211 |
punpckldq_r2r (mm0, mm0); // mm0 = x2 x0 x2 x0
|
212 |
|
213 |
movq_m2r (*(table+4), mm4); // mm4 = -C2 -C4 C6 C4 |
214 |
pmaddwd_r2r (mm0, mm3); // mm3 = C4*x0+C6*x2 C4*x0+C2*x2
|
215 |
|
216 |
movq_m2r (*(table+8), mm1); // mm1 = -C7 C3 C3 C1 |
217 |
punpckhdq_r2r (mm2, mm2); // mm2 = x6 x4 x6 x4
|
218 |
} |
219 |
|
220 |
static inline void mmx_row (const int16_t * table, const int32_t * rounder) |
221 |
{ |
222 |
pmaddwd_r2r (mm2, mm4); // mm4 = -C4*x4-C2*x6 C4*x4+C6*x6
|
223 |
punpckldq_r2r (mm5, mm5); // mm5 = x3 x1 x3 x1
|
224 |
|
225 |
pmaddwd_m2r (*(table+16), mm0); // mm0 = C4*x0-C2*x2 C4*x0-C6*x2 |
226 |
punpckhdq_r2r (mm6, mm6); // mm6 = x7 x5 x7 x5
|
227 |
|
228 |
movq_m2r (*(table+12), mm7); // mm7 = -C5 -C1 C7 C5 |
229 |
pmaddwd_r2r (mm5, mm1); // mm1 = C3*x1-C7*x3 C1*x1+C3*x3
|
230 |
|
231 |
paddd_m2r (*rounder, mm3); // mm3 += rounder
|
232 |
pmaddwd_r2r (mm6, mm7); // mm7 = -C1*x5-C5*x7 C5*x5+C7*x7
|
233 |
|
234 |
pmaddwd_m2r (*(table+20), mm2); // mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 |
235 |
paddd_r2r (mm4, mm3); // mm3 = a1 a0 + rounder
|
236 |
|
237 |
pmaddwd_m2r (*(table+24), mm5); // mm5 = C7*x1-C5*x3 C5*x1-C1*x3 |
238 |
movq_r2r (mm3, mm4); // mm4 = a1 a0 + rounder
|
239 |
|
240 |
pmaddwd_m2r (*(table+28), mm6); // mm6 = C3*x5-C1*x7 C7*x5+C3*x7 |
241 |
paddd_r2r (mm7, mm1); // mm1 = b1 b0
|
242 |
|
243 |
paddd_m2r (*rounder, mm0); // mm0 += rounder
|
244 |
psubd_r2r (mm1, mm3); // mm3 = a1-b1 a0-b0 + rounder
|
245 |
|
246 |
psrad_i2r (ROW_SHIFT, mm3); // mm3 = y6 y7
|
247 |
paddd_r2r (mm4, mm1); // mm1 = a1+b1 a0+b0 + rounder
|
248 |
|
249 |
paddd_r2r (mm2, mm0); // mm0 = a3 a2 + rounder
|
250 |
psrad_i2r (ROW_SHIFT, mm1); // mm1 = y1 y0
|
251 |
|
252 |
paddd_r2r (mm6, mm5); // mm5 = b3 b2
|
253 |
movq_r2r (mm0, mm7); // mm7 = a3 a2 + rounder
|
254 |
|
255 |
paddd_r2r (mm5, mm0); // mm0 = a3+b3 a2+b2 + rounder
|
256 |
psubd_r2r (mm5, mm7); // mm7 = a3-b3 a2-b2 + rounder
|
257 |
} |
258 |
|
259 |
static inline void mmx_row_tail (int16_t * row, int store) |
260 |
{ |
261 |
psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2
|
262 |
|
263 |
psrad_i2r (ROW_SHIFT, mm7); // mm7 = y4 y5
|
264 |
|
265 |
packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0
|
266 |
|
267 |
packssdw_r2r (mm3, mm7); // mm7 = y6 y7 y4 y5
|
268 |
|
269 |
movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0
|
270 |
movq_r2r (mm7, mm4); // mm4 = y6 y7 y4 y5
|
271 |
|
272 |
pslld_i2r (16, mm7); // mm7 = y7 0 y5 0 |
273 |
|
274 |
psrld_i2r (16, mm4); // mm4 = 0 y6 0 y4 |
275 |
|
276 |
por_r2r (mm4, mm7); // mm7 = y7 y6 y5 y4
|
277 |
|
278 |
/* slot */
|
279 |
|
280 |
movq_r2m (mm7, *(row+store+4)); // save y7 y6 y5 y4 |
281 |
} |
282 |
|
283 |
static inline void mmx_row_mid (int16_t * row, int store, |
284 |
int offset, const int16_t * table) |
285 |
{ |
286 |
movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0
|
287 |
psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2
|
288 |
|
289 |
movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 |
290 |
psrad_i2r (ROW_SHIFT, mm7); // mm7 = y4 y5
|
291 |
|
292 |
packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0
|
293 |
movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1
|
294 |
|
295 |
packssdw_r2r (mm3, mm7); // mm7 = y6 y7 y4 y5
|
296 |
movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0
|
297 |
|
298 |
movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0
|
299 |
movq_r2r (mm7, mm1); // mm1 = y6 y7 y4 y5
|
300 |
|
301 |
punpckldq_r2r (mm0, mm0); // mm0 = x2 x0 x2 x0
|
302 |
psrld_i2r (16, mm7); // mm7 = 0 y6 0 y4 |
303 |
|
304 |
movq_m2r (*table, mm3); // mm3 = C6 C4 C2 C4
|
305 |
pslld_i2r (16, mm1); // mm1 = y7 0 y5 0 |
306 |
|
307 |
movq_m2r (*(table+4), mm4); // mm4 = -C2 -C4 C6 C4 |
308 |
por_r2r (mm1, mm7); // mm7 = y7 y6 y5 y4
|
309 |
|
310 |
movq_m2r (*(table+8), mm1); // mm1 = -C7 C3 C3 C1 |
311 |
punpckhdq_r2r (mm2, mm2); // mm2 = x6 x4 x6 x4
|
312 |
|
313 |
movq_r2m (mm7, *(row+store+4)); // save y7 y6 y5 y4 |
314 |
pmaddwd_r2r (mm0, mm3); // mm3 = C4*x0+C6*x2 C4*x0+C2*x2
|
315 |
} |
316 |
|
317 |
|
318 |
#if 0
// C column IDCT - it is only here to document the MMXEXT and MMX versions.
//
// Transforms, in place, the column made of col[offset + k*8], k = 0..7.
// Every intermediate is kept in 16 bits and clamped with S(), the scalar
// counterpart of the saturating adds/subtracts used by the MMX code.
static inline void idct_col (int16_t * col, int offset)
{
/* same constants as the MMX column IDCT */
#define T1 13036
#define T2 27146
#define T3 43790
#define C4 23170

/* multiplication - as implemented on mmx */
#define F(c,x) (((c) * (x)) >> 16)

/* saturation - it helps us handle torture test cases */
#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x))

    int16_t x0, x1, x2, x3, x4, x5, x6, x7;
    int16_t y0, y1, y2, y3, y4, y5, y6, y7;
    int16_t a0, a1, a2, a3, b0, b1, b2, b3;
    int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12;

    col += offset;

    x0 = col[0*8];
    x1 = col[1*8];
    x2 = col[2*8];
    x3 = col[3*8];
    x4 = col[4*8];
    x5 = col[5*8];
    x6 = col[6*8];
    x7 = col[7*8];

    /* even part */
    u04 = S (x0 + x4);
    v04 = S (x0 - x4);
    u26 = S (F (T2, x6) + x2);
    v26 = S (F (T2, x2) - x6);

    a0 = S (u04 + u26);
    a1 = S (v04 + v26);
    a2 = S (v04 - v26);
    a3 = S (u04 - u26);

    /* odd part */
    u17 = S (F (T1, x7) + x1);
    v17 = S (F (T1, x1) - x7);
    u35 = S (F (T3, x5) + x3);
    v35 = S (F (T3, x3) - x5);

    b0 = S (u17 + u35);
    b3 = S (v17 - v35);
    u12 = S (u17 - u35);
    v12 = S (v17 + v35);
    u12 = S (2 * F (C4, u12));
    v12 = S (2 * F (C4, v12));
    b1 = S (u12 + v12);
    b2 = S (u12 - v12);

    /* butterfly: outputs k and 7-k are the sum and difference of ak, bk */
    y0 = S (a0 + b0) >> COL_SHIFT;
    y1 = S (a1 + b1) >> COL_SHIFT;
    y2 = S (a2 + b2) >> COL_SHIFT;
    y3 = S (a3 + b3) >> COL_SHIFT;

    y4 = S (a3 - b3) >> COL_SHIFT;
    y5 = S (a2 - b2) >> COL_SHIFT;
    y6 = S (a1 - b1) >> COL_SHIFT;
    y7 = S (a0 - b0) >> COL_SHIFT;

    col[0*8] = y0;
    col[1*8] = y1;
    col[2*8] = y2;
    col[3*8] = y3;
    col[4*8] = y4;
    col[5*8] = y5;
    col[6*8] = y6;
    col[7*8] = y7;

#undef S
#undef F
#undef T1
#undef T2
#undef T3
#undef C4
}
#endif
|
388 |
|
389 |
|
390 |
// MMX column IDCT
|
391 |
static inline void idct_col (int16_t * col, int offset) |
392 |
{ |
393 |
#define T1 13036 |
394 |
#define T2 27146 |
395 |
#define T3 43790 |
396 |
#define C4 23170 |
397 |
|
398 |
static const short _T1[] ATTR_ALIGN(8) = {T1,T1,T1,T1}; |
399 |
static const short _T2[] ATTR_ALIGN(8) = {T2,T2,T2,T2}; |
400 |
static const short _T3[] ATTR_ALIGN(8) = {T3,T3,T3,T3}; |
401 |
static const short _C4[] ATTR_ALIGN(8) = {C4,C4,C4,C4}; |
402 |
|
403 |
/* column code adapted from peter gubanov */
|
404 |
/* http://www.elecard.com/peter/idct.shtml */
|
405 |
|
406 |
movq_m2r (*_T1, mm0); // mm0 = T1
|
407 |
|
408 |
movq_m2r (*(col+offset+1*8), mm1); // mm1 = x1 |
409 |
movq_r2r (mm0, mm2); // mm2 = T1
|
410 |
|
411 |
movq_m2r (*(col+offset+7*8), mm4); // mm4 = x7 |
412 |
pmulhw_r2r (mm1, mm0); // mm0 = T1*x1
|
413 |
|
414 |
movq_m2r (*_T3, mm5); // mm5 = T3
|
415 |
pmulhw_r2r (mm4, mm2); // mm2 = T1*x7
|
416 |
|
417 |
movq_m2r (*(col+offset+5*8), mm6); // mm6 = x5 |
418 |
movq_r2r (mm5, mm7); // mm7 = T3-1
|
419 |
|
420 |
movq_m2r (*(col+offset+3*8), mm3); // mm3 = x3 |
421 |
psubsw_r2r (mm4, mm0); // mm0 = v17
|
422 |
|
423 |
movq_m2r (*_T2, mm4); // mm4 = T2
|
424 |
pmulhw_r2r (mm3, mm5); // mm5 = (T3-1)*x3
|
425 |
|
426 |
paddsw_r2r (mm2, mm1); // mm1 = u17
|
427 |
pmulhw_r2r (mm6, mm7); // mm7 = (T3-1)*x5
|
428 |
|
429 |
/* slot */
|
430 |
|
431 |
movq_r2r (mm4, mm2); // mm2 = T2
|
432 |
paddsw_r2r (mm3, mm5); // mm5 = T3*x3
|
433 |
|
434 |
pmulhw_m2r (*(col+offset+2*8), mm4);// mm4 = T2*x2 |
435 |
paddsw_r2r (mm6, mm7); // mm7 = T3*x5
|
436 |
|
437 |
psubsw_r2r (mm6, mm5); // mm5 = v35
|
438 |
paddsw_r2r (mm3, mm7); // mm7 = u35
|
439 |
|
440 |
movq_m2r (*(col+offset+6*8), mm3); // mm3 = x6 |
441 |
movq_r2r (mm0, mm6); // mm6 = v17
|
442 |
|
443 |
pmulhw_r2r (mm3, mm2); // mm2 = T2*x6
|
444 |
psubsw_r2r (mm5, mm0); // mm0 = b3
|
445 |
|
446 |
psubsw_r2r (mm3, mm4); // mm4 = v26
|
447 |
paddsw_r2r (mm6, mm5); // mm5 = v12
|
448 |
|
449 |
movq_r2m (mm0, *(col+offset+3*8)); // save b3 in scratch0 |
450 |
movq_r2r (mm1, mm6); // mm6 = u17
|
451 |
|
452 |
paddsw_m2r (*(col+offset+2*8), mm2);// mm2 = u26 |
453 |
paddsw_r2r (mm7, mm6); // mm6 = b0
|
454 |
|
455 |
psubsw_r2r (mm7, mm1); // mm1 = u12
|
456 |
movq_r2r (mm1, mm7); // mm7 = u12
|
457 |
|
458 |
movq_m2r (*(col+offset+0*8), mm3); // mm3 = x0 |
459 |
paddsw_r2r (mm5, mm1); // mm1 = u12+v12
|
460 |
|
461 |
movq_m2r (*_C4, mm0); // mm0 = C4/2
|
462 |
psubsw_r2r (mm5, mm7); // mm7 = u12-v12
|
463 |
|
464 |
movq_r2m (mm6, *(col+offset+5*8)); // save b0 in scratch1 |
465 |
pmulhw_r2r (mm0, mm1); // mm1 = b1/2
|
466 |
|
467 |
movq_r2r (mm4, mm6); // mm6 = v26
|
468 |
pmulhw_r2r (mm0, mm7); // mm7 = b2/2
|
469 |
|
470 |
movq_m2r (*(col+offset+4*8), mm5); // mm5 = x4 |
471 |
movq_r2r (mm3, mm0); // mm0 = x0
|
472 |
|
473 |
psubsw_r2r (mm5, mm3); // mm3 = v04
|
474 |
paddsw_r2r (mm5, mm0); // mm0 = u04
|
475 |
|
476 |
paddsw_r2r (mm3, mm4); // mm4 = a1
|
477 |
movq_r2r (mm0, mm5); // mm5 = u04
|
478 |
|
479 |
psubsw_r2r (mm6, mm3); // mm3 = a2
|
480 |
paddsw_r2r (mm2, mm5); // mm5 = a0
|
481 |
|
482 |
paddsw_r2r (mm1, mm1); // mm1 = b1
|
483 |
psubsw_r2r (mm2, mm0); // mm0 = a3
|
484 |
|
485 |
paddsw_r2r (mm7, mm7); // mm7 = b2
|
486 |
movq_r2r (mm3, mm2); // mm2 = a2
|
487 |
|
488 |
movq_r2r (mm4, mm6); // mm6 = a1
|
489 |
paddsw_r2r (mm7, mm3); // mm3 = a2+b2
|
490 |
|
491 |
psraw_i2r (COL_SHIFT, mm3); // mm3 = y2
|
492 |
paddsw_r2r (mm1, mm4); // mm4 = a1+b1
|
493 |
|
494 |
psraw_i2r (COL_SHIFT, mm4); // mm4 = y1
|
495 |
psubsw_r2r (mm1, mm6); // mm6 = a1-b1
|
496 |
|
497 |
movq_m2r (*(col+offset+5*8), mm1); // mm1 = b0 |
498 |
psubsw_r2r (mm7, mm2); // mm2 = a2-b2
|
499 |
|
500 |
psraw_i2r (COL_SHIFT, mm6); // mm6 = y6
|
501 |
movq_r2r (mm5, mm7); // mm7 = a0
|
502 |
|
503 |
movq_r2m (mm4, *(col+offset+1*8)); // save y1 |
504 |
psraw_i2r (COL_SHIFT, mm2); // mm2 = y5
|
505 |
|
506 |
movq_r2m (mm3, *(col+offset+2*8)); // save y2 |
507 |
paddsw_r2r (mm1, mm5); // mm5 = a0+b0
|
508 |
|
509 |
movq_m2r (*(col+offset+3*8), mm4); // mm4 = b3 |
510 |
psubsw_r2r (mm1, mm7); // mm7 = a0-b0
|
511 |
|
512 |
psraw_i2r (COL_SHIFT, mm5); // mm5 = y0
|
513 |
movq_r2r (mm0, mm3); // mm3 = a3
|
514 |
|
515 |
movq_r2m (mm2, *(col+offset+5*8)); // save y5 |
516 |
psubsw_r2r (mm4, mm3); // mm3 = a3-b3
|
517 |
|
518 |
psraw_i2r (COL_SHIFT, mm7); // mm7 = y7
|
519 |
paddsw_r2r (mm0, mm4); // mm4 = a3+b3
|
520 |
|
521 |
movq_r2m (mm5, *(col+offset+0*8)); // save y0 |
522 |
psraw_i2r (COL_SHIFT, mm3); // mm3 = y4
|
523 |
|
524 |
movq_r2m (mm6, *(col+offset+6*8)); // save y6 |
525 |
psraw_i2r (COL_SHIFT, mm4); // mm4 = y3
|
526 |
|
527 |
movq_r2m (mm7, *(col+offset+7*8)); // save y7 |
528 |
|
529 |
movq_r2m (mm3, *(col+offset+4*8)); // save y4 |
530 |
|
531 |
movq_r2m (mm4, *(col+offset+3*8)); // save y3 |
532 |
|
533 |
#undef T1
|
534 |
#undef T2
|
535 |
#undef T3
|
536 |
#undef C4
|
537 |
} |
538 |
|
539 |
/*
 * Per-row rounding terms for the row pass; rounderN is the term passed
 * to the row routine when row N of the block is transformed.  Each array
 * holds the same value twice, one copy per 32-bit half of an MMX
 * register.  The trailing comments give the closed form of each bias.
 * rounder0 additionally carries (1 << (COL_SHIFT - 1)) - 0.5, presumably
 * so that the column pass's final >> COL_SHIFT rounds rather than
 * truncates.
 */
static const int32_t rounder0[] ATTR_ALIGN(8) =
    rounder ((1 << (COL_SHIFT - 1)) - 0.5);
static const int32_t rounder4[] ATTR_ALIGN(8) = rounder (0);
static const int32_t rounder1[] ATTR_ALIGN(8) =
    rounder (1.25683487303);        /* C1*(C1/C4+C1+C7)/2 */
static const int32_t rounder7[] ATTR_ALIGN(8) =
    rounder (-0.25);                /* C1*(C7/C4+C7-C1)/2 */
static const int32_t rounder2[] ATTR_ALIGN(8) =
    rounder (0.60355339059);        /* C2 * (C6+C2)/2 */
static const int32_t rounder6[] ATTR_ALIGN(8) =
    rounder (-0.25);                /* C2 * (C6-C2)/2 */
static const int32_t rounder3[] ATTR_ALIGN(8) =
    rounder (0.087788325588);       /* C3*(-C3/C4+C3+C5)/2 */
static const int32_t rounder5[] ATTR_ALIGN(8) =
    rounder (-0.441341716183);      /* C3*(-C5/C4+C5-C3)/2 */

/* The shift amounts are not needed past this point. */
#undef COL_SHIFT
#undef ROW_SHIFT
|
557 |
|
558 |
/*
 * Defines a complete 8x8 IDCT entry point `void idct (int16_t * block)`.
 *
 *   idct          - name of the function to define
 *   table         - macro that builds a row coefficient table from seven
 *                   constants (mmxext_table or mmx_table)
 *   idct_row_head - row prologue, called once for the first row
 *   idct_row      - row core, called once per row with that row's rounder
 *   idct_row_tail - row epilogue, called once for the last row
 *   idct_row_mid  - stores the finished row and loads the next one
 *
 * Rows are processed in the order 0, 4, 1, 7, 2, 6, 3, 5 so that the two
 * rows sharing a table (04, 17, 26, 35) are handled back to back.  The
 * column pass then runs over columns 0-3 and 4-7.
 */
#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \
void idct (int16_t * block)                                             \
{                                                                       \
    static const int16_t table04[] ATTR_ALIGN(16) =                     \
        table (22725, 21407, 19266, 16384, 12873,  8867,  4520);        \
    static const int16_t table17[] ATTR_ALIGN(16) =                     \
        table (31521, 29692, 26722, 22725, 17855, 12299,  6270);        \
    static const int16_t table26[] ATTR_ALIGN(16) =                     \
        table (29692, 27969, 25172, 21407, 16819, 11585,  5906);        \
    static const int16_t table35[] ATTR_ALIGN(16) =                     \
        table (26722, 25172, 22654, 19266, 15137, 10426,  5315);        \
                                                                        \
    idct_row_head (block, 0*8, table04);                                \
    idct_row (table04, rounder0);                                       \
    idct_row_mid (block, 0*8, 4*8, table04);                            \
    idct_row (table04, rounder4);                                       \
    idct_row_mid (block, 4*8, 1*8, table17);                            \
    idct_row (table17, rounder1);                                       \
    idct_row_mid (block, 1*8, 7*8, table17);                            \
    idct_row (table17, rounder7);                                       \
    idct_row_mid (block, 7*8, 2*8, table26);                            \
    idct_row (table26, rounder2);                                       \
    idct_row_mid (block, 2*8, 6*8, table26);                            \
    idct_row (table26, rounder6);                                       \
    idct_row_mid (block, 6*8, 3*8, table35);                            \
    idct_row (table35, rounder3);                                       \
    idct_row_mid (block, 3*8, 5*8, table35);                            \
    idct_row (table35, rounder5);                                       \
    idct_row_tail (block, 5*8);                                         \
                                                                        \
    idct_col (block, 0);                                                \
    idct_col (block, 4);                                                \
}
591 |
|
592 |
void ff_mmx_idct(DCTELEM *block);
|
593 |
void ff_mmxext_idct(DCTELEM *block);
|
594 |
|
595 |
declare_idct (ff_mmxext_idct, mmxext_table, |
596 |
mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid) |
597 |
|
598 |
declare_idct (ff_mmx_idct, mmx_table, |
599 |
mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid) |
600 |
|