ffmpeg / libavcodec / i386 / idct_mmx.c @ 098b4169
History  View  Annotate  Download (22.4 KB)
1 
/*


2 
* idct_mmx.c

3 
* Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>

4 
*

5 
* This file is part of mpeg2dec, a free MPEG2 video stream decoder.

6 
* See http://libmpeg2.sourceforge.net/ for updates.

7 
*

8 
* mpeg2dec is free software; you can redistribute it and/or modify

9 
* it under the terms of the GNU General Public License as published by

10 
* the Free Software Foundation; either version 2 of the License, or

11 
* (at your option) any later version.

12 
*

13 
* mpeg2dec is distributed in the hope that it will be useful,

14 
* but WITHOUT ANY WARRANTY; without even the implied warranty of

15 
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

16 
* GNU General Public License for more details.

17 
*

18 
* You should have received a copy of the GNU General Public License

19 
* along with mpeg2dec; if not, write to the Free Software

20 
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

21 
*/

22  
23 
#include "libavutil/common.h" 
24 
#include "libavcodec/dsputil.h" 
25  
26 
#include "mmx.h" 
27  
28 
/* Request `align`-byte alignment for a static object (GCC attribute syntax). */
#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))

/* Right shifts applied after the row pass and the column pass respectively. */
#define ROW_SHIFT 11
#define COL_SHIFT 6

/*
 * round(bias):   (bias + 0.5) scaled by 2^ROW_SHIFT and truncated to int.
 * rounder(bias): two-element initializer holding that value twice, so one
 *                64-bit load supplies the same rounder to both 32-bit lanes.
 *
 * NOTE: `round` shares its name with the C99 <math.h> function; this file
 * must not rely on the library function while the macro is defined.
 */
#define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))
#define rounder(bias) {round (bias), round (bias)}

35  
36  
37 
#if 0
/* C row IDCT - it is just here to document the MMXEXT and MMX versions */
/*
 * row     : coefficient block; the 8 values at row[offset..offset+7] are
 *           transformed in place.
 * table   : coefficient table; entries 1..7 are read as C1..C7.
 * rounder : pointer to the bias added to every even-part sum before the
 *           final shift.
 */
static inline void idct_row (int16_t * row, int offset,
                             int16_t * table, int32_t * rounder)
{
    int C1, C2, C3, C4, C5, C6, C7;
    int a0, a1, a2, a3, b0, b1, b2, b3;

    row += offset;

    C1 = table[1];
    C2 = table[2];
    C3 = table[3];
    C4 = table[4];
    C5 = table[5];
    C6 = table[6];
    C7 = table[7];

    /* even part (inputs 0, 2, 4, 6) plus rounding bias */
    a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder;
    a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder;
    a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder;
    a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder;

    /* odd part (inputs 1, 3, 5, 7) */
    b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
    b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
    b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
    b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];

    /* butterfly: outputs k and 7-k share the same a/b pair */
    row[0] = (a0 + b0) >> ROW_SHIFT;
    row[1] = (a1 + b1) >> ROW_SHIFT;
    row[2] = (a2 + b2) >> ROW_SHIFT;
    row[3] = (a3 + b3) >> ROW_SHIFT;
    row[4] = (a3 - b3) >> ROW_SHIFT;
    row[5] = (a2 - b2) >> ROW_SHIFT;
    row[6] = (a1 - b1) >> ROW_SHIFT;
    row[7] = (a0 - b0) >> ROW_SHIFT;
}
#endif

75  
76  
77 
/* MMXEXT row IDCT */

78  
79 
/*
 * Coefficient layout consumed by the MMXEXT row pass: 8 groups of four
 * 16-bit words, each group being one pmaddwd operand.  The signs encode the
 * +/- pattern of the a0..a3 / b0..b3 sums of the reference row IDCT, so the
 * multiply-add yields the signed partial sums directly.
 */
#define mmxext_table(c1,c2,c3,c4,c5,c6,c7) {  (c4),  (c2), -(c4), -(c2), \
                                              (c4),  (c6),  (c4),  (c6), \
                                              (c1),  (c3), -(c1), -(c5), \
                                              (c5),  (c7),  (c3), -(c7), \
                                              (c4), -(c6),  (c4), -(c6), \
                                             -(c4),  (c2),  (c4), -(c2), \
                                              (c5), -(c1),  (c3), -(c1), \
                                              (c7),  (c3),  (c7), -(c5) }
87  
88 
static inline void mmxext_row_head (int16_t * const row, const int offset, 
89 
const int16_t * const table) 
90 
{ 
91 
movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */

92  
93 
movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ 
94 
movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */

95  
96 
movq_m2r (*table, mm3); /* mm3 = C2 C4 C2 C4 */

97 
movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */

98  
99 
movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ 
100 
pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x4C2*x6 C4*x0+C2*x2 */

101  
102 
pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ 
103 
} 
104  
105 
static inline void mmxext_row (const int16_t * const table, 
106 
const int32_t * const rounder) 
107 
{ 
108 
movq_m2r (*(table+8), mm1); /* mm1 = C5 C1 C3 C1 */ 
109 
pmaddwd_r2r (mm2, mm4); /* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */

110  
111 
pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x4C6*x6 C4*x0C6*x2 */ 
112 
pshufw_r2r (mm6, mm6, 0x4e); /* mm6 = x3 x1 x7 x5 */ 
113  
114 
movq_m2r (*(table+12), mm7); /* mm7 = C7 C3 C7 C5 */ 
115 
pmaddwd_r2r (mm5, mm1); /* mm1 = C1*x5C5*x7 C1*x1+C3*x3 */

116  
117 
paddd_m2r (*rounder, mm3); /* mm3 += rounder */

118 
pmaddwd_r2r (mm6, mm7); /* mm7 = C3*x1C7*x3 C5*x5+C7*x7 */

119  
120 
pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x0C2*x2 C4*x4+C2*x6 */ 
121 
paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */

122  
123 
pmaddwd_m2r (*(table+24), mm5); /* mm5 = C3*x5C1*x7 C5*x1C1*x3 */ 
124 
movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */

125  
126 
pmaddwd_m2r (*(table+28), mm6); /* mm6 = C7*x1C5*x3 C7*x5+C3*x7 */ 
127 
paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */

128  
129 
paddd_m2r (*rounder, mm0); /* mm0 += rounder */

130 
psubd_r2r (mm1, mm3); /* mm3 = a1b1 a0b0 + rounder */

131  
132 
psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */

133 
paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */

134  
135 
paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */

136 
psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */

137  
138 
paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */

139 
movq_r2r (mm0, mm4); /* mm4 = a3 a2 + rounder */

140  
141 
paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */

142 
psubd_r2r (mm5, mm4); /* mm4 = a3b3 a2b2 + rounder */

143 
} 
144  
145 
static inline void mmxext_row_tail (int16_t * const row, const int store) 
146 
{ 
147 
psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */

148  
149 
psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */

150  
151 
packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */

152  
153 
packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */

154  
155 
movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */

156 
pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ 
157  
158 
/* slot */

159  
160 
movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ 
161 
} 
162  
163 
static inline void mmxext_row_mid (int16_t * const row, const int store, 
164 
const int offset, 
165 
const int16_t * const table) 
166 
{ 
167 
movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */

168 
psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */

169  
170 
movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ 
171 
psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */

172  
173 
packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */

174 
movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */

175  
176 
packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */

177 
movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */

178  
179 
movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */

180 
pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ 
181  
182 
movq_m2r (*table, mm3); /* mm3 = C2 C4 C2 C4 */

183 
movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ 
184  
185 
pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x4C2*x6 C4*x0+C2*x2 */

186  
187 
movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ 
188 
pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ 
189 
} 
190  
191  
192 
/* MMX row IDCT */

193  
194 
/*
 * Coefficient layout consumed by the plain-MMX row pass: 8 groups of four
 * 16-bit words, each group being one pmaddwd operand.  The ordering differs
 * from mmxext_table() because this variant duplicates input pairs with
 * punpck{l,h}dq instead of swapping halves with pshufw.  The signs encode
 * the +/- pattern of the reference row IDCT sums.
 */
#define mmx_table(c1,c2,c3,c4,c5,c6,c7) {  (c4),  (c2),  (c4),  (c6), \
                                           (c4),  (c6), -(c4), -(c2), \
                                           (c1),  (c3),  (c3), -(c7), \
                                           (c5),  (c7), -(c1), -(c5), \
                                           (c4), -(c6),  (c4), -(c2), \
                                          -(c4),  (c2),  (c4), -(c6), \
                                           (c5), -(c1),  (c7), -(c5), \
                                           (c7),  (c3),  (c3), -(c1) }
202  
203 
static inline void mmx_row_head (int16_t * const row, const int offset, 
204 
const int16_t * const table) 
205 
{ 
206 
movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */

207  
208 
movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ 
209 
movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */

210  
211 
movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */

212 
movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */

213  
214 
punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */

215  
216 
movq_m2r (*(table+4), mm4); /* mm4 = C2 C4 C6 C4 */ 
217 
pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */

218  
219 
movq_m2r (*(table+8), mm1); /* mm1 = C7 C3 C3 C1 */ 
220 
punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */

221 
} 
222  
223 
static inline void mmx_row (const int16_t * const table, 
224 
const int32_t * const rounder) 
225 
{ 
226 
pmaddwd_r2r (mm2, mm4); /* mm4 = C4*x4C2*x6 C4*x4+C6*x6 */

227 
punpckldq_r2r (mm5, mm5); /* mm5 = x3 x1 x3 x1 */

228  
229 
pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x0C2*x2 C4*x0C6*x2 */ 
230 
punpckhdq_r2r (mm6, mm6); /* mm6 = x7 x5 x7 x5 */

231  
232 
movq_m2r (*(table+12), mm7); /* mm7 = C5 C1 C7 C5 */ 
233 
pmaddwd_r2r (mm5, mm1); /* mm1 = C3*x1C7*x3 C1*x1+C3*x3 */

234  
235 
paddd_m2r (*rounder, mm3); /* mm3 += rounder */

236 
pmaddwd_r2r (mm6, mm7); /* mm7 = C1*x5C5*x7 C5*x5+C7*x7 */

237  
238 
pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x4C6*x6 C4*x4+C2*x6 */ 
239 
paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */

240  
241 
pmaddwd_m2r (*(table+24), mm5); /* mm5 = C7*x1C5*x3 C5*x1C1*x3 */ 
242 
movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */

243  
244 
pmaddwd_m2r (*(table+28), mm6); /* mm6 = C3*x5C1*x7 C7*x5+C3*x7 */ 
245 
paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */

246  
247 
paddd_m2r (*rounder, mm0); /* mm0 += rounder */

248 
psubd_r2r (mm1, mm3); /* mm3 = a1b1 a0b0 + rounder */

249  
250 
psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */

251 
paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */

252  
253 
paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */

254 
psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */

255  
256 
paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */

257 
movq_r2r (mm0, mm7); /* mm7 = a3 a2 + rounder */

258  
259 
paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */

260 
psubd_r2r (mm5, mm7); /* mm7 = a3b3 a2b2 + rounder */

261 
} 
262  
263 
static inline void mmx_row_tail (int16_t * const row, const int store) 
264 
{ 
265 
psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */

266  
267 
psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */

268  
269 
packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */

270  
271 
packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */

272  
273 
movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */

274 
movq_r2r (mm7, mm4); /* mm4 = y6 y7 y4 y5 */

275  
276 
pslld_i2r (16, mm7); /* mm7 = y7 0 y5 0 */ 
277  
278 
psrld_i2r (16, mm4); /* mm4 = 0 y6 0 y4 */ 
279  
280 
por_r2r (mm4, mm7); /* mm7 = y7 y6 y5 y4 */

281  
282 
/* slot */

283  
284 
movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ 
285 
} 
286  
287 
static inline void mmx_row_mid (int16_t * const row, const int store, 
288 
const int offset, const int16_t * const table) 
289 
{ 
290 
movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */

291 
psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */

292  
293 
movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ 
294 
psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */

295  
296 
packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */

297 
movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */

298  
299 
packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */

300 
movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */

301  
302 
movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */

303 
movq_r2r (mm7, mm1); /* mm1 = y6 y7 y4 y5 */

304  
305 
punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */

306 
psrld_i2r (16, mm7); /* mm7 = 0 y6 0 y4 */ 
307  
308 
movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */

309 
pslld_i2r (16, mm1); /* mm1 = y7 0 y5 0 */ 
310  
311 
movq_m2r (*(table+4), mm4); /* mm4 = C2 C4 C6 C4 */ 
312 
por_r2r (mm1, mm7); /* mm7 = y7 y6 y5 y4 */

313  
314 
movq_m2r (*(table+8), mm1); /* mm1 = C7 C3 C3 C1 */ 
315 
punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */

316  
317 
movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ 
318 
pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */

319 
} 
320  
321  
322 
#if 0
/* C column IDCT - it is just here to document the MMXEXT and MMX versions */
/*
 * col    : coefficient block; the column starting at col[offset] with a
 *          stride of 8 elements is transformed in place.
 * offset : index of the column's first element.
 *
 * T1, T2, T3 and C4 are the constants defined inside the MMX idct_col().
 */
static inline void idct_col (int16_t * col, int offset)
{
/* multiplication - as implemented on mmx */
#define F(c,x) (((c) * (x)) >> 16)

/* saturation - it helps us handle torture test cases */
#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x))

    int16_t x0, x1, x2, x3, x4, x5, x6, x7;
    int16_t y0, y1, y2, y3, y4, y5, y6, y7;
    int16_t a0, a1, a2, a3, b0, b1, b2, b3;
    int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12;

    col += offset;

    x0 = col[0*8];
    x1 = col[1*8];
    x2 = col[2*8];
    x3 = col[3*8];
    x4 = col[4*8];
    x5 = col[5*8];
    x6 = col[6*8];
    x7 = col[7*8];

    /* even part */
    u04 = S (x0 + x4);
    v04 = S (x0 - x4);
    u26 = S (F (T2, x6) + x2);
    v26 = S (F (T2, x2) - x6);

    a0 = S (u04 + u26);
    a1 = S (v04 + v26);
    a2 = S (v04 - v26);
    a3 = S (u04 - u26);

    /* odd part */
    u17 = S (F (T1, x7) + x1);
    v17 = S (F (T1, x1) - x7);
    u35 = S (F (T3, x5) + x3);
    v35 = S (F (T3, x3) - x5);

    b0 = S (u17 + u35);
    b3 = S (v17 - v35);
    u12 = S (u17 - u35);
    v12 = S (v17 + v35);
    u12 = S (2 * F (C4, u12));
    v12 = S (2 * F (C4, v12));
    b1 = S (u12 + v12);
    b2 = S (u12 - v12);

    y0 = S (a0 + b0) >> COL_SHIFT;
    y1 = S (a1 + b1) >> COL_SHIFT;
    y2 = S (a2 + b2) >> COL_SHIFT;
    y3 = S (a3 + b3) >> COL_SHIFT;

    y4 = S (a3 - b3) >> COL_SHIFT;
    y5 = S (a2 - b2) >> COL_SHIFT;
    y6 = S (a1 - b1) >> COL_SHIFT;
    y7 = S (a0 - b0) >> COL_SHIFT;

    col[0*8] = y0;
    col[1*8] = y1;
    col[2*8] = y2;
    col[3*8] = y3;
    col[4*8] = y4;
    col[5*8] = y5;
    col[6*8] = y6;
    col[7*8] = y7;
}
#endif

392  
393  
394 
/* MMX column IDCT */

395 
static inline void idct_col (int16_t * const col, const int offset) 
396 
{ 
397 
#define T1 13036 
398 
#define T2 27146 
399 
#define T3 43790 
400 
#define C4 23170 
401  
402 
static const short t1_vector[] ATTR_ALIGN(8) = {T1,T1,T1,T1}; 
403 
static const short t2_vector[] ATTR_ALIGN(8) = {T2,T2,T2,T2}; 
404 
static const short t3_vector[] ATTR_ALIGN(8) = {T3,T3,T3,T3}; 
405 
static const short c4_vector[] ATTR_ALIGN(8) = {C4,C4,C4,C4}; 
406  
407 
/* column code adapted from Peter Gubanov */

408 
/* http://www.elecard.com/peter/idct.shtml */

409  
410 
movq_m2r (*t1_vector, mm0); /* mm0 = T1 */

411  
412 
movq_m2r (*(col+offset+1*8), mm1); /* mm1 = x1 */ 
413 
movq_r2r (mm0, mm2); /* mm2 = T1 */

414  
415 
movq_m2r (*(col+offset+7*8), mm4); /* mm4 = x7 */ 
416 
pmulhw_r2r (mm1, mm0); /* mm0 = T1*x1 */

417  
418 
movq_m2r (*t3_vector, mm5); /* mm5 = T3 */

419 
pmulhw_r2r (mm4, mm2); /* mm2 = T1*x7 */

420  
421 
movq_m2r (*(col+offset+5*8), mm6); /* mm6 = x5 */ 
422 
movq_r2r (mm5, mm7); /* mm7 = T31 */

423  
424 
movq_m2r (*(col+offset+3*8), mm3); /* mm3 = x3 */ 
425 
psubsw_r2r (mm4, mm0); /* mm0 = v17 */

426  
427 
movq_m2r (*t2_vector, mm4); /* mm4 = T2 */

428 
pmulhw_r2r (mm3, mm5); /* mm5 = (T31)*x3 */

429  
430 
paddsw_r2r (mm2, mm1); /* mm1 = u17 */

431 
pmulhw_r2r (mm6, mm7); /* mm7 = (T31)*x5 */

432  
433 
/* slot */

434  
435 
movq_r2r (mm4, mm2); /* mm2 = T2 */

436 
paddsw_r2r (mm3, mm5); /* mm5 = T3*x3 */

437  
438 
pmulhw_m2r (*(col+offset+2*8), mm4);/* mm4 = T2*x2 */ 
439 
paddsw_r2r (mm6, mm7); /* mm7 = T3*x5 */

440  
441 
psubsw_r2r (mm6, mm5); /* mm5 = v35 */

442 
paddsw_r2r (mm3, mm7); /* mm7 = u35 */

443  
444 
movq_m2r (*(col+offset+6*8), mm3); /* mm3 = x6 */ 
445 
movq_r2r (mm0, mm6); /* mm6 = v17 */

446  
447 
pmulhw_r2r (mm3, mm2); /* mm2 = T2*x6 */

448 
psubsw_r2r (mm5, mm0); /* mm0 = b3 */

449  
450 
psubsw_r2r (mm3, mm4); /* mm4 = v26 */

451 
paddsw_r2r (mm6, mm5); /* mm5 = v12 */

452  
453 
movq_r2m (mm0, *(col+offset+3*8)); /* save b3 in scratch0 */ 
454 
movq_r2r (mm1, mm6); /* mm6 = u17 */

455  
456 
paddsw_m2r (*(col+offset+2*8), mm2);/* mm2 = u26 */ 
457 
paddsw_r2r (mm7, mm6); /* mm6 = b0 */

458  
459 
psubsw_r2r (mm7, mm1); /* mm1 = u12 */

460 
movq_r2r (mm1, mm7); /* mm7 = u12 */

461  
462 
movq_m2r (*(col+offset+0*8), mm3); /* mm3 = x0 */ 
463 
paddsw_r2r (mm5, mm1); /* mm1 = u12+v12 */

464  
465 
movq_m2r (*c4_vector, mm0); /* mm0 = C4/2 */

466 
psubsw_r2r (mm5, mm7); /* mm7 = u12v12 */

467  
468 
movq_r2m (mm6, *(col+offset+5*8)); /* save b0 in scratch1 */ 
469 
pmulhw_r2r (mm0, mm1); /* mm1 = b1/2 */

470  
471 
movq_r2r (mm4, mm6); /* mm6 = v26 */

472 
pmulhw_r2r (mm0, mm7); /* mm7 = b2/2 */

473  
474 
movq_m2r (*(col+offset+4*8), mm5); /* mm5 = x4 */ 
475 
movq_r2r (mm3, mm0); /* mm0 = x0 */

476  
477 
psubsw_r2r (mm5, mm3); /* mm3 = v04 */

478 
paddsw_r2r (mm5, mm0); /* mm0 = u04 */

479  
480 
paddsw_r2r (mm3, mm4); /* mm4 = a1 */

481 
movq_r2r (mm0, mm5); /* mm5 = u04 */

482  
483 
psubsw_r2r (mm6, mm3); /* mm3 = a2 */

484 
paddsw_r2r (mm2, mm5); /* mm5 = a0 */

485  
486 
paddsw_r2r (mm1, mm1); /* mm1 = b1 */

487 
psubsw_r2r (mm2, mm0); /* mm0 = a3 */

488  
489 
paddsw_r2r (mm7, mm7); /* mm7 = b2 */

490 
movq_r2r (mm3, mm2); /* mm2 = a2 */

491  
492 
movq_r2r (mm4, mm6); /* mm6 = a1 */

493 
paddsw_r2r (mm7, mm3); /* mm3 = a2+b2 */

494  
495 
psraw_i2r (COL_SHIFT, mm3); /* mm3 = y2 */

496 
paddsw_r2r (mm1, mm4); /* mm4 = a1+b1 */

497  
498 
psraw_i2r (COL_SHIFT, mm4); /* mm4 = y1 */

499 
psubsw_r2r (mm1, mm6); /* mm6 = a1b1 */

500  
501 
movq_m2r (*(col+offset+5*8), mm1); /* mm1 = b0 */ 
502 
psubsw_r2r (mm7, mm2); /* mm2 = a2b2 */

503  
504 
psraw_i2r (COL_SHIFT, mm6); /* mm6 = y6 */

505 
movq_r2r (mm5, mm7); /* mm7 = a0 */

506  
507 
movq_r2m (mm4, *(col+offset+1*8)); /* save y1 */ 
508 
psraw_i2r (COL_SHIFT, mm2); /* mm2 = y5 */

509  
510 
movq_r2m (mm3, *(col+offset+2*8)); /* save y2 */ 
511 
paddsw_r2r (mm1, mm5); /* mm5 = a0+b0 */

512  
513 
movq_m2r (*(col+offset+3*8), mm4); /* mm4 = b3 */ 
514 
psubsw_r2r (mm1, mm7); /* mm7 = a0b0 */

515  
516 
psraw_i2r (COL_SHIFT, mm5); /* mm5 = y0 */

517 
movq_r2r (mm0, mm3); /* mm3 = a3 */

518  
519 
movq_r2m (mm2, *(col+offset+5*8)); /* save y5 */ 
520 
psubsw_r2r (mm4, mm3); /* mm3 = a3b3 */

521  
522 
psraw_i2r (COL_SHIFT, mm7); /* mm7 = y7 */

523 
paddsw_r2r (mm0, mm4); /* mm4 = a3+b3 */

524  
525 
movq_r2m (mm5, *(col+offset+0*8)); /* save y0 */ 
526 
psraw_i2r (COL_SHIFT, mm3); /* mm3 = y4 */

527  
528 
movq_r2m (mm6, *(col+offset+6*8)); /* save y6 */ 
529 
psraw_i2r (COL_SHIFT, mm4); /* mm4 = y3 */

530  
531 
movq_r2m (mm7, *(col+offset+7*8)); /* save y7 */ 
532  
533 
movq_r2m (mm3, *(col+offset+4*8)); /* save y4 */ 
534  
535 
movq_r2m (mm4, *(col+offset+3*8)); /* save y3 */ 
536  
537 
#undef T1

538 
#undef T2

539 
#undef T3

540 
#undef C4

541 
} 
542  
543  
544 
/*
 * Per-row rounding constants for the row pass; rounderN is passed to the
 * row function when input row N is processed.  Each is a two-element vector
 * built by rounder() (value duplicated for both 32-bit lanes).
 * The trailing comments give the closed form of each bias, with
 * Ck = cos(k*pi/16).
 */
static const int32_t rounder0[] ATTR_ALIGN(8) =
    rounder ((1 << (COL_SHIFT - 1)) - 0.5);
static const int32_t rounder4[] ATTR_ALIGN(8) = rounder (0);
static const int32_t rounder1[] ATTR_ALIGN(8) =
    rounder (1.25683487303);        /* C1*(C1/C4+C1+C7)/2 */
static const int32_t rounder7[] ATTR_ALIGN(8) =
    rounder (-0.25);                /* C1*(C7/C4+C7-C1)/2 */
static const int32_t rounder2[] ATTR_ALIGN(8) =
    rounder (0.60355339059);        /* C2 * (C6+C2)/2 */
static const int32_t rounder6[] ATTR_ALIGN(8) =
    rounder (-0.25);                /* C2 * (C6-C2)/2 */
static const int32_t rounder3[] ATTR_ALIGN(8) =
    rounder (0.087788325588);       /* C3*(-C3/C4+C3+C5)/2 */
static const int32_t rounder5[] ATTR_ALIGN(8) =
    rounder (-0.441341716183);      /* C3*(-C5/C4+C5-C3)/2 */

/* The shift amounts are no longer needed past this point. */
#undef COL_SHIFT
#undef ROW_SHIFT

562  
563 
/*
 * declare_idct: generate a complete 8x8 IDCT entry point.
 *
 *   idct          : name of the generated function, `void idct(int16_t *block)`.
 *   table         : table-layout macro (mmxext_table or mmx_table).
 *   idct_row_head : loads the first row and starts its multiplies.
 *   idct_row      : computes one row from the register state.
 *   idct_row_tail : stores the last row.
 *   idct_row_mid  : stores the previous row and loads the next one.
 *
 * Rows are processed in the order 0, 4, 1, 7, 2, 6, 3, 5 so that each pair
 * sharing a coefficient table (table04, table17, table26, table35) is handled
 * back to back; each row gets its own rounderN.  The column pass then runs
 * twice, at column offsets 0 and 4.
 */
#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \
void idct (int16_t * const block)                                       \
{                                                                       \
    static const int16_t table04[] ATTR_ALIGN(16) =                     \
        table (22725, 21407, 19266, 16384, 12873,  8867, 4520);         \
    static const int16_t table17[] ATTR_ALIGN(16) =                     \
        table (31521, 29692, 26722, 22725, 17855, 12299, 6270);         \
    static const int16_t table26[] ATTR_ALIGN(16) =                     \
        table (29692, 27969, 25172, 21407, 16819, 11585, 5906);         \
    static const int16_t table35[] ATTR_ALIGN(16) =                     \
        table (26722, 25172, 22654, 19266, 15137, 10426, 5315);         \
                                                                        \
    idct_row_head (block, 0*8, table04);                                \
    idct_row (table04, rounder0);                                       \
    idct_row_mid (block, 0*8, 4*8, table04);                            \
    idct_row (table04, rounder4);                                       \
    idct_row_mid (block, 4*8, 1*8, table17);                            \
    idct_row (table17, rounder1);                                       \
    idct_row_mid (block, 1*8, 7*8, table17);                            \
    idct_row (table17, rounder7);                                       \
    idct_row_mid (block, 7*8, 2*8, table26);                            \
    idct_row (table26, rounder2);                                       \
    idct_row_mid (block, 2*8, 6*8, table26);                            \
    idct_row (table26, rounder6);                                       \
    idct_row_mid (block, 6*8, 3*8, table35);                            \
    idct_row (table35, rounder3);                                       \
    idct_row_mid (block, 3*8, 5*8, table35);                            \
    idct_row (table35, rounder5);                                       \
    idct_row_tail (block, 5*8);                                         \
                                                                        \
    idct_col (block, 0);                                                \
    idct_col (block, 4);                                                \
}
596  
597 
void ff_mmx_idct(DCTELEM *block);

598 
void ff_mmxext_idct(DCTELEM *block);

599  
600 
declare_idct (ff_mmxext_idct, mmxext_table, 
601 
mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid) 
602  
603 
declare_idct (ff_mmx_idct, mmx_table, 
604 
mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid) 
605 