1 
/*


2 
* idct_mmx.c

3 
* Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>

4 
*

5 
* This file is part of mpeg2dec, a free MPEG2 video stream decoder.

6 
* See http://libmpeg2.sourceforge.net/ for updates.

7 
*

8 
* mpeg2dec is free software; you can redistribute it and/or modify

9 
* it under the terms of the GNU General Public License as published by

10 
* the Free Software Foundation; either version 2 of the License, or

11 
* (at your option) any later version.

12 
*

13 
* mpeg2dec is distributed in the hope that it will be useful,

14 
* but WITHOUT ANY WARRANTY; without even the implied warranty of

15 
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

16 
* GNU General Public License for more details.

17 
*

18 
* You should have received a copy of the GNU General Public License

19 
* along with mpeg2dec; if not, write to the Free Software

20 
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

21 
*/

22  
23 
#include "libavutil/common.h" 
24 
#include "libavcodec/dsputil.h" 
25  
26 
#include "dsputil_mmx.h" 
27 
#include "mmx.h" 
28  
29 
/* Right shift applied to the 32-bit results of the row pass. */
#define ROW_SHIFT 11
/* Right shift applied to the 16-bit results of the column pass. */
#define COL_SHIFT 6

/*
 * round(bias):   (bias + 0.5) scaled by 2^ROW_SHIFT and truncated to int,
 *                i.e. the rounding term added before the ROW_SHIFT shift.
 * rounder(bias): initializer holding that value twice, one copy per 32-bit
 *                lane of a 64-bit MMX operand.
 */
#define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))
#define rounder(bias) {round (bias), round (bias)}

34  
35  
36 
#if 0
/* C row IDCT - it is just here to document the MMXEXT and MMX versions */
/*
 * row     : coefficient block; the row processed starts at row[offset]
 * table   : cosine constants, read as C1..C7 from table[1]..table[7]
 * rounder : bias added to every even-part sum before the final shift
 *
 * The eight results are written back over row[0..7], each shifted right by
 * ROW_SHIFT.
 */
static inline void idct_row (int16_t * row, int offset,
                             int16_t * table, int32_t * rounder)
{
    int C1, C2, C3, C4, C5, C6, C7;
    int a0, a1, a2, a3, b0, b1, b2, b3;

    row += offset;

    C1 = table[1];
    C2 = table[2];
    C3 = table[3];
    C4 = table[4];
    C5 = table[5];
    C6 = table[6];
    C7 = table[7];

    /* even part */
    a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder;
    a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder;
    a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder;
    a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder;

    /* odd part */
    b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
    b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
    b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
    b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];

    /* butterfly: outputs k and 7-k share the same a/b pair */
    row[0] = (a0 + b0) >> ROW_SHIFT;
    row[1] = (a1 + b1) >> ROW_SHIFT;
    row[2] = (a2 + b2) >> ROW_SHIFT;
    row[3] = (a3 + b3) >> ROW_SHIFT;
    row[4] = (a3 - b3) >> ROW_SHIFT;
    row[5] = (a2 - b2) >> ROW_SHIFT;
    row[6] = (a1 - b1) >> ROW_SHIFT;
    row[7] = (a0 - b0) >> ROW_SHIFT;
}
#endif

74  
75  
76 
/* MMXEXT row IDCT */

77  
78 
/*
 * Coefficient table for the MMXEXT row pass: eight groups of four 16-bit
 * words, each group consumed by one pmaddwd.  Lowest word first:
 *   table+ 0: even inputs (x0 x2 x4 x6)           -> low a0 part, high a1 part
 *   table+ 4: swapped even inputs (x4 x6 x0 x2)   -> rest of a0 / a1
 *   table+ 8: odd inputs (x1 x3 x5 x7)            -> low b0 part, high b1 part
 *   table+12: swapped odd inputs (x5 x7 x1 x3)    -> rest of b0 / b1
 *   table+16, +20: same pairing for a2 / a3
 *   table+24, +28: same pairing for b2 / b3
 * The signs follow the a0..a3 / b0..b3 equations of the row IDCT.
 */
#define mmxext_table(c1,c2,c3,c4,c5,c6,c7)      {  (c4),  (c2), -(c4), -(c2),   \
                                                   (c4),  (c6),  (c4),  (c6),   \
                                                   (c1),  (c3), -(c1), -(c5),   \
                                                   (c5),  (c7),  (c3), -(c7),   \
                                                   (c4), -(c6),  (c4), -(c6),   \
                                                  -(c4),  (c2),  (c4), -(c2),   \
                                                   (c5), -(c1),  (c3), -(c1),   \
                                                   (c7),  (c3),  (c7), -(c5) }
86  
87 
static inline void mmxext_row_head (int16_t * const row, const int offset, 
88 
const int16_t * const table) 
89 
{ 
90 
movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */

91  
92 
movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ 
93 
movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */

94  
95 
movq_m2r (*table, mm3); /* mm3 = C2 C4 C2 C4 */

96 
movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */

97  
98 
movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ 
99 
pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x4C2*x6 C4*x0+C2*x2 */

100  
101 
pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ 
102 
} 
103  
104 
static inline void mmxext_row (const int16_t * const table, 
105 
const int32_t * const rounder) 
106 
{ 
107 
movq_m2r (*(table+8), mm1); /* mm1 = C5 C1 C3 C1 */ 
108 
pmaddwd_r2r (mm2, mm4); /* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */

109  
110 
pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x4C6*x6 C4*x0C6*x2 */ 
111 
pshufw_r2r (mm6, mm6, 0x4e); /* mm6 = x3 x1 x7 x5 */ 
112  
113 
movq_m2r (*(table+12), mm7); /* mm7 = C7 C3 C7 C5 */ 
114 
pmaddwd_r2r (mm5, mm1); /* mm1 = C1*x5C5*x7 C1*x1+C3*x3 */

115  
116 
paddd_m2r (*rounder, mm3); /* mm3 += rounder */

117 
pmaddwd_r2r (mm6, mm7); /* mm7 = C3*x1C7*x3 C5*x5+C7*x7 */

118  
119 
pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x0C2*x2 C4*x4+C2*x6 */ 
120 
paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */

121  
122 
pmaddwd_m2r (*(table+24), mm5); /* mm5 = C3*x5C1*x7 C5*x1C1*x3 */ 
123 
movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */

124  
125 
pmaddwd_m2r (*(table+28), mm6); /* mm6 = C7*x1C5*x3 C7*x5+C3*x7 */ 
126 
paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */

127  
128 
paddd_m2r (*rounder, mm0); /* mm0 += rounder */

129 
psubd_r2r (mm1, mm3); /* mm3 = a1b1 a0b0 + rounder */

130  
131 
psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */

132 
paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */

133  
134 
paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */

135 
psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */

136  
137 
paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */

138 
movq_r2r (mm0, mm4); /* mm4 = a3 a2 + rounder */

139  
140 
paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */

141 
psubd_r2r (mm5, mm4); /* mm4 = a3b3 a2b2 + rounder */

142 
} 
143  
144 
static inline void mmxext_row_tail (int16_t * const row, const int store) 
145 
{ 
146 
psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */

147  
148 
psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */

149  
150 
packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */

151  
152 
packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */

153  
154 
movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */

155 
pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ 
156  
157 
/* slot */

158  
159 
movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ 
160 
} 
161  
162 
static inline void mmxext_row_mid (int16_t * const row, const int store, 
163 
const int offset, 
164 
const int16_t * const table) 
165 
{ 
166 
movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */

167 
psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */

168  
169 
movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ 
170 
psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */

171  
172 
packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */

173 
movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */

174  
175 
packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */

176 
movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */

177  
178 
movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */

179 
pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ 
180  
181 
movq_m2r (*table, mm3); /* mm3 = C2 C4 C2 C4 */

182 
movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ 
183  
184 
pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x4C2*x6 C4*x0+C2*x2 */

185  
186 
movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ 
187 
pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ 
188 
} 
189  
190  
191 
/* MMX row IDCT */

192  
193 
/*
 * Coefficient table for the plain-MMX row pass: eight groups of four 16-bit
 * words, each group consumed by one pmaddwd.  The MMX code duplicates input
 * pairs (x0 x2 x0 x2, x4 x6 x4 x6, x1 x3 x1 x3, x5 x7 x5 x7), so each group
 * holds the two coefficient pairs that multiply the same input pair:
 *   table+ 0 / + 4: x0,x2 and x4,x6 terms of a0 (low) and a1 (high)
 *   table+ 8 / +12: x1,x3 and x5,x7 terms of b0 (low) and b1 (high)
 *   table+16 / +20: x0,x2 and x4,x6 terms of a2 (low) and a3 (high)
 *   table+24 / +28: x1,x3 and x5,x7 terms of b2 (low) and b3 (high)
 * The signs follow the a0..a3 / b0..b3 equations of the row IDCT.
 */
#define mmx_table(c1,c2,c3,c4,c5,c6,c7) {  (c4),  (c2),  (c4),  (c6),   \
                                           (c4),  (c6), -(c4), -(c2),   \
                                           (c1),  (c3),  (c3), -(c7),   \
                                           (c5),  (c7), -(c1), -(c5),   \
                                           (c4), -(c6),  (c4), -(c2),   \
                                          -(c4),  (c2),  (c4), -(c6),   \
                                           (c5), -(c1),  (c7), -(c5),   \
                                           (c7),  (c3),  (c3), -(c1) }
201  
202 
static inline void mmx_row_head (int16_t * const row, const int offset, 
203 
const int16_t * const table) 
204 
{ 
205 
movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */

206  
207 
movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ 
208 
movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */

209  
210 
movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */

211 
movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */

212  
213 
punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */

214  
215 
movq_m2r (*(table+4), mm4); /* mm4 = C2 C4 C6 C4 */ 
216 
pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */

217  
218 
movq_m2r (*(table+8), mm1); /* mm1 = C7 C3 C3 C1 */ 
219 
punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */

220 
} 
221  
222 
static inline void mmx_row (const int16_t * const table, 
223 
const int32_t * const rounder) 
224 
{ 
225 
pmaddwd_r2r (mm2, mm4); /* mm4 = C4*x4C2*x6 C4*x4+C6*x6 */

226 
punpckldq_r2r (mm5, mm5); /* mm5 = x3 x1 x3 x1 */

227  
228 
pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x0C2*x2 C4*x0C6*x2 */ 
229 
punpckhdq_r2r (mm6, mm6); /* mm6 = x7 x5 x7 x5 */

230  
231 
movq_m2r (*(table+12), mm7); /* mm7 = C5 C1 C7 C5 */ 
232 
pmaddwd_r2r (mm5, mm1); /* mm1 = C3*x1C7*x3 C1*x1+C3*x3 */

233  
234 
paddd_m2r (*rounder, mm3); /* mm3 += rounder */

235 
pmaddwd_r2r (mm6, mm7); /* mm7 = C1*x5C5*x7 C5*x5+C7*x7 */

236  
237 
pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x4C6*x6 C4*x4+C2*x6 */ 
238 
paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */

239  
240 
pmaddwd_m2r (*(table+24), mm5); /* mm5 = C7*x1C5*x3 C5*x1C1*x3 */ 
241 
movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */

242  
243 
pmaddwd_m2r (*(table+28), mm6); /* mm6 = C3*x5C1*x7 C7*x5+C3*x7 */ 
244 
paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */

245  
246 
paddd_m2r (*rounder, mm0); /* mm0 += rounder */

247 
psubd_r2r (mm1, mm3); /* mm3 = a1b1 a0b0 + rounder */

248  
249 
psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */

250 
paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */

251  
252 
paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */

253 
psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */

254  
255 
paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */

256 
movq_r2r (mm0, mm7); /* mm7 = a3 a2 + rounder */

257  
258 
paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */

259 
psubd_r2r (mm5, mm7); /* mm7 = a3b3 a2b2 + rounder */

260 
} 
261  
262 
static inline void mmx_row_tail (int16_t * const row, const int store) 
263 
{ 
264 
psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */

265  
266 
psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */

267  
268 
packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */

269  
270 
packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */

271  
272 
movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */

273 
movq_r2r (mm7, mm4); /* mm4 = y6 y7 y4 y5 */

274  
275 
pslld_i2r (16, mm7); /* mm7 = y7 0 y5 0 */ 
276  
277 
psrld_i2r (16, mm4); /* mm4 = 0 y6 0 y4 */ 
278  
279 
por_r2r (mm4, mm7); /* mm7 = y7 y6 y5 y4 */

280  
281 
/* slot */

282  
283 
movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ 
284 
} 
285  
286 
static inline void mmx_row_mid (int16_t * const row, const int store, 
287 
const int offset, const int16_t * const table) 
288 
{ 
289 
movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */

290 
psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */

291  
292 
movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ 
293 
psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */

294  
295 
packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */

296 
movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */

297  
298 
packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */

299 
movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */

300  
301 
movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */

302 
movq_r2r (mm7, mm1); /* mm1 = y6 y7 y4 y5 */

303  
304 
punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */

305 
psrld_i2r (16, mm7); /* mm7 = 0 y6 0 y4 */ 
306  
307 
movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */

308 
pslld_i2r (16, mm1); /* mm1 = y7 0 y5 0 */ 
309  
310 
movq_m2r (*(table+4), mm4); /* mm4 = C2 C4 C6 C4 */ 
311 
por_r2r (mm1, mm7); /* mm7 = y7 y6 y5 y4 */

312  
313 
movq_m2r (*(table+8), mm1); /* mm1 = C7 C3 C3 C1 */ 
314 
punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */

315  
316 
movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ 
317 
pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */

318 
} 
319  
320  
321 
#if 0
/* C column IDCT - it is just here to document the MMXEXT and MMX versions */
/*
 * col    : coefficient block; the column processed starts at col[offset]
 *          and its elements are 8 words apart
 * offset : index of the column's first element
 *
 * T1, T2, T3 and C4 are the constants #defined inside the MMX idct_col().
 * Every intermediate is saturated to 16 bits, as the MMX code does with
 * paddsw/psubsw.
 */
static inline void idct_col (int16_t * col, int offset)
{
/* multiplication - as implemented on mmx */
#define F(c,x) (((c) * (x)) >> 16)

/* saturation - it helps us handle torture test cases */
#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x))

    int16_t x0, x1, x2, x3, x4, x5, x6, x7;
    int16_t y0, y1, y2, y3, y4, y5, y6, y7;
    int16_t a0, a1, a2, a3, b0, b1, b2, b3;
    int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12;

    col += offset;

    x0 = col[0*8];
    x1 = col[1*8];
    x2 = col[2*8];
    x3 = col[3*8];
    x4 = col[4*8];
    x5 = col[5*8];
    x6 = col[6*8];
    x7 = col[7*8];

    /* even part */
    u04 = S (x0 + x4);
    v04 = S (x0 - x4);
    u26 = S (F (T2, x6) + x2);
    v26 = S (F (T2, x2) - x6);

    a0 = S (u04 + u26);
    a1 = S (v04 + v26);
    a2 = S (v04 - v26);
    a3 = S (u04 - u26);

    /* odd part */
    u17 = S (F (T1, x7) + x1);
    v17 = S (F (T1, x1) - x7);
    u35 = S (F (T3, x5) + x3);
    v35 = S (F (T3, x3) - x5);

    b0 = S (u17 + u35);
    b3 = S (v17 - v35);
    u12 = S (u17 - u35);
    v12 = S (v17 + v35);
    u12 = S (2 * F (C4, u12));
    v12 = S (2 * F (C4, v12));
    b1 = S (u12 + v12);
    b2 = S (u12 - v12);

    /* butterfly: outputs k and 7-k share the same a/b pair */
    y0 = S (a0 + b0) >> COL_SHIFT;
    y1 = S (a1 + b1) >> COL_SHIFT;
    y2 = S (a2 + b2) >> COL_SHIFT;
    y3 = S (a3 + b3) >> COL_SHIFT;

    y4 = S (a3 - b3) >> COL_SHIFT;
    y5 = S (a2 - b2) >> COL_SHIFT;
    y6 = S (a1 - b1) >> COL_SHIFT;
    y7 = S (a0 - b0) >> COL_SHIFT;

    col[0*8] = y0;
    col[1*8] = y1;
    col[2*8] = y2;
    col[3*8] = y3;
    col[4*8] = y4;
    col[5*8] = y5;
    col[6*8] = y6;
    col[7*8] = y7;
}
#endif

391  
392  
393 
/* MMX column IDCT */

394 
static inline void idct_col (int16_t * const col, const int offset) 
395 
{ 
396 
#define T1 13036 
397 
#define T2 27146 
398 
#define T3 43790 
399 
#define C4 23170 
400  
401 
DECLARE_ALIGNED(8, static const short, t1_vector)[] = {T1,T1,T1,T1}; 
402 
DECLARE_ALIGNED(8, static const short, t2_vector)[] = {T2,T2,T2,T2}; 
403 
DECLARE_ALIGNED(8, static const short, t3_vector)[] = {T3,T3,T3,T3}; 
404 
DECLARE_ALIGNED(8, static const short, c4_vector)[] = {C4,C4,C4,C4}; 
405  
406 
/* column code adapted from Peter Gubanov */

407 
/* http://www.elecard.com/peter/idct.shtml */

408  
409 
movq_m2r (*t1_vector, mm0); /* mm0 = T1 */

410  
411 
movq_m2r (*(col+offset+1*8), mm1); /* mm1 = x1 */ 
412 
movq_r2r (mm0, mm2); /* mm2 = T1 */

413  
414 
movq_m2r (*(col+offset+7*8), mm4); /* mm4 = x7 */ 
415 
pmulhw_r2r (mm1, mm0); /* mm0 = T1*x1 */

416  
417 
movq_m2r (*t3_vector, mm5); /* mm5 = T3 */

418 
pmulhw_r2r (mm4, mm2); /* mm2 = T1*x7 */

419  
420 
movq_m2r (*(col+offset+5*8), mm6); /* mm6 = x5 */ 
421 
movq_r2r (mm5, mm7); /* mm7 = T31 */

422  
423 
movq_m2r (*(col+offset+3*8), mm3); /* mm3 = x3 */ 
424 
psubsw_r2r (mm4, mm0); /* mm0 = v17 */

425  
426 
movq_m2r (*t2_vector, mm4); /* mm4 = T2 */

427 
pmulhw_r2r (mm3, mm5); /* mm5 = (T31)*x3 */

428  
429 
paddsw_r2r (mm2, mm1); /* mm1 = u17 */

430 
pmulhw_r2r (mm6, mm7); /* mm7 = (T31)*x5 */

431  
432 
/* slot */

433  
434 
movq_r2r (mm4, mm2); /* mm2 = T2 */

435 
paddsw_r2r (mm3, mm5); /* mm5 = T3*x3 */

436  
437 
pmulhw_m2r (*(col+offset+2*8), mm4);/* mm4 = T2*x2 */ 
438 
paddsw_r2r (mm6, mm7); /* mm7 = T3*x5 */

439  
440 
psubsw_r2r (mm6, mm5); /* mm5 = v35 */

441 
paddsw_r2r (mm3, mm7); /* mm7 = u35 */

442  
443 
movq_m2r (*(col+offset+6*8), mm3); /* mm3 = x6 */ 
444 
movq_r2r (mm0, mm6); /* mm6 = v17 */

445  
446 
pmulhw_r2r (mm3, mm2); /* mm2 = T2*x6 */

447 
psubsw_r2r (mm5, mm0); /* mm0 = b3 */

448  
449 
psubsw_r2r (mm3, mm4); /* mm4 = v26 */

450 
paddsw_r2r (mm6, mm5); /* mm5 = v12 */

451  
452 
movq_r2m (mm0, *(col+offset+3*8)); /* save b3 in scratch0 */ 
453 
movq_r2r (mm1, mm6); /* mm6 = u17 */

454  
455 
paddsw_m2r (*(col+offset+2*8), mm2);/* mm2 = u26 */ 
456 
paddsw_r2r (mm7, mm6); /* mm6 = b0 */

457  
458 
psubsw_r2r (mm7, mm1); /* mm1 = u12 */

459 
movq_r2r (mm1, mm7); /* mm7 = u12 */

460  
461 
movq_m2r (*(col+offset+0*8), mm3); /* mm3 = x0 */ 
462 
paddsw_r2r (mm5, mm1); /* mm1 = u12+v12 */

463  
464 
movq_m2r (*c4_vector, mm0); /* mm0 = C4/2 */

465 
psubsw_r2r (mm5, mm7); /* mm7 = u12v12 */

466  
467 
movq_r2m (mm6, *(col+offset+5*8)); /* save b0 in scratch1 */ 
468 
pmulhw_r2r (mm0, mm1); /* mm1 = b1/2 */

469  
470 
movq_r2r (mm4, mm6); /* mm6 = v26 */

471 
pmulhw_r2r (mm0, mm7); /* mm7 = b2/2 */

472  
473 
movq_m2r (*(col+offset+4*8), mm5); /* mm5 = x4 */ 
474 
movq_r2r (mm3, mm0); /* mm0 = x0 */

475  
476 
psubsw_r2r (mm5, mm3); /* mm3 = v04 */

477 
paddsw_r2r (mm5, mm0); /* mm0 = u04 */

478  
479 
paddsw_r2r (mm3, mm4); /* mm4 = a1 */

480 
movq_r2r (mm0, mm5); /* mm5 = u04 */

481  
482 
psubsw_r2r (mm6, mm3); /* mm3 = a2 */

483 
paddsw_r2r (mm2, mm5); /* mm5 = a0 */

484  
485 
paddsw_r2r (mm1, mm1); /* mm1 = b1 */

486 
psubsw_r2r (mm2, mm0); /* mm0 = a3 */

487  
488 
paddsw_r2r (mm7, mm7); /* mm7 = b2 */

489 
movq_r2r (mm3, mm2); /* mm2 = a2 */

490  
491 
movq_r2r (mm4, mm6); /* mm6 = a1 */

492 
paddsw_r2r (mm7, mm3); /* mm3 = a2+b2 */

493  
494 
psraw_i2r (COL_SHIFT, mm3); /* mm3 = y2 */

495 
paddsw_r2r (mm1, mm4); /* mm4 = a1+b1 */

496  
497 
psraw_i2r (COL_SHIFT, mm4); /* mm4 = y1 */

498 
psubsw_r2r (mm1, mm6); /* mm6 = a1b1 */

499  
500 
movq_m2r (*(col+offset+5*8), mm1); /* mm1 = b0 */ 
501 
psubsw_r2r (mm7, mm2); /* mm2 = a2b2 */

502  
503 
psraw_i2r (COL_SHIFT, mm6); /* mm6 = y6 */

504 
movq_r2r (mm5, mm7); /* mm7 = a0 */

505  
506 
movq_r2m (mm4, *(col+offset+1*8)); /* save y1 */ 
507 
psraw_i2r (COL_SHIFT, mm2); /* mm2 = y5 */

508  
509 
movq_r2m (mm3, *(col+offset+2*8)); /* save y2 */ 
510 
paddsw_r2r (mm1, mm5); /* mm5 = a0+b0 */

511  
512 
movq_m2r (*(col+offset+3*8), mm4); /* mm4 = b3 */ 
513 
psubsw_r2r (mm1, mm7); /* mm7 = a0b0 */

514  
515 
psraw_i2r (COL_SHIFT, mm5); /* mm5 = y0 */

516 
movq_r2r (mm0, mm3); /* mm3 = a3 */

517  
518 
movq_r2m (mm2, *(col+offset+5*8)); /* save y5 */ 
519 
psubsw_r2r (mm4, mm3); /* mm3 = a3b3 */

520  
521 
psraw_i2r (COL_SHIFT, mm7); /* mm7 = y7 */

522 
paddsw_r2r (mm0, mm4); /* mm4 = a3+b3 */

523  
524 
movq_r2m (mm5, *(col+offset+0*8)); /* save y0 */ 
525 
psraw_i2r (COL_SHIFT, mm3); /* mm3 = y4 */

526  
527 
movq_r2m (mm6, *(col+offset+6*8)); /* save y6 */ 
528 
psraw_i2r (COL_SHIFT, mm4); /* mm4 = y3 */

529  
530 
movq_r2m (mm7, *(col+offset+7*8)); /* save y7 */ 
531  
532 
movq_r2m (mm3, *(col+offset+4*8)); /* save y4 */ 
533  
534 
movq_r2m (mm4, *(col+offset+3*8)); /* save y3 */ 
535  
536 
#undef T1

537 
#undef T2

538 
#undef T3

539 
#undef C4

540 
} 
541  
542  
543 
DECLARE_ALIGNED(8, static const int32_t, rounder0)[] = 
544 
rounder ((1 << (COL_SHIFT  1))  0.5); 
545 
DECLARE_ALIGNED(8, static const int32_t, rounder4)[] = rounder (0); 
546 
DECLARE_ALIGNED(8, static const int32_t, rounder1)[] = 
547 
rounder (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */ 
548 
DECLARE_ALIGNED(8, static const int32_t, rounder7)[] = 
549 
rounder (0.25); /* C1*(C7/C4+C7C1)/2 */ 
550 
DECLARE_ALIGNED(8, static const int32_t, rounder2)[] = 
551 
rounder (0.60355339059); /* C2 * (C6+C2)/2 */ 
552 
DECLARE_ALIGNED(8, static const int32_t, rounder6)[] = 
553 
rounder (0.25); /* C2 * (C6C2)/2 */ 
554 
DECLARE_ALIGNED(8, static const int32_t, rounder3)[] = 
555 
rounder (0.087788325588); /* C3*(C3/C4+C3+C5)/2 */ 
556 
DECLARE_ALIGNED(8, static const int32_t, rounder5)[] = 
557 
rounder (0.441341716183); /* C3*(C5/C4+C5C3)/2 */ 
558  
559 
#undef COL_SHIFT

560 
#undef ROW_SHIFT

561  
562 
/*
 * Generates a complete 8x8 IDCT entry point.
 *
 * idct          : name of the function to define; it takes the 64-word
 *                 block and transforms it in place
 * table         : table-building macro (mmxext_table or mmx_table)
 * idct_row_head : loads the first row and primes the registers
 * idct_row      : row computation
 * idct_row_tail : stores the last row
 * idct_row_mid  : stores the previous row while loading the next one
 *
 * Rows are processed in the order 0, 4, 1, 7, 2, 6, 3, 5 so that two
 * consecutive rows share a coefficient table (table04, table17, table26,
 * table35); each row uses its own rounderN bias.  The column pass then runs
 * twice, once per group of four columns.
 */
#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \
void idct (int16_t * const block)                                       \
{                                                                       \
    DECLARE_ALIGNED(16, static const int16_t, table04)[] =              \
        table (22725, 21407, 19266, 16384, 12873,  8867, 4520);         \
    DECLARE_ALIGNED(16, static const int16_t, table17)[] =              \
        table (31521, 29692, 26722, 22725, 17855, 12299, 6270);         \
    DECLARE_ALIGNED(16, static const int16_t, table26)[] =              \
        table (29692, 27969, 25172, 21407, 16819, 11585, 5906);         \
    DECLARE_ALIGNED(16, static const int16_t, table35)[] =              \
        table (26722, 25172, 22654, 19266, 15137, 10426, 5315);         \
                                                                        \
    idct_row_head (block, 0*8, table04);                                \
    idct_row (table04, rounder0);                                       \
    idct_row_mid (block, 0*8, 4*8, table04);                            \
    idct_row (table04, rounder4);                                       \
    idct_row_mid (block, 4*8, 1*8, table17);                            \
    idct_row (table17, rounder1);                                       \
    idct_row_mid (block, 1*8, 7*8, table17);                            \
    idct_row (table17, rounder7);                                       \
    idct_row_mid (block, 7*8, 2*8, table26);                            \
    idct_row (table26, rounder2);                                       \
    idct_row_mid (block, 2*8, 6*8, table26);                            \
    idct_row (table26, rounder6);                                       \
    idct_row_mid (block, 6*8, 3*8, table35);                            \
    idct_row (table35, rounder3);                                       \
    idct_row_mid (block, 3*8, 5*8, table35);                            \
    idct_row (table35, rounder5);                                       \
    idct_row_tail (block, 5*8);                                         \
                                                                        \
    idct_col (block, 0);                                                \
    idct_col (block, 4);                                                \
}
595  
596 
declare_idct (ff_mmxext_idct, mmxext_table, 
597 
mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid) 
598  
599 
declare_idct (ff_mmx_idct, mmx_table, 
600 
mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid) 
601 