Statistics
| Branch: | Revision:

ffmpeg / libavcodec / i386 / fdct_mmx.c @ be449fca

History | View | Annotate | Download (15.9 KB)

1
/*
2
 * MMX optimized forward DCT
3
 * The gcc porting is Copyright (c) 2001 Fabrice Bellard.
4
 * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 * SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
6
 *
7
 * from  fdctam32.c - AP922 MMX(3D-Now) forward-DCT
8
 *
9
 *  Intel Application Note AP-922 - fast, precise implementation of DCT
10
 *        http://developer.intel.com/vtune/cbts/appnotes.htm
11
 *
12
 * Also of inspiration:
13
 * a page about fdct at http://www.geocities.com/ssavekar/dct.htm
14
 * Skal's fdct at http://skal.planet-d.net/coding/dct.html
15
 *
16
 * This file is part of FFmpeg.
17
 *
18
 * FFmpeg is free software; you can redistribute it and/or
19
 * modify it under the terms of the GNU Lesser General Public
20
 * License as published by the Free Software Foundation; either
21
 * version 2.1 of the License, or (at your option) any later version.
22
 *
23
 * FFmpeg is distributed in the hope that it will be useful,
24
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
26
 * Lesser General Public License for more details.
27
 *
28
 * You should have received a copy of the GNU Lesser General Public
29
 * License along with FFmpeg; if not, write to the Free Software
30
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
31
 */
32

    
33
#include "libavutil/common.h"
34
#include "libavcodec/dsputil.h"
35
#include "mmx.h"
36

    
37
/* Shorthand for GCC's aligned attribute: ATTR_ALIGN(n) requests n-byte alignment for the object it is attached to. */
#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))
38

    
39
//////////////////////////////////////////////////////////////////////
40
//
41
// constants for the forward DCT
42
// -----------------------------
43
//
44
// Be sure to check that your compiler is aligning all constants to QWORD
45
// (8-byte) memory boundaries!  Otherwise the unaligned memory access will
46
// severely stall MMX execution.
47
//
48
//////////////////////////////////////////////////////////////////////
49

    
50
/* Number of extra precision bits carried through the column pass (2 or 3 for accuracy). */
#define BITS_FRW_ACC   3

/* Left shift applied to the input sums/differences in the column pass. */
#define SHIFT_FRW_COL  BITS_FRW_ACC

/* Arithmetic right shift that finishes the row pass. */
#define SHIFT_FRW_ROW  (BITS_FRW_ACC + 17 - 3)

/* Rounding bias (half of 1 << SHIFT_FRW_ROW) added before the row-pass shift. */
#define RND_FRW_ROW    (1 << (SHIFT_FRW_ROW - 1))

/* Not defined because nothing in this file uses it:
 *   RND_FRW_COL = (1 << (SHIFT_FRW_COL-1))
 */
55

    
56
/* Expands to its argument repeated eight times, comma-separated; used below to fill eight int16_t lanes with one constant. */
#define X8(x) x,x,x,x,x,x,x,x
57

    
58
//concatenated table, for forward DCT transformation
59
static const int16_t fdct_tg_all_16[24] ATTR_ALIGN(16) = {
60
    X8(13036),  // tg * (2<<16) + 0.5
61
    X8(27146),  // tg * (2<<16) + 0.5
62
    X8(-21746)  // tg * (2<<16) + 0.5
63
};
64

    
65
/*
 * cos(pi/4) scaled by 32768 (1 << 15), replicated over eight int16_t lanes.
 * FDCT_COL multiplies by it with pmulhw.
 */
static const int16_t ocos_4_16[1 * 8] ATTR_ALIGN(16) = {
    X8(23170)       // cos(pi/4) * 32768, rounded
};
68

    
69
/* Eight int16_t lanes holding 1; FDCT_COL ORs it (por) into several results, forcing the lowest bit of each lane to 1. */
static const int16_t fdct_one_corr[8] ATTR_ALIGN(16) = { X8(1) };
70

    
71
/* Row-pass rounding bias for the MMX paths: two 32-bit copies of RND_FRW_ROW, added to every sum before the shift by SHIFT_FRW_ROW. */
static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };
72

    
73
static struct
74
{
75
 const int32_t fdct_r_row_sse2[4] ATTR_ALIGN(16);
76
} fdct_r_row_sse2 ATTR_ALIGN(16)=
77
{{
78
 RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
79
}};
80
//static const long fdct_r_row_sse2[4] ATTR_ALIGN(16) = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
81

    
82
/*
 * Expands to the 32 row-pass coefficients of one row in the layout consumed
 * by fdct_row_mmx() / fdct_row_mmx2(), built from seven scaled factors
 * c1..c7.  Every row of the table uses this same sign/position pattern and
 * differs only in the factor values.
 */
#define FDCT_ROW_TABLE_MMX(c1, c2, c3, c4, c5, c6, c7) \
     (c4),  (c4),  (c1),  (c3), \
     (c4),  (c4),  (c5),  (c7), \
     (c2),  (c6),  (c3), -(c7), \
    -(c6), -(c2), -(c1), -(c5), \
     (c4), -(c4),  (c5), -(c1), \
    -(c4),  (c4),  (c7),  (c3), \
     (c6), -(c2),  (c7), -(c5), \
     (c2), -(c6),  (c3), -(c1)

/*
 * Forward DCT row coefficients for the MMX paths: 8 rows x 32 int16_t.
 * ff_fdct_mmx() and ff_fdct_mmx2() advance through it by 32 entries per row.
 * Rows 4..7 reuse the factor sets of rows 0, 3, 2 and 1 respectively.
 */
static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = {  // forward_dct coeff table
    FDCT_ROW_TABLE_MMX(22725, 21407, 19266, 16384, 12873,  8867, 4520),  // row 0
    FDCT_ROW_TABLE_MMX(31521, 29692, 26722, 22725, 17855, 12299, 6270),  // row 1
    FDCT_ROW_TABLE_MMX(29692, 27969, 25172, 21407, 16819, 11585, 5906),  // row 2
    FDCT_ROW_TABLE_MMX(26722, 25172, 22654, 19266, 15137, 10426, 5315),  // row 3
    FDCT_ROW_TABLE_MMX(22725, 21407, 19266, 16384, 12873,  8867, 4520),  // row 4
    FDCT_ROW_TABLE_MMX(26722, 25172, 22654, 19266, 15137, 10426, 5315),  // row 5
    FDCT_ROW_TABLE_MMX(29692, 27969, 25172, 21407, 16819, 11585, 5906),  // row 6
    FDCT_ROW_TABLE_MMX(31521, 29692, 26722, 22725, 17855, 12299, 6270)   // row 7
};

#undef FDCT_ROW_TABLE_MMX
155

    
156
static struct
157
{
158
 const int16_t tab_frw_01234567_sse2[256] ATTR_ALIGN(16);
159
} tab_frw_01234567_sse2 ATTR_ALIGN(16) =
160
{{
161
//static const int16_t tab_frw_01234567_sse2[] ATTR_ALIGN(16) = {  // forward_dct coeff table
162
#define TABLE_SSE2 C4,  C4,  C1,  C3, -C6, -C2, -C1, -C5, \
163
                   C4,  C4,  C5,  C7,  C2,  C6,  C3, -C7, \
164
                  -C4,  C4,  C7,  C3,  C6, -C2,  C7, -C5, \
165
                   C4, -C4,  C5, -C1,  C2, -C6,  C3, -C1,
166
// c1..c7 * cos(pi/4) * 2^15
167
#define C1 22725
168
#define C2 21407
169
#define C3 19266
170
#define C4 16384
171
#define C5 12873
172
#define C6 8867
173
#define C7 4520
174
TABLE_SSE2
175

    
176
#undef C1
177
#undef C2
178
#undef C3
179
#undef C4
180
#undef C5
181
#undef C6
182
#undef C7
183
#define C1 31521
184
#define C2 29692
185
#define C3 26722
186
#define C4 22725
187
#define C5 17855
188
#define C6 12299
189
#define C7 6270
190
TABLE_SSE2
191

    
192
#undef C1
193
#undef C2
194
#undef C3
195
#undef C4
196
#undef C5
197
#undef C6
198
#undef C7
199
#define C1 29692
200
#define C2 27969
201
#define C3 25172
202
#define C4 21407
203
#define C5 16819
204
#define C6 11585
205
#define C7 5906
206
TABLE_SSE2
207

    
208
#undef C1
209
#undef C2
210
#undef C3
211
#undef C4
212
#undef C5
213
#undef C6
214
#undef C7
215
#define C1 26722
216
#define C2 25172
217
#define C3 22654
218
#define C4 19266
219
#define C5 15137
220
#define C6 10426
221
#define C7 5315
222
TABLE_SSE2
223

    
224
#undef C1
225
#undef C2
226
#undef C3
227
#undef C4
228
#undef C5
229
#undef C6
230
#undef C7
231
#define C1 22725
232
#define C2 21407
233
#define C3 19266
234
#define C4 16384
235
#define C5 12873
236
#define C6 8867
237
#define C7 4520
238
TABLE_SSE2
239

    
240
#undef C1
241
#undef C2
242
#undef C3
243
#undef C4
244
#undef C5
245
#undef C6
246
#undef C7
247
#define C1 26722
248
#define C2 25172
249
#define C3 22654
250
#define C4 19266
251
#define C5 15137
252
#define C6 10426
253
#define C7 5315
254
TABLE_SSE2
255

    
256
#undef C1
257
#undef C2
258
#undef C3
259
#undef C4
260
#undef C5
261
#undef C6
262
#undef C7
263
#define C1 29692
264
#define C2 27969
265
#define C3 25172
266
#define C4 21407
267
#define C5 16819
268
#define C6 11585
269
#define C7 5906
270
TABLE_SSE2
271

    
272
#undef C1
273
#undef C2
274
#undef C3
275
#undef C4
276
#undef C5
277
#undef C6
278
#undef C7
279
#define C1 31521
280
#define C2 29692
281
#define C3 26722
282
#define C4 22725
283
#define C5 17855
284
#define C6 12299
285
#define C7 6270
286
TABLE_SSE2
287
}};
288

    
289
/*
 * FDCT_COL(cpu, mm, mov) defines
 *     static void fdct_col_<cpu>(const int16_t *in, int16_t *out, int offset)
 * which runs the column pass of the forward DCT on one vector-wide group of
 * columns starting at column `offset`.
 *
 *   cpu - suffix of the generated function name
 *   mm  - register name prefix; registers <mm>0 .. <mm>7 are all used
 *   mov - full-register move mnemonic (<mov>_m2r / <mov>_r2r / <mov>_r2m)
 *
 * Row k of the block is read from in[offset + k * 8] and the result row k is
 * written to out[offset + k * 8], for k = 0..7.  The instruction order is
 * hand-scheduled; do not reorder.
 */
#define FDCT_COL(cpu, mm, mov) \
static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset) \
{ \
    /* sums/differences of the mirrored input rows (1,6) (2,5) (0,7) (3,4) */ \
    mov##_m2r(in[offset + 1 * 8], mm##0); \
    mov##_m2r(in[offset + 6 * 8], mm##1); \
    mov##_r2r(mm##0, mm##2); \
    mov##_m2r(in[offset + 2 * 8], mm##3); \
    paddsw_r2r(mm##1, mm##0); \
    mov##_m2r(in[offset + 5 * 8], mm##4); \
    psllw_i2r(SHIFT_FRW_COL, mm##0); \
    mov##_m2r(in[offset + 0 * 8], mm##5); \
    paddsw_r2r(mm##3, mm##4); \
    paddsw_m2r(in[offset + 7 * 8], mm##5); \
    psllw_i2r(SHIFT_FRW_COL, mm##4); \
    mov##_r2r(mm##0, mm##6); \
    psubsw_r2r(mm##1, mm##2); \
    mov##_m2r(fdct_tg_all_16[8], mm##1); \
    psubsw_r2r(mm##4, mm##0); \
    mov##_m2r(in[offset + 3 * 8], mm##7); \
    pmulhw_r2r(mm##0, mm##1); \
    paddsw_m2r(in[offset + 4 * 8], mm##7); \
    psllw_i2r(SHIFT_FRW_COL, mm##5); \
    paddsw_r2r(mm##4, mm##6); \
    psllw_i2r(SHIFT_FRW_COL, mm##7); \
    mov##_r2r(mm##5, mm##4); \
    psubsw_r2r(mm##7, mm##5); \
    paddsw_r2r(mm##5, mm##1); \
    paddsw_r2r(mm##7, mm##4); \
    por_m2r(fdct_one_corr[0], mm##1); \
    psllw_i2r(SHIFT_FRW_COL + 1, mm##2); \
    pmulhw_m2r(fdct_tg_all_16[8], mm##5); \
    mov##_r2r(mm##4, mm##7); \
    psubsw_m2r(in[offset + 5 * 8], mm##3); \
    psubsw_r2r(mm##6, mm##4); \
    /* output row 2 */ \
    mov##_r2m(mm##1, out[offset + 2 * 8]); \
    paddsw_r2r(mm##6, mm##7); \
    mov##_m2r(in[offset + 3 * 8], mm##1); \
    psllw_i2r(SHIFT_FRW_COL + 1, mm##3); \
    psubsw_m2r(in[offset + 4 * 8], mm##1); \
    mov##_r2r(mm##2, mm##6); \
    /* output row 4 */ \
    mov##_r2m(mm##4, out[offset + 4 * 8]); \
    paddsw_r2r(mm##3, mm##2); \
    pmulhw_m2r(ocos_4_16[0], mm##2); \
    psubsw_r2r(mm##3, mm##6); \
    pmulhw_m2r(ocos_4_16[0], mm##6); \
    psubsw_r2r(mm##0, mm##5); \
    por_m2r(fdct_one_corr[0], mm##5); \
    psllw_i2r(SHIFT_FRW_COL, mm##1); \
    por_m2r(fdct_one_corr[0], mm##2); \
    mov##_r2r(mm##1, mm##4); \
    mov##_m2r(in[offset + 0 * 8], mm##3); \
    paddsw_r2r(mm##6, mm##1); \
    psubsw_m2r(in[offset + 7 * 8], mm##3); \
    psubsw_r2r(mm##6, mm##4); \
    mov##_m2r(fdct_tg_all_16[0], mm##0); \
    psllw_i2r(SHIFT_FRW_COL, mm##3); \
    mov##_m2r(fdct_tg_all_16[16], mm##6); \
    pmulhw_r2r(mm##1, mm##0); \
    /* output row 0 */ \
    mov##_r2m(mm##7, out[offset + 0 * 8]); \
    pmulhw_r2r(mm##4, mm##6); \
    /* output row 6 */ \
    mov##_r2m(mm##5, out[offset + 6 * 8]); \
    mov##_r2r(mm##3, mm##7); \
    mov##_m2r(fdct_tg_all_16[16], mm##5); \
    psubsw_r2r(mm##2, mm##7); \
    paddsw_r2r(mm##2, mm##3); \
    pmulhw_r2r(mm##7, mm##5); \
    paddsw_r2r(mm##3, mm##0); \
    paddsw_r2r(mm##4, mm##6); \
    pmulhw_m2r(fdct_tg_all_16[0], mm##3); \
    por_m2r(fdct_one_corr[0], mm##0); \
    paddsw_r2r(mm##7, mm##5); \
    psubsw_r2r(mm##6, mm##7); \
    /* output rows 1, 3, 5, 7 */ \
    mov##_r2m(mm##0, out[offset + 1 * 8]); \
    paddsw_r2r(mm##4, mm##5); \
    mov##_r2m(mm##7, out[offset + 3 * 8]); \
    psubsw_r2r(mm##1, mm##3); \
    mov##_r2m(mm##5, out[offset + 5 * 8]); \
    mov##_r2m(mm##3, out[offset + 7 * 8]); \
}
368

    
369
/* Generates fdct_col_mmx(): column pass using movq on mm0..mm7 (8 bytes = four int16_t columns per call). */
FDCT_COL(mmx, mm, movq)
370
/* Generates fdct_col_sse2(): column pass using movdqa on xmm0..xmm7 (16 bytes = eight int16_t columns per call). */
FDCT_COL(sse2, xmm, movdqa)
371

    
372
/*
 * Building blocks for the asm body of fdct_row_sse2().  In all of them
 * `i` is the byte offset of the row inside the 8x8 int16_t block (16 bytes
 * per row) and `t` is the byte offset of the coefficient group inside
 * tab_frw_01234567_sse2.  Operand numbers refer to the asm statement below:
 * %0 = in, %1 = coefficient table, %3 = SHIFT_FRW_ROW, %4 = out.
 */

/*
 * Row prologue, first row of a pair: load the low four words of the row into
 * xmm2 and the high four into xmm0, then load all four coefficient vectors
 * of group `t` into xmm4, xmm5, xmm3, xmm7.
 */
#define FDCT_ROW_SSE2_H1(i,t)                    \
        "movq      " #i "(%0), %%xmm2      \n\t" \
        "movq      " #i "+8(%0), %%xmm0    \n\t" \
        "movdqa    " #t "+32(%1), %%xmm3   \n\t" \
        "movdqa    " #t "+48(%1), %%xmm7   \n\t" \
        "movdqa    " #t "(%1), %%xmm4      \n\t" \
        "movdqa    " #t "+16(%1), %%xmm5   \n\t"

/*
 * Row prologue, second row of a pair: same as H1 but only xmm3 and xmm7 are
 * reloaded.  FDCT_ROW_SSE2 overwrites those two (they are pmaddwd
 * destinations) while xmm4 and xmm5 are only read and are still valid.
 */
#define FDCT_ROW_SSE2_H2(i,t)                    \
        "movq      " #i "(%0), %%xmm2      \n\t" \
        "movq      " #i "+8(%0), %%xmm0    \n\t" \
        "movdqa    " #t "+32(%1), %%xmm3   \n\t" \
        "movdqa    " #t "+48(%1), %%xmm7   \n\t"

/*
 * Row body: reverse the high half (pshuflw $27), form sums and differences
 * against the low half, multiply-accumulate with the four coefficient
 * vectors, add the rounding bias held in xmm6, shift right by %3, pack back
 * to int16_t with saturation and store the finished row at byte offset `i`
 * of the output block.
 */
#define FDCT_ROW_SSE2(i)                      \
        "movq      %%xmm2, %%xmm1       \n\t" \
        "pshuflw   $27, %%xmm0, %%xmm0  \n\t" \
        "paddsw    %%xmm0, %%xmm1       \n\t" \
        "psubsw    %%xmm0, %%xmm2       \n\t" \
        "punpckldq %%xmm2, %%xmm1       \n\t" \
        "pshufd    $78, %%xmm1, %%xmm2  \n\t" \
        "pmaddwd   %%xmm2, %%xmm3       \n\t" \
        "pmaddwd   %%xmm1, %%xmm7       \n\t" \
        "pmaddwd   %%xmm5, %%xmm2       \n\t" \
        "pmaddwd   %%xmm4, %%xmm1       \n\t" \
        "paddd     %%xmm7, %%xmm3       \n\t" \
        "paddd     %%xmm2, %%xmm1       \n\t" \
        "paddd     %%xmm6, %%xmm3       \n\t" \
        "paddd     %%xmm6, %%xmm1       \n\t" \
        "psrad     %3, %%xmm3           \n\t" \
        "psrad     %3, %%xmm1           \n\t" \
        "packssdw  %%xmm3, %%xmm1       \n\t" \
        "movdqa    %%xmm1, " #i "(%4)   \n\t"

/**
 * SSE2 row pass of the forward DCT over a whole 8x8 block.
 *
 * Rows are processed in pairs that share one coefficient group:
 * (0,4), (1,7), (2,6) and (3,5), using table byte offsets 0, 64, 128, 192.
 *
 * @param in   64 int16_t values, read row by row with movq
 * @param out  64 int16_t values, written row by row with movdqa, so it must
 *             be 16-byte aligned
 */
static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
{
    __asm__ volatile(
        /* xmm6 holds the rounding bias for the whole block */
        "movdqa    (%2), %%xmm6         \n\t"
        FDCT_ROW_SSE2_H1(0,0)
        FDCT_ROW_SSE2(0)
        FDCT_ROW_SSE2_H2(64,0)
        FDCT_ROW_SSE2(64)

        FDCT_ROW_SSE2_H1(16,64)
        FDCT_ROW_SSE2(16)
        FDCT_ROW_SSE2_H2(112,64)
        FDCT_ROW_SSE2(112)

        FDCT_ROW_SSE2_H1(32,128)
        FDCT_ROW_SSE2(32)
        FDCT_ROW_SSE2_H2(96,128)
        FDCT_ROW_SSE2(96)

        FDCT_ROW_SSE2_H1(48,192)
        FDCT_ROW_SSE2(48)
        FDCT_ROW_SSE2_H2(80,192)
        FDCT_ROW_SSE2(80)
        :
        : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
        /*
         * The asm reads *in and the tables and writes *out through plain
         * pointer operands, so the compiler must be told that memory is
         * accessed; otherwise it may cache or reorder accesses to those
         * buffers around this statement.
         * NOTE(review): xmm0-xmm7 are also modified but are not listed as
         * clobbers here, because not every supported compiler/flag
         * combination is known to accept them - confirm before adding.
         */
        : "memory"
    );
}
433

    
434
/**
 * Row pass of the forward DCT for one row, MMX2 variant (uses pshufw).
 *
 * @param in     the 8 int16_t values of the row
 * @param out    receives the 8 transformed int16_t values
 * @param table  the 32 coefficients for this row (see tab_frw_01234567)
 *
 * Uses mm0..mm7.  The instruction sequence is kept in its original order.
 */
static av_always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table)
{
    /* mm5 = in[7], in[6], in[5], in[4] (upper half, word order reversed) */
    pshufw_m2r(in[4], mm5, 0x1B);
    movq_m2r(in[0], mm0);
    movq_r2r(mm0, mm1);
    /* mm0 = lower + reversed upper, mm1 = lower - reversed upper */
    paddsw_r2r(mm5, mm0);
    psubsw_r2r(mm5, mm1);
    movq_r2r(mm0, mm2);
    /* interleave sums and differences as dwords: low pairs in mm0, high pairs in mm2 */
    punpckldq_r2r(mm1, mm0);
    punpckhdq_r2r(mm1, mm2);

    /* load six of the eight coefficient quads; the last two are used from memory */
    movq_m2r(table[0], mm1);
    movq_m2r(table[4], mm3);
    movq_m2r(table[8], mm4);
    movq_m2r(table[12], mm5);
    movq_m2r(table[16], mm6);
    movq_m2r(table[20], mm7);

    /* multiply-accumulate */
    pmaddwd_r2r(mm0, mm1);
    pmaddwd_r2r(mm2, mm3);
    pmaddwd_r2r(mm0, mm4);
    pmaddwd_r2r(mm2, mm5);
    pmaddwd_r2r(mm0, mm6);
    pmaddwd_r2r(mm2, mm7);
    pmaddwd_m2r(table[24], mm0);
    pmaddwd_m2r(table[28], mm2);

    /* combine partial products */
    paddd_r2r(mm1, mm3);
    paddd_r2r(mm4, mm5);
    paddd_r2r(mm6, mm7);
    paddd_r2r(mm0, mm2);

    /* round ... */
    movq_m2r(fdct_r_row[0], mm0);
    paddd_r2r(mm0, mm3);
    paddd_r2r(mm0, mm5);
    paddd_r2r(mm0, mm7);
    paddd_r2r(mm0, mm2);

    /* ... and scale back down */
    psrad_i2r(SHIFT_FRW_ROW, mm3);
    psrad_i2r(SHIFT_FRW_ROW, mm5);
    psrad_i2r(SHIFT_FRW_ROW, mm7);
    psrad_i2r(SHIFT_FRW_ROW, mm2);

    /* pack to int16_t with saturation and store the row */
    packssdw_r2r(mm5, mm3);
    packssdw_r2r(mm2, mm7);
    movq_r2m(mm3, out[0]);
    movq_r2m(mm7, out[4]);
}
476

    
477
/**
 * Row pass of the forward DCT for one row, plain MMX variant.
 *
 * Identical to fdct_row_mmx2() except that the reversed upper half of the
 * row is assembled with movd/punpcklwd/psrlq instead of pshufw.
 *
 * @param in     the 8 int16_t values of the row
 * @param out    receives the 8 transformed int16_t values
 * @param table  the 32 coefficients for this row (see tab_frw_01234567)
 *
 * Uses mm0..mm7.  The instruction sequence is kept in its original order.
 */
static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
{
//FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...)
    /* build mm1 = in[7], in[6], in[5], in[4] */
    movd_m2r(in[6], mm1);
    punpcklwd_m2r(in[4], mm1);
    movq_r2r(mm1, mm2);
    psrlq_i2r(0x20, mm1);
    movq_m2r(in[0], mm0);
    punpcklwd_r2r(mm2, mm1);
    movq_r2r(mm0, mm5);
    /* mm0 = lower + reversed upper, mm5 = lower - reversed upper */
    paddsw_r2r(mm1, mm0);
    psubsw_r2r(mm1, mm5);
    movq_r2r(mm0, mm2);
    /* interleave sums and differences as dwords: low pairs in mm0, high pairs in mm2 */
    punpckldq_r2r(mm5, mm0);
    punpckhdq_r2r(mm5, mm2);

    /* load six of the eight coefficient quads; the last two are used from memory */
    movq_m2r(table[0], mm1);
    movq_m2r(table[4], mm3);
    movq_m2r(table[8], mm4);
    movq_m2r(table[12], mm5);
    movq_m2r(table[16], mm6);
    movq_m2r(table[20], mm7);

    /* multiply-accumulate */
    pmaddwd_r2r(mm0, mm1);
    pmaddwd_r2r(mm2, mm3);
    pmaddwd_r2r(mm0, mm4);
    pmaddwd_r2r(mm2, mm5);
    pmaddwd_r2r(mm0, mm6);
    pmaddwd_r2r(mm2, mm7);
    pmaddwd_m2r(table[24], mm0);
    pmaddwd_m2r(table[28], mm2);

    /* combine partial products */
    paddd_r2r(mm1, mm3);
    paddd_r2r(mm4, mm5);
    paddd_r2r(mm6, mm7);
    paddd_r2r(mm0, mm2);

    /* round ... */
    movq_m2r(fdct_r_row[0], mm0);
    paddd_r2r(mm0, mm3);
    paddd_r2r(mm0, mm5);
    paddd_r2r(mm0, mm7);
    paddd_r2r(mm0, mm2);

    /* ... and scale back down */
    psrad_i2r(SHIFT_FRW_ROW, mm3);
    psrad_i2r(SHIFT_FRW_ROW, mm5);
    psrad_i2r(SHIFT_FRW_ROW, mm7);
    psrad_i2r(SHIFT_FRW_ROW, mm2);

    /* pack to int16_t with saturation and store the row */
    packssdw_r2r(mm5, mm3);
    packssdw_r2r(mm2, mm7);
    movq_r2m(mm3, out[0]);
    movq_r2m(mm7, out[4]);
}
524

    
525
/**
 * In-place forward DCT of an 8x8 block of int16_t using MMX.
 *
 * The column pass writes into an aligned scratch buffer (two calls, four
 * columns each); the row pass then transforms the scratch rows back into
 * @p block, one row per iteration.
 *
 * No emms is issued in this function or the helpers it calls.
 * NOTE(review): callers are presumably expected to clear the MMX state.
 *
 * @param block  64 coefficients, overwritten with the transform
 */
void ff_fdct_mmx(int16_t *block)
{
    int64_t align_tmp[16] ATTR_ALIGN(8);
    int16_t *const scratch = (int16_t *)align_tmp;
    int row;

    /* columns 0..3, then columns 4..7 */
    fdct_col_mmx(block, scratch, 0);
    fdct_col_mmx(block, scratch, 4);

    /* each row has 8 samples and its own group of 32 coefficients */
    for (row = 0; row < 8; row++) {
        fdct_row_mmx(scratch + 8 * row, block + 8 * row,
                     tab_frw_01234567 + 32 * row);
    }
}
542

    
543
/**
 * In-place forward DCT of an 8x8 block of int16_t using MMX2.
 *
 * Same structure as the plain MMX version: the MMX column pass fills an
 * aligned scratch buffer, then the row pass (fdct_row_mmx2, which relies on
 * pshufw) transforms the scratch rows back into @p block.
 *
 * No emms is issued in this function or the helpers it calls.
 * NOTE(review): callers are presumably expected to clear the MMX state.
 *
 * @param block  64 coefficients, overwritten with the transform
 */
void ff_fdct_mmx2(int16_t *block)
{
    int64_t align_tmp[16] ATTR_ALIGN(8);
    int16_t *const scratch = (int16_t *)align_tmp;
    int row;

    /* columns 0..3, then columns 4..7 */
    fdct_col_mmx(block, scratch, 0);
    fdct_col_mmx(block, scratch, 4);

    /* each row has 8 samples and its own group of 32 coefficients */
    for (row = 0; row < 8; row++) {
        fdct_row_mmx2(scratch + 8 * row, block + 8 * row,
                      tab_frw_01234567 + 32 * row);
    }
}
560

    
561
/**
 * In-place forward DCT of an 8x8 block of int16_t using SSE2.
 *
 * One column-pass call covers all eight columns and writes a 16-byte
 * aligned scratch buffer; one row-pass call then transforms all eight rows
 * back into @p block.
 *
 * @param block  64 coefficients, overwritten with the transform; accessed
 *               with movdqa, so it must be 16-byte aligned
 */
void ff_fdct_sse2(int16_t *block)
{
    int64_t align_tmp[16] ATTR_ALIGN(16);
    int16_t *const scratch = (int16_t *)align_tmp;

    fdct_col_sse2(block, scratch, 0);
    fdct_row_sse2(scratch, block);
}
569