Statistics
| Branch: | Revision:

ffmpeg / libavcodec / i386 / fdct_mmx.c @ 755bfeab

History | View | Annotate | Download (15.9 KB)

/*
 * MMX optimized forward DCT
 * The gcc porting is Copyright (c) 2001 Fabrice Bellard.
 * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 * SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
 *
 * from  fdctam32.c - AP922 MMX(3D-Now) forward-DCT
 *
 *  Intel Application Note AP-922 - fast, precise implementation of DCT
 *        http://developer.intel.com/vtune/cbts/appnotes.htm
 *
 * Also of inspiration:
 * a page about fdct at http://www.geocities.com/ssavekar/dct.htm
 * Skal's fdct at http://skal.planet-d.net/coding/dct.html
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "common.h"
33
#include "dsputil.h"
34
#include "mmx.h"
35

    
36
// Request `align`-byte alignment for the object it is attached to (GCC syntax).
#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))

//////////////////////////////////////////////////////////////////////
//
// constants for the forward DCT
// -----------------------------
//
// Be sure to check that your compiler is aligning all constants to QWORD
// (8-byte) memory boundaries!  Otherwise the unaligned memory access will
// severely stall MMX execution.
//
// Tables that the SSE2 code reads with 128-bit memory operands are
// declared with 16-byte alignment instead.
//
//////////////////////////////////////////////////////////////////////

#define BITS_FRW_ACC   3 //; 2 or 3 for accuracy
// Left shift applied to the inputs of the column pass.
#define SHIFT_FRW_COL  BITS_FRW_ACC
// Right shift applied to the 32-bit sums of the row pass (17 with BITS_FRW_ACC == 3).
#define SHIFT_FRW_ROW  (BITS_FRW_ACC + 17 - 3)
// Rounding term added before the row-pass shift: half of 1 << SHIFT_FRW_ROW.
#define RND_FRW_ROW    (1 << (SHIFT_FRW_ROW-1))
//#define RND_FRW_COL    (1 << (SHIFT_FRW_COL-1))

// Repeat a value eight times (one value per 16-bit lane of a 128-bit register).
#define X8(x) x,x,x,x,x,x,x,x

//concatenated table, for forward DCT transformation
// Three groups of eight identical words; the column pass addresses them as
// fdct_tg_all_16 + 0, + 8 and + 16.
static const int16_t fdct_tg_all_16[24] ATTR_ALIGN(16) = {
    X8(13036),  //  tan(1*pi/16)      * (1<<16) + 0.5
    X8(27146),  //  tan(2*pi/16)      * (1<<16) + 0.5
    X8(-21746)  // (tan(3*pi/16) - 1) * (1<<16) + 0.5
};

static const int16_t ocos_4_16[8] ATTR_ALIGN(16) = {
    X8(23170)   // cos(pi/4) * (1<<15) + 0.5
};

// All-ones-in-bit-0 mask; the column pass ORs it into several outputs.
static const int16_t fdct_one_corr[8] ATTR_ALIGN(16) = { X8(1) };

// Row-pass rounding term, one copy per 32-bit lane of an MMX register.
static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };

// Row-pass rounding term, one copy per 32-bit lane of an XMM register.
// NOTE(review): the array is wrapped in a struct, presumably to force the
// requested 16-byte alignment on some compilers -- confirm before simplifying.
// The plain-array form it replaces is kept below for reference.
static struct
{
    const int32_t fdct_r_row_sse2[4] ATTR_ALIGN(16);
} fdct_r_row_sse2 ATTR_ALIGN(16)=
{{
    RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
}};
//static const long fdct_r_row_sse2[4] ATTR_ALIGN(16) = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};

// One 32-entry group of the MMX row-pass coefficient table, written in terms
// of a coefficient set c1..c7.  Every group of the table follows this layout;
// only the coefficient set changes.  The trailing comma lets invocations be
// concatenated inside an initializer.
#define FDCT_TAB_ROW_MMX(c1, c2, c3, c4, c5, c6, c7) \
     c4,  c4,  c1,  c3, \
     c4,  c4,  c5,  c7, \
     c2,  c6,  c3, -c7, \
    -c6, -c2, -c1, -c5, \
     c4, -c4,  c5, -c1, \
    -c4,  c4,  c7,  c3, \
     c6, -c2,  c7, -c5, \
     c2, -c6,  c3, -c1,

// forward_dct coeff table
// Eight groups of 32 words, one per row of the 8x8 block: ff_fdct_mmx() and
// ff_fdct_mmx2() advance the table pointer by 32 for each row.  Rows 4..7 use
// the same coefficient sets as rows 0, 3, 2 and 1 respectively.
static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = {
    FDCT_TAB_ROW_MMX(22725, 21407, 19266, 16384, 12873,  8867, 4520)  // row 0
    FDCT_TAB_ROW_MMX(31521, 29692, 26722, 22725, 17855, 12299, 6270)  // row 1
    FDCT_TAB_ROW_MMX(29692, 27969, 25172, 21407, 16819, 11585, 5906)  // row 2
    FDCT_TAB_ROW_MMX(26722, 25172, 22654, 19266, 15137, 10426, 5315)  // row 3
    FDCT_TAB_ROW_MMX(22725, 21407, 19266, 16384, 12873,  8867, 4520)  // row 4
    FDCT_TAB_ROW_MMX(26722, 25172, 22654, 19266, 15137, 10426, 5315)  // row 5
    FDCT_TAB_ROW_MMX(29692, 27969, 25172, 21407, 16819, 11585, 5906)  // row 6
    FDCT_TAB_ROW_MMX(31521, 29692, 26722, 22725, 17855, 12299, 6270)  // row 7
};
#undef FDCT_TAB_ROW_MMX

static struct
156
{
157
 const int16_t tab_frw_01234567_sse2[256] ATTR_ALIGN(16);
158
} tab_frw_01234567_sse2 ATTR_ALIGN(16) =
159
{{
160
//static const int16_t tab_frw_01234567_sse2[] ATTR_ALIGN(16) = {  // forward_dct coeff table
161
#define TABLE_SSE2 C4,  C4,  C1,  C3, -C6, -C2, -C1, -C5, \
162
                   C4,  C4,  C5,  C7,  C2,  C6,  C3, -C7, \
163
                  -C4,  C4,  C7,  C3,  C6, -C2,  C7, -C5, \
164
                   C4, -C4,  C5, -C1,  C2, -C6,  C3, -C1,
165
// c1..c7 * cos(pi/4) * 2^15
166
#define C1 22725
167
#define C2 21407
168
#define C3 19266
169
#define C4 16384
170
#define C5 12873
171
#define C6 8867
172
#define C7 4520
173
TABLE_SSE2
174

    
175
#undef C1
176
#undef C2
177
#undef C3
178
#undef C4
179
#undef C5
180
#undef C6
181
#undef C7
182
#define C1 31521
183
#define C2 29692
184
#define C3 26722
185
#define C4 22725
186
#define C5 17855
187
#define C6 12299
188
#define C7 6270
189
TABLE_SSE2
190

    
191
#undef C1
192
#undef C2
193
#undef C3
194
#undef C4
195
#undef C5
196
#undef C6
197
#undef C7
198
#define C1 29692
199
#define C2 27969
200
#define C3 25172
201
#define C4 21407
202
#define C5 16819
203
#define C6 11585
204
#define C7 5906
205
TABLE_SSE2
206

    
207
#undef C1
208
#undef C2
209
#undef C3
210
#undef C4
211
#undef C5
212
#undef C6
213
#undef C7
214
#define C1 26722
215
#define C2 25172
216
#define C3 22654
217
#define C4 19266
218
#define C5 15137
219
#define C6 10426
220
#define C7 5315
221
TABLE_SSE2
222

    
223
#undef C1
224
#undef C2
225
#undef C3
226
#undef C4
227
#undef C5
228
#undef C6
229
#undef C7
230
#define C1 22725
231
#define C2 21407
232
#define C3 19266
233
#define C4 16384
234
#define C5 12873
235
#define C6 8867
236
#define C7 4520
237
TABLE_SSE2
238

    
239
#undef C1
240
#undef C2
241
#undef C3
242
#undef C4
243
#undef C5
244
#undef C6
245
#undef C7
246
#define C1 26722
247
#define C2 25172
248
#define C3 22654
249
#define C4 19266
250
#define C5 15137
251
#define C6 10426
252
#define C7 5315
253
TABLE_SSE2
254

    
255
#undef C1
256
#undef C2
257
#undef C3
258
#undef C4
259
#undef C5
260
#undef C6
261
#undef C7
262
#define C1 29692
263
#define C2 27969
264
#define C3 25172
265
#define C4 21407
266
#define C5 16819
267
#define C6 11585
268
#define C7 5906
269
TABLE_SSE2
270

    
271
#undef C1
272
#undef C2
273
#undef C3
274
#undef C4
275
#undef C5
276
#undef C6
277
#undef C7
278
#define C1 31521
279
#define C2 29692
280
#define C3 26722
281
#define C4 22725
282
#define C5 17855
283
#define C6 12299
284
#define C7 6270
285
TABLE_SSE2
286
}};
287

    
288
#define FDCT_COL(cpu, mm, mov)\
289
static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\
290
{\
291
    mov##_m2r(*(in + offset + 1 * 8), mm##0);\
292
    mov##_m2r(*(in + offset + 6 * 8), mm##1);\
293
    mov##_r2r(mm##0, mm##2);\
294
    mov##_m2r(*(in + offset + 2 * 8), mm##3);\
295
    paddsw_r2r(mm##1, mm##0);\
296
    mov##_m2r(*(in + offset + 5 * 8), mm##4);\
297
    psllw_i2r(SHIFT_FRW_COL, mm##0);\
298
    mov##_m2r(*(in + offset + 0 * 8), mm##5);\
299
    paddsw_r2r(mm##3, mm##4);\
300
    paddsw_m2r(*(in + offset + 7 * 8), mm##5);\
301
    psllw_i2r(SHIFT_FRW_COL, mm##4);\
302
    mov##_r2r(mm##0, mm##6);\
303
    psubsw_r2r(mm##1, mm##2);\
304
    mov##_m2r(*(fdct_tg_all_16 + 8), mm##1);\
305
    psubsw_r2r(mm##4, mm##0);\
306
    mov##_m2r(*(in + offset + 3 * 8), mm##7);\
307
    pmulhw_r2r(mm##0, mm##1);\
308
    paddsw_m2r(*(in + offset + 4 * 8), mm##7);\
309
    psllw_i2r(SHIFT_FRW_COL, mm##5);\
310
    paddsw_r2r(mm##4, mm##6);\
311
    psllw_i2r(SHIFT_FRW_COL, mm##7);\
312
    mov##_r2r(mm##5, mm##4);\
313
    psubsw_r2r(mm##7, mm##5);\
314
    paddsw_r2r(mm##5, mm##1);\
315
    paddsw_r2r(mm##7, mm##4);\
316
    por_m2r(*fdct_one_corr, mm##1);\
317
    psllw_i2r(SHIFT_FRW_COL + 1, mm##2);\
318
    pmulhw_m2r(*(fdct_tg_all_16 + 8), mm##5);\
319
    mov##_r2r(mm##4, mm##7);\
320
    psubsw_m2r(*(in + offset + 5 * 8), mm##3);\
321
    psubsw_r2r(mm##6, mm##4);\
322
    mov##_r2m(mm##1, *(out + offset + 2 * 8));\
323
    paddsw_r2r(mm##6, mm##7);\
324
    mov##_m2r(*(in + offset + 3 * 8), mm##1);\
325
    psllw_i2r(SHIFT_FRW_COL + 1, mm##3);\
326
    psubsw_m2r(*(in + offset + 4 * 8), mm##1);\
327
    mov##_r2r(mm##2, mm##6);\
328
    mov##_r2m(mm##4, *(out + offset + 4 * 8));\
329
    paddsw_r2r(mm##3, mm##2);\
330
    pmulhw_m2r(*ocos_4_16, mm##2);\
331
    psubsw_r2r(mm##3, mm##6);\
332
    pmulhw_m2r(*ocos_4_16, mm##6);\
333
    psubsw_r2r(mm##0, mm##5);\
334
    por_m2r(*fdct_one_corr, mm##5);\
335
    psllw_i2r(SHIFT_FRW_COL, mm##1);\
336
    por_m2r(*fdct_one_corr, mm##2);\
337
    mov##_r2r(mm##1, mm##4);\
338
    mov##_m2r(*(in + offset + 0 * 8), mm##3);\
339
    paddsw_r2r(mm##6, mm##1);\
340
    psubsw_m2r(*(in + offset + 7 * 8), mm##3);\
341
    psubsw_r2r(mm##6, mm##4);\
342
    mov##_m2r(*(fdct_tg_all_16 + 0), mm##0);\
343
    psllw_i2r(SHIFT_FRW_COL, mm##3);\
344
    mov##_m2r(*(fdct_tg_all_16 + 16), mm##6);\
345
    pmulhw_r2r(mm##1, mm##0);\
346
    mov##_r2m(mm##7, *(out + offset + 0 * 8));\
347
    pmulhw_r2r(mm##4, mm##6);\
348
    mov##_r2m(mm##5, *(out + offset + 6 * 8));\
349
    mov##_r2r(mm##3, mm##7);\
350
    mov##_m2r(*(fdct_tg_all_16 + 16), mm##5);\
351
    psubsw_r2r(mm##2, mm##7);\
352
    paddsw_r2r(mm##2, mm##3);\
353
    pmulhw_r2r(mm##7, mm##5);\
354
    paddsw_r2r(mm##3, mm##0);\
355
    paddsw_r2r(mm##4, mm##6);\
356
    pmulhw_m2r(*(fdct_tg_all_16 + 0), mm##3);\
357
    por_m2r(*fdct_one_corr, mm##0);\
358
    paddsw_r2r(mm##7, mm##5);\
359
    psubsw_r2r(mm##6, mm##7);\
360
    mov##_r2m(mm##0, *(out + offset + 1 * 8));\
361
    paddsw_r2r(mm##4, mm##5);\
362
    mov##_r2m(mm##7, *(out + offset + 3 * 8));\
363
    psubsw_r2r(mm##1, mm##3);\
364
    mov##_r2m(mm##5, *(out + offset + 5 * 8));\
365
    mov##_r2m(mm##3, *(out + offset + 7 * 8));\
366
}
367

    
368
FDCT_COL(mmx, mm, movq)
369
FDCT_COL(sse2, xmm, movdqa)
370

    
371
static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
372
{
373
    asm volatile(
374
#define FDCT_ROW_SSE2_H1(i,t)                    \
375
        "movq      " #i "(%0), %%xmm2      \n\t" \
376
        "movq      " #i "+8(%0), %%xmm0    \n\t" \
377
        "movdqa    " #t "+32(%1), %%xmm3   \n\t" \
378
        "movdqa    " #t "+48(%1), %%xmm7   \n\t" \
379
        "movdqa    " #t "(%1), %%xmm4      \n\t" \
380
        "movdqa    " #t "+16(%1), %%xmm5   \n\t"
381

    
382
#define FDCT_ROW_SSE2_H2(i,t)                    \
383
        "movq      " #i "(%0), %%xmm2      \n\t" \
384
        "movq      " #i "+8(%0), %%xmm0    \n\t" \
385
        "movdqa    " #t "+32(%1), %%xmm3   \n\t" \
386
        "movdqa    " #t "+48(%1), %%xmm7   \n\t"
387

    
388
#define FDCT_ROW_SSE2(i)                      \
389
        "movq      %%xmm2, %%xmm1       \n\t" \
390
        "pshuflw   $27, %%xmm0, %%xmm0  \n\t" \
391
        "paddsw    %%xmm0, %%xmm1       \n\t" \
392
        "psubsw    %%xmm0, %%xmm2       \n\t" \
393
        "punpckldq %%xmm2, %%xmm1       \n\t" \
394
        "pshufd    $78, %%xmm1, %%xmm2  \n\t" \
395
        "pmaddwd   %%xmm2, %%xmm3       \n\t" \
396
        "pmaddwd   %%xmm1, %%xmm7       \n\t" \
397
        "pmaddwd   %%xmm5, %%xmm2       \n\t" \
398
        "pmaddwd   %%xmm4, %%xmm1       \n\t" \
399
        "paddd     %%xmm7, %%xmm3       \n\t" \
400
        "paddd     %%xmm2, %%xmm1       \n\t" \
401
        "paddd     %%xmm6, %%xmm3       \n\t" \
402
        "paddd     %%xmm6, %%xmm1       \n\t" \
403
        "psrad     %3, %%xmm3           \n\t" \
404
        "psrad     %3, %%xmm1           \n\t" \
405
        "packssdw  %%xmm3, %%xmm1       \n\t" \
406
        "movdqa    %%xmm1, " #i "(%4)   \n\t"
407

    
408
        "movdqa    (%2), %%xmm6         \n\t"
409
        FDCT_ROW_SSE2_H1(0,0)
410
        FDCT_ROW_SSE2(0)
411
        FDCT_ROW_SSE2_H2(64,0)
412
        FDCT_ROW_SSE2(64)
413

    
414
        FDCT_ROW_SSE2_H1(16,64)
415
        FDCT_ROW_SSE2(16)
416
        FDCT_ROW_SSE2_H2(112,64)
417
        FDCT_ROW_SSE2(112)
418

    
419
        FDCT_ROW_SSE2_H1(32,128)
420
        FDCT_ROW_SSE2(32)
421
        FDCT_ROW_SSE2_H2(96,128)
422
        FDCT_ROW_SSE2(96)
423

    
424
        FDCT_ROW_SSE2_H1(48,192)
425
        FDCT_ROW_SSE2(48)
426
        FDCT_ROW_SSE2_H2(80,192)
427
        FDCT_ROW_SSE2(80)
428
        :
429
        : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
430
    );
431
}
432

    
433
static av_always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table)
434
{
435
    pshufw_m2r(*(in + 4), mm5, 0x1B);
436
    movq_m2r(*(in + 0), mm0);
437
    movq_r2r(mm0, mm1);
438
    paddsw_r2r(mm5, mm0);
439
    psubsw_r2r(mm5, mm1);
440
    movq_r2r(mm0, mm2);
441
    punpckldq_r2r(mm1, mm0);
442
    punpckhdq_r2r(mm1, mm2);
443
    movq_m2r(*(table + 0), mm1);
444
    movq_m2r(*(table + 4), mm3);
445
    movq_m2r(*(table + 8), mm4);
446
    movq_m2r(*(table + 12), mm5);
447
    movq_m2r(*(table + 16), mm6);
448
    movq_m2r(*(table + 20), mm7);
449
    pmaddwd_r2r(mm0, mm1);
450
    pmaddwd_r2r(mm2, mm3);
451
    pmaddwd_r2r(mm0, mm4);
452
    pmaddwd_r2r(mm2, mm5);
453
    pmaddwd_r2r(mm0, mm6);
454
    pmaddwd_r2r(mm2, mm7);
455
    pmaddwd_m2r(*(table + 24), mm0);
456
    pmaddwd_m2r(*(table + 28), mm2);
457
    paddd_r2r(mm1, mm3);
458
    paddd_r2r(mm4, mm5);
459
    paddd_r2r(mm6, mm7);
460
    paddd_r2r(mm0, mm2);
461
    movq_m2r(*fdct_r_row, mm0);
462
    paddd_r2r(mm0, mm3);
463
    paddd_r2r(mm0, mm5);
464
    paddd_r2r(mm0, mm7);
465
    paddd_r2r(mm0, mm2);
466
    psrad_i2r(SHIFT_FRW_ROW, mm3);
467
    psrad_i2r(SHIFT_FRW_ROW, mm5);
468
    psrad_i2r(SHIFT_FRW_ROW, mm7);
469
    psrad_i2r(SHIFT_FRW_ROW, mm2);
470
    packssdw_r2r(mm5, mm3);
471
    packssdw_r2r(mm2, mm7);
472
    movq_r2m(mm3, *(out + 0));
473
    movq_r2m(mm7, *(out + 4));
474
}
475

    
476
static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
477
{
478
//FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...)
479
    movd_m2r(*(in + 6), mm1);
480
    punpcklwd_m2r(*(in + 4), mm1);
481
    movq_r2r(mm1, mm2);
482
    psrlq_i2r(0x20, mm1);
483
    movq_m2r(*(in + 0), mm0);
484
    punpcklwd_r2r(mm2, mm1);
485
    movq_r2r(mm0, mm5);
486
    paddsw_r2r(mm1, mm0);
487
    psubsw_r2r(mm1, mm5);
488
    movq_r2r(mm0, mm2);
489
    punpckldq_r2r(mm5, mm0);
490
    punpckhdq_r2r(mm5, mm2);
491
    movq_m2r(*(table + 0), mm1);
492
    movq_m2r(*(table + 4), mm3);
493
    movq_m2r(*(table + 8), mm4);
494
    movq_m2r(*(table + 12), mm5);
495
    movq_m2r(*(table + 16), mm6);
496
    movq_m2r(*(table + 20), mm7);
497
    pmaddwd_r2r(mm0, mm1);
498
    pmaddwd_r2r(mm2, mm3);
499
    pmaddwd_r2r(mm0, mm4);
500
    pmaddwd_r2r(mm2, mm5);
501
    pmaddwd_r2r(mm0, mm6);
502
    pmaddwd_r2r(mm2, mm7);
503
    pmaddwd_m2r(*(table + 24), mm0);
504
    pmaddwd_m2r(*(table + 28), mm2);
505
    paddd_r2r(mm1, mm3);
506
    paddd_r2r(mm4, mm5);
507
    paddd_r2r(mm6, mm7);
508
    paddd_r2r(mm0, mm2);
509
    movq_m2r(*fdct_r_row, mm0);
510
    paddd_r2r(mm0, mm3);
511
    paddd_r2r(mm0, mm5);
512
    paddd_r2r(mm0, mm7);
513
    paddd_r2r(mm0, mm2);
514
    psrad_i2r(SHIFT_FRW_ROW, mm3);
515
    psrad_i2r(SHIFT_FRW_ROW, mm5);
516
    psrad_i2r(SHIFT_FRW_ROW, mm7);
517
    psrad_i2r(SHIFT_FRW_ROW, mm2);
518
    packssdw_r2r(mm5, mm3);
519
    packssdw_r2r(mm2, mm7);
520
    movq_r2m(mm3, *(out + 0));
521
    movq_r2m(mm7, *(out + 4));
522
}
523

    
524
void ff_fdct_mmx(int16_t *block)
525
{
526
    int64_t align_tmp[16] ATTR_ALIGN(8);
527
    int16_t * block1= (int16_t*)align_tmp;
528
    const int16_t *table= tab_frw_01234567;
529
    int i;
530

    
531
    fdct_col_mmx(block, block1, 0);
532
    fdct_col_mmx(block, block1, 4);
533

    
534
    for(i=8;i>0;i--) {
535
        fdct_row_mmx(block1, block, table);
536
        block1 += 8;
537
        table += 32;
538
        block += 8;
539
    }
540
}
541

    
542
void ff_fdct_mmx2(int16_t *block)
543
{
544
    int64_t align_tmp[16] ATTR_ALIGN(8);
545
    int16_t *block1= (int16_t*)align_tmp;
546
    const int16_t *table= tab_frw_01234567;
547
    int i;
548

    
549
    fdct_col_mmx(block, block1, 0);
550
    fdct_col_mmx(block, block1, 4);
551

    
552
    for(i=8;i>0;i--) {
553
        fdct_row_mmx2(block1, block, table);
554
        block1 += 8;
555
        table += 32;
556
        block += 8;
557
    }
558
}
559

    
560
void ff_fdct_sse2(int16_t *block)
561
{
562
    int64_t align_tmp[16] ATTR_ALIGN(16);
563
    int16_t * const block1= (int16_t*)align_tmp;
564

    
565
    fdct_col_sse2(block, block1, 0);
566
    fdct_row_sse2(block1, block);
567
}
568