ffmpeg / libavcodec / i386 / fdct_mmx.c @ bc0219fd

/*
 * MMX optimized forward DCT
 * The gcc porting is Copyright (c) 2001 Fabrice Bellard.
 *
 * from  fdctam32.c - AP922 MMX(3D-Now) forward-DCT
 *
 *  Intel Application Note AP-922 - fast, precise implementation of DCT
 *        http://developer.intel.com/vtune/cbts/appnotes.htm
 */
#include "../common.h"
#include "mmx.h" // movq_m2r()-style wrappers; m2r = memory to register, r2r = register to register, i2r = immediate to register, r2m = register to memory

#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))

//////////////////////////////////////////////////////////////////////
//
// constants for the forward DCT
// -----------------------------
//
// Be sure to check that your compiler is aligning all constants to QWORD
// (8-byte) memory boundaries!  Otherwise the unaligned memory access will
// severely stall MMX execution.
//
//////////////////////////////////////////////////////////////////////

#define BITS_FRW_ACC    3 //; 2 or 3 for accuracy
#define SHIFT_FRW_COL   BITS_FRW_ACC
#define SHIFT_FRW_ROW   (BITS_FRW_ACC + 17 - 3)
//#define RND_FRW_ROW   (262144 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_ROW-1)
#define RND_FRW_ROW     (1 << (SHIFT_FRW_ROW-1))
//#define RND_FRW_COL   (2 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_COL-1)
#define RND_FRW_COL     (1 << (SHIFT_FRW_COL-1))

//concatenated table, for forward DCT transformation
static const int16_t fdct_tg_all_16[] ATTR_ALIGN(8) = {
    13036, 13036, 13036, 13036,         // tan(pi/16) * (1 << 16) + 0.5
    27146, 27146, 27146, 27146,         // tan(2*pi/16) * (1 << 16) + 0.5
    -21746, -21746, -21746, -21746,     // tan(3*pi/16) * (1 << 16) + 0.5, wrapped to int16
};
static const int16_t cos_4_16[4] ATTR_ALIGN(8) = {
    -19195, -19195, -19195, -19195,     // cos(pi/4) * (1 << 16) + 0.5, wrapped to int16
};

static const int16_t ocos_4_16[4] ATTR_ALIGN(8) = {
    23170, 23170, 23170, 23170,         // cos(pi/4) * (1 << 15) + 0.5
};

static const long long fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL; // AP-922 "one" correction, OR-ed into intermediate results below
static const long fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW }; // row-pass rounding constant
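
/*
 * Where the fixed-point constants above come from - a minimal sketch, not
 * part of the original file.  tan(3*pi/16) and cos(pi/4) in Q16 exceed
 * INT16_MAX and rely on two's-complement wraparound; the column pass
 * compensates for the wrapped tan(3*pi/16) by adding the unscaled operand
 * back after each pmulhw (the paddsw that follows it).  Assumes a hosted
 * libc whose <math.h> exposes M_PI.
 */
#if 0
#include <math.h>
#include <stdio.h>
#include <stdint.h>
int main(void)
{
    for (int k = 1; k <= 3; k++) {
        int q16 = (int)(tan(k * M_PI / 16.0) * 65536.0 + 0.5);
        printf("tan(%d*pi/16): Q16 %6d -> int16 %6d\n", k, q16, (int16_t)q16);
    }
    int c16 = (int)(cos(M_PI / 4.0) * 65536.0 + 0.5);
    printf("cos(pi/4): Q16 %d -> int16 %d\n", c16, (int16_t)c16);
    printf("cos(pi/4): Q15 %d\n", (int)(cos(M_PI / 4.0) * 32768.0 + 0.5));
    return 0;
}
#endif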

static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = {  // forward_dct coeff table, 32 int16 per output row
  // row 0
  16384,   16384,   -8867,  -21407,
  16384,   16384,   21407,    8867,
  16384,  -16384,   21407,   -8867,
 -16384,   16384,    8867,  -21407,
  22725,   19266,  -22725,  -12873,
  12873,    4520,   19266,   -4520,
  12873,  -22725,   19266,  -22725,
   4520,   19266,    4520,  -12873,

  // row 1
  22725,   22725,  -12299,  -29692,
  22725,   22725,   29692,   12299,
  22725,  -22725,   29692,  -12299,
 -22725,   22725,   12299,  -29692,
  31521,   26722,  -31521,  -17855,
  17855,    6270,   26722,   -6270,
  17855,  -31521,   26722,  -31521,
   6270,   26722,    6270,  -17855,

  // row 2
  21407,   21407,  -11585,  -27969,
  21407,   21407,   27969,   11585,
  21407,  -21407,   27969,  -11585,
 -21407,   21407,   11585,  -27969,
  29692,   25172,  -29692,  -16819,
  16819,    5906,   25172,   -5906,
  16819,  -29692,   25172,  -29692,
   5906,   25172,    5906,  -16819,

  // row 3
  19266,   19266,  -10426,  -25172,
  19266,   19266,   25172,   10426,
  19266,  -19266,   25172,  -10426,
 -19266,   19266,   10426,  -25172,
  26722,   22654,  -26722,  -15137,
  15137,    5315,   22654,   -5315,
  15137,  -26722,   22654,  -26722,
   5315,   22654,    5315,  -15137,

  // row 4
  16384,   16384,   -8867,  -21407,
  16384,   16384,   21407,    8867,
  16384,  -16384,   21407,   -8867,
 -16384,   16384,    8867,  -21407,
  22725,   19266,  -22725,  -12873,
  12873,    4520,   19266,   -4520,
  12873,  -22725,   19266,  -22725,
   4520,   19266,    4520,  -12873,

  // row 5
  19266,   19266,  -10426,  -25172,
  19266,   19266,   25172,   10426,
  19266,  -19266,   25172,  -10426,
 -19266,   19266,   10426,  -25172,
  26722,   22654,  -26722,  -15137,
  15137,    5315,   22654,   -5315,
  15137,  -26722,   22654,  -26722,
   5315,   22654,    5315,  -15137,

  // row 6
  21407,   21407,  -11585,  -27969,
  21407,   21407,   27969,   11585,
  21407,  -21407,   27969,  -11585,
 -21407,   21407,   11585,  -27969,
  29692,   25172,  -29692,  -16819,
  16819,    5906,   25172,   -5906,
  16819,  -29692,   25172,  -29692,
   5906,   25172,    5906,  -16819,

  // row 7
  22725,   22725,  -12299,  -29692,
  22725,   22725,   29692,   12299,
  22725,  -22725,   29692,  -12299,
 -22725,   22725,   12299,  -29692,
  31521,   26722,  -31521,  -17855,
  17855,    6270,   26722,   -6270,
  17855,  -31521,   26722,  -31521,
   6270,   26722,    6270,  -17855,
};
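
/*
 * Minimal alignment self-check - a sketch, not part of the original file.
 * It verifies the QWORD-alignment requirement stated in the comment block
 * near the top; ATTR_ALIGN(8) should make every assertion hold.
 */
#if 0
#include <assert.h>
#include <stdint.h>
static void check_fdct_const_alignment(void)
{
    assert(((uintptr_t)fdct_tg_all_16   & 7) == 0);
    assert(((uintptr_t)cos_4_16         & 7) == 0);
    assert(((uintptr_t)ocos_4_16        & 7) == 0);
    assert(((uintptr_t)&fdct_one_corr   & 7) == 0);
    assert(((uintptr_t)fdct_r_row       & 7) == 0);
    assert(((uintptr_t)tab_frw_01234567 & 7) == 0);
}
#endif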

/*
 * One 1-D forward DCT pass over four columns at a time: each 16-bit lane of
 * an MMX register holds one column, so two calls (offset 0 and offset 4)
 * transform all eight columns of an 8x8 block.
 */
static always_inline void fdct_col(const int16_t *in, int16_t *out, int offset)
{
    movq_m2r(*(in + offset + 1 * 8), mm0);
    movq_m2r(*(in + offset + 6 * 8), mm1);
    movq_r2r(mm0, mm2);
    movq_m2r(*(in + offset + 2 * 8), mm3);
    paddsw_r2r(mm1, mm0);
    movq_m2r(*(in + offset + 5 * 8), mm4);
    psllw_i2r(SHIFT_FRW_COL, mm0);
    movq_m2r(*(in + offset + 0 * 8), mm5);
    paddsw_r2r(mm3, mm4);
    paddsw_m2r(*(in + offset + 7 * 8), mm5);
    psllw_i2r(SHIFT_FRW_COL, mm4);
    movq_r2r(mm0, mm6);
    psubsw_r2r(mm1, mm2);
    movq_m2r(*(fdct_tg_all_16 + 4), mm1);
    psubsw_r2r(mm4, mm0);
    movq_m2r(*(in + offset + 3 * 8), mm7);
    pmulhw_r2r(mm0, mm1);
    paddsw_m2r(*(in + offset + 4 * 8), mm7);
    psllw_i2r(SHIFT_FRW_COL, mm5);
    paddsw_r2r(mm4, mm6);
    psllw_i2r(SHIFT_FRW_COL, mm7);
    movq_r2r(mm5, mm4);
    psubsw_r2r(mm7, mm5);
    paddsw_r2r(mm5, mm1);
    paddsw_r2r(mm7, mm4);
    por_m2r(fdct_one_corr, mm1);
    psllw_i2r(SHIFT_FRW_COL + 1, mm2);
    pmulhw_m2r(*(fdct_tg_all_16 + 4), mm5);
    movq_r2r(mm4, mm7);
    psubsw_m2r(*(in + offset + 5 * 8), mm3);
    psubsw_r2r(mm6, mm4);
    movq_r2m(mm1, *(out + offset + 2 * 8));
    paddsw_r2r(mm6, mm7);
    movq_m2r(*(in + offset + 3 * 8), mm1);
    psllw_i2r(SHIFT_FRW_COL + 1, mm3);
    psubsw_m2r(*(in + offset + 4 * 8), mm1);
    movq_r2r(mm2, mm6);
    movq_r2m(mm4, *(out + offset + 4 * 8));
    paddsw_r2r(mm3, mm2);
    pmulhw_m2r(*ocos_4_16, mm2);
    psubsw_r2r(mm3, mm6);
    pmulhw_m2r(*ocos_4_16, mm6);
    psubsw_r2r(mm0, mm5);
    por_m2r(fdct_one_corr, mm5);
    psllw_i2r(SHIFT_FRW_COL, mm1);
    por_m2r(fdct_one_corr, mm2);
    movq_r2r(mm1, mm4);
    movq_m2r(*(in + offset + 0 * 8), mm3);
    paddsw_r2r(mm6, mm1);
    psubsw_m2r(*(in + offset + 7 * 8), mm3);
    psubsw_r2r(mm6, mm4);
    movq_m2r(*(fdct_tg_all_16 + 0), mm0);
    psllw_i2r(SHIFT_FRW_COL, mm3);
    movq_m2r(*(fdct_tg_all_16 + 8), mm6);
    pmulhw_r2r(mm1, mm0);
    movq_r2m(mm7, *(out + offset + 0 * 8));
    pmulhw_r2r(mm4, mm6);
    movq_r2m(mm5, *(out + offset + 6 * 8));
    movq_r2r(mm3, mm7);
    movq_m2r(*(fdct_tg_all_16 + 8), mm5);
    psubsw_r2r(mm2, mm7);
    paddsw_r2r(mm2, mm3);
    pmulhw_r2r(mm7, mm5);
    paddsw_r2r(mm3, mm0);
    paddsw_r2r(mm4, mm6);
    pmulhw_m2r(*(fdct_tg_all_16 + 0), mm3);
    por_m2r(fdct_one_corr, mm0);
    paddsw_r2r(mm7, mm5);
    psubsw_r2r(mm6, mm7);
    movq_r2m(mm0, *(out + offset + 1 * 8));
    paddsw_r2r(mm4, mm5);
    movq_r2m(mm7, *(out + offset + 3 * 8));
    psubsw_r2r(mm1, mm3);
    movq_r2m(mm5, *(out + offset + 5 * 8));
    movq_r2m(mm3, *(out + offset + 7 * 8));
}
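
/*
 * Scalar reference for the column pass - a sketch, not part of the original
 * file and not bit-exact to the fixed-point code above (which scales by
 * SHIFT_FRW_COL and applies the fdct_one_corr bias).  It computes the
 * textbook orthonormal 8-point DCT-II of one column, for shape comparison
 * and debugging only.  Assumes <math.h>.
 */
#if 0
#include <math.h>
static void fdct_col_ref(const int16_t *in, double *out, int col)
{
    for (int u = 0; u < 8; u++) {
        double s = 0.0;
        for (int y = 0; y < 8; y++)
            s += in[y * 8 + col] * cos((2 * y + 1) * u * M_PI / 16.0);
        out[u * 8 + col] = 0.5 * (u ? 1.0 : M_SQRT1_2) * s;
    }
}
#endif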
204
205 99200bae Michael Niedermayer
static always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table)
206 1745173b Michael Niedermayer
{ 
207 cf3bf5bb Michael Niedermayer
    pshufw_m2r(*(in + 4), mm5, 0x1B);
208
    movq_m2r(*(in + 0), mm0);
209 99200bae Michael Niedermayer
    movq_r2r(mm0, mm1);        
210
    paddsw_r2r(mm5, mm0);
211
    psubsw_r2r(mm5, mm1);
212 6e0593e8 Michael Niedermayer
    pshufw_r2r(mm0, mm2, 0x4E);
213
    pshufw_r2r(mm1, mm3, 0x4E);
214
    movq_m2r(*(table + 0), mm4);
215
    movq_m2r(*(table + 4), mm6);
216
    movq_m2r(*(table + 16), mm5);
217 99200bae Michael Niedermayer
    movq_m2r(*(table + 20), mm7);
218 6e0593e8 Michael Niedermayer
    pmaddwd_r2r(mm0, mm4);
219
    pmaddwd_r2r(mm1, mm5);
220
    pmaddwd_r2r(mm2, mm6);
221
    pmaddwd_r2r(mm3, mm7);
222 99200bae Michael Niedermayer
    pmaddwd_m2r(*(table + 8), mm0);
223 6e0593e8 Michael Niedermayer
    pmaddwd_m2r(*(table + 12), mm2);
224
    pmaddwd_m2r(*(table + 24), mm1);
225
    pmaddwd_m2r(*(table + 28), mm3);
226
    paddd_r2r(mm6, mm4);
227
    paddd_r2r(mm7, mm5);
228
    paddd_r2r(mm2, mm0);
229
    paddd_r2r(mm3, mm1);
230 99200bae Michael Niedermayer
    movq_m2r(*fdct_r_row, mm7);
231 6e0593e8 Michael Niedermayer
    paddd_r2r(mm7, mm4);
232 99200bae Michael Niedermayer
    paddd_r2r(mm7, mm5);
233 6e0593e8 Michael Niedermayer
    paddd_r2r(mm7, mm0);
234
    paddd_r2r(mm7, mm1);
235
    psrad_i2r(SHIFT_FRW_ROW, mm4);
236 99200bae Michael Niedermayer
    psrad_i2r(SHIFT_FRW_ROW, mm5);
237 6e0593e8 Michael Niedermayer
    psrad_i2r(SHIFT_FRW_ROW, mm0);
238
    psrad_i2r(SHIFT_FRW_ROW, mm1);
239
    packssdw_r2r(mm0, mm4);
240
    packssdw_r2r(mm1, mm5);
241
    movq_r2r(mm4, mm2);
242
    punpcklwd_r2r(mm5, mm4);
243
    punpckhwd_r2r(mm5, mm2);
244
    movq_r2m(mm4, *(out + 0));
245
    movq_r2m(mm2, *(out + 4));
246 99200bae Michael Niedermayer
}
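
/*
 * What the front end of both row passes computes - a sketch, not part of
 * the original file, grounded in the pshufw/paddsw/psubsw sequence above
 * (and the equivalent punpck/psrlq sequence below): the row is folded into
 * four even sums and four odd differences before the pmaddwd dot products.
 */
#if 0
static void fdct_row_fold_ref(const int16_t *in, int16_t *even, int16_t *odd)
{
    for (int i = 0; i < 4; i++) {
        even[i] = in[i] + in[7 - i];
        odd[i]  = in[i] - in[7 - i];
    }
}
#endif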

/*
 * Row pass, baseline-MMX variant: builds the word reversal from
 * movd/punpcklwd/psrlq instead of a single pshufw.
 */
static always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
{
    movd_m2r(*(in + 6), mm1);
    punpcklwd_m2r(*(in + 4), mm1);
    movq_r2r(mm1, mm2);
    psrlq_i2r(0x20, mm1);
    movq_m2r(*(in + 0), mm0);
    punpcklwd_r2r(mm2, mm1);            // mm1 = in[7],in[6],in[5],in[4] (word-reversed)
    movq_r2r(mm0, mm5);
    paddsw_r2r(mm1, mm0);               // even part: in[i] + in[7-i]
    psubsw_r2r(mm1, mm5);               // odd part:  in[i] - in[7-i]
    movq_r2r(mm0, mm1);
    movq_r2r(mm5, mm6);
    punpckldq_r2r(mm5, mm3);
    punpckhdq_r2r(mm3, mm6);
    movq_m2r(*(table + 0), mm3);
    movq_m2r(*(table + 4), mm4);
    punpckldq_r2r(mm0, mm2);
    pmaddwd_r2r(mm0, mm3);
    punpckhdq_r2r(mm2, mm1);
    movq_m2r(*(table + 16), mm2);
    pmaddwd_r2r(mm1, mm4);
    pmaddwd_m2r(*(table + 8), mm0);
    movq_m2r(*(table + 20), mm7);
    pmaddwd_r2r(mm5, mm2);
    paddd_m2r(*fdct_r_row, mm3);
    pmaddwd_r2r(mm6, mm7);
    pmaddwd_m2r(*(table + 12), mm1);
    paddd_r2r(mm4, mm3);
    pmaddwd_m2r(*(table + 24), mm5);
    pmaddwd_m2r(*(table + 28), mm6);
    paddd_r2r(mm7, mm2);
    paddd_m2r(*fdct_r_row, mm0);
    psrad_i2r(SHIFT_FRW_ROW, mm3);
    paddd_m2r(*fdct_r_row, mm2);
    paddd_r2r(mm1, mm0);
    paddd_m2r(*fdct_r_row, mm5);
    psrad_i2r(SHIFT_FRW_ROW, mm2);
    paddd_r2r(mm6, mm5);
    psrad_i2r(SHIFT_FRW_ROW, mm0);
    psrad_i2r(SHIFT_FRW_ROW, mm5);
    packssdw_r2r(mm0, mm3);
    packssdw_r2r(mm5, mm2);
    movq_r2r(mm3, mm6);
    punpcklwd_r2r(mm2, mm3);
    punpckhwd_r2r(mm2, mm6);
    movq_r2m(mm3, *(out + 0));
    movq_r2m(mm6, *(out + 4));
}
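
/*
 * Sketch of the row-pass arithmetic - an assumption reconstructed from the
 * pmaddwd/paddd/psrad/packssdw sequences in both row routines, not part of
 * the original file: each output is a rounded 4-term fixed-point dot
 * product of the folded row against one table column (packssdw additionally
 * saturates, which this plain cast does not).
 */
#if 0
static int16_t row_dot_ref(const int16_t v[4], const int16_t c[4])
{
    int32_t acc = 0;
    for (int i = 0; i < 4; i++)
        acc += v[i] * c[i];                         /* pmaddwd + paddd */
    return (int16_t)((acc + RND_FRW_ROW) >> SHIFT_FRW_ROW); /* paddd, psrad */
}
#endif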

void ff_fdct_mmx(int16_t *block)
{
    int64_t align_tmp[16] ATTR_ALIGN(8);
    int16_t * const block_tmp= (int16_t*)align_tmp;
    int16_t *block1, *out;
    const int16_t *table;
    int i;

    /* column pass: columns 0-3, then 4-7, into the aligned temp block */
    block1 = block_tmp;
    fdct_col(block, block1, 0);
    fdct_col(block, block1, 4);

    /* row pass: one 32-coefficient sub-table per row, back into block */
    block1 = block_tmp;
    table = tab_frw_01234567;
    out = block;
    for(i=8;i>0;i--) {
        fdct_row_mmx(block1, out, table);
        block1 += 8;
        table += 32;
        out += 8;
    }
}

void ff_fdct_mmx2(int16_t *block)
{
    int64_t align_tmp[16] ATTR_ALIGN(8);
    int16_t * const block_tmp= (int16_t*)align_tmp;
    int16_t *block1, *out;
    const int16_t *table;
    int i;

    /* column pass: columns 0-3, then 4-7, into the aligned temp block */
    block1 = block_tmp;
    fdct_col(block, block1, 0);
    fdct_col(block, block1, 4);

    /* row pass: one 32-coefficient sub-table per row, back into block */
    block1 = block_tmp;
    table = tab_frw_01234567;
    out = block;
    for(i=8;i>0;i--) {
        fdct_row_mmx2(block1, out, table);
        block1 += 8;
        table += 32;
        out += 8;
    }
}
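
/*
 * Usage sketch - a hypothetical caller, not part of the original file.
 * The transform runs in place on a 64-entry row-major 8x8 int16_t block
 * and must only be called on CPUs with MMX (ff_fdct_mmx) or
 * integer-SSE/"MMX2" (ff_fdct_mmx2); the caller is responsible for
 * clearing MMX state before any FPU use.
 */
#if 0
#include <string.h>
static void fdct_usage_example(const int16_t src[64])
{
    int16_t block[64] ATTR_ALIGN(8);
    memcpy(block, src, sizeof(block)); /* samples or prediction residuals */
    ff_fdct_mmx(block);                /* block now holds the scaled 8x8 forward DCT */
    emms();                            /* emms() wrapper comes from mmx.h */
}
#endif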