Statistics
| Branch: | Revision:

ffmpeg / libavcodec / i386 / fdct_mmx.c @ 8f2ab833

History | View | Annotate | Download (10.5 KB)

1
/*
2
 * MMX optimized forward DCT
3
 * The gcc porting is Copyright (c) 2001 Fabrice Bellard.
4
 * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * from  fdctam32.c - AP922 MMX(3D-Now) forward-DCT
7
 * 
8
 *  Intel Application Note AP-922 - fast, precise implementation of DCT
9
 *        http://developer.intel.com/vtune/cbts/appnotes.htm
10
 */
11
#include "../common.h"
12
#include "mmx.h"
13

    
14
/* Request `align`-byte alignment for the declared object (GCC attribute syntax). */
#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))
15

    
16
//////////////////////////////////////////////////////////////////////
17
//
18
// constants for the forward DCT
19
// -----------------------------
20
//
21
// Be sure to check that your compiler is aligning all constants to QWORD
22
// (8-byte) memory boundaries!  Otherwise the unaligned memory access will
23
// severely stall MMX execution.
24
//
25
//////////////////////////////////////////////////////////////////////
26

    
27
/* Extra fractional bits carried between the column and row passes (2 or 3). */
#define BITS_FRW_ACC    3

/* Left shift applied to the inputs of the column pass. */
#define SHIFT_FRW_COL   (BITS_FRW_ACC)

/* Right shift applied to the 32-bit accumulators of the row pass. */
#define SHIFT_FRW_ROW   (BITS_FRW_ACC + 17 - 3)

/* Half of one output unit for each shift, i.e. 1 << (shift - 1). */
#define RND_FRW_ROW     (1 << (SHIFT_FRW_ROW - 1))
#define RND_FRW_COL     (1 << (SHIFT_FRW_COL - 1))
34

    
35
//concatenated table, for forward DCT transformation
36
/*
 * Tangent multipliers for the column pass, scaled by 1 << 16 and rounded.
 * Each value is replicated over the four 16-bit lanes of one quadword:
 *
 *   [0..3]    tan(1 * pi / 16)
 *   [4..7]    tan(2 * pi / 16)
 *   [8..11]   tan(3 * pi / 16) - 1
 *
 * The last row is stored minus one because tan(3*pi/16) * 65536 does not
 * fit in int16_t; fdct_col adds the multiplicand back after each pmulhw
 * that uses this row.
 */
static const int16_t fdct_tg_all_16[3 * 4] ATTR_ALIGN(8) = {
     13036,  13036,  13036,  13036,
     27146,  27146,  27146,  27146,
    -21746, -21746, -21746, -21746,
};
41
/*
 * (cos(pi / 4) - 1) * (1 << 16), rounded, in all four 16-bit lanes.
 * Nothing in the visible part of this file refers to this table.
 */
static const int16_t cos_4_16[4] ATTR_ALIGN(8) = {
    -19195, -19195, -19195, -19195,
};
44

    
45
/*
 * cos(pi / 4) * (1 << 15) in all four 16-bit lanes. fdct_col multiplies
 * with pmulhw, which keeps the high 16 bits of the product, after shifting
 * the multiplicand one bit further left than the other column inputs.
 */
static const int16_t ocos_4_16[4] ATTR_ALIGN(8) = {
    23170, 23170, 23170, 23170,
};
48

    
49
/*
 * 0x0001 in each of the four 16-bit lanes. fdct_col ORs this into several
 * of its results, which sets the least significant bit of every lane.
 */
static const long long  fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;

/*
 * Rounding bias for the row pass, one copy per 32-bit lane.
 *
 * The row kernels fetch this object with a single 64-bit MMX access and add
 * it lane-wise with paddd, so it must consist of exactly two 32-bit
 * elements. It must therefore be int32_t, not long: where long is 64 bits
 * wide (LP64), a long[2] array would put {RND_FRW_ROW, 0} in the first
 * quadword and the upper lane would get no rounding at all.
 */
static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };
51

    
52
/*
 * Coefficient table for the row pass: eight groups of 32 int16_t, consumed
 * one group per row by ff_fdct_mmx / ff_fdct_mmx2 (the table pointer moves
 * on by 32 after every row).
 *
 * Every group uses the same arrangement of seven constants, so the groups
 * are generated from a single pattern. Within a group, entries 0..15 are
 * multiplied against the sums of mirrored inputs and entries 16..31 against
 * the differences (see the row kernels). Groups 4..7 repeat groups
 * 0, 3, 2, 1.
 *
 * The parameter names follow the usual DCT cosine indices; presumably c<k>
 * is proportional to cos(k * pi / 16) with a per-row scale -- confirm
 * against Intel AP-922 before relying on that.
 */
#define FDCT_ROW_COEFFS(c4, c2, c6, c1, c3, c5, c7) \
     (c4),  (c4), -(c6), -(c2),                     \
     (c4),  (c4),  (c2),  (c6),                     \
     (c4), -(c4),  (c2), -(c6),                     \
    -(c4),  (c4),  (c6), -(c2),                     \
     (c1),  (c3), -(c1), -(c5),                     \
     (c5),  (c7),  (c3), -(c7),                     \
     (c5), -(c1),  (c3), -(c1),                     \
     (c7),  (c3),  (c7), -(c5)

static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = {  // forward_dct coeff table
    FDCT_ROW_COEFFS(16384, 21407,  8867, 22725, 19266, 12873, 4520),  /* row 0 */
    FDCT_ROW_COEFFS(22725, 29692, 12299, 31521, 26722, 17855, 6270),  /* row 1 */
    FDCT_ROW_COEFFS(21407, 27969, 11585, 29692, 25172, 16819, 5906),  /* row 2 */
    FDCT_ROW_COEFFS(19266, 25172, 10426, 26722, 22654, 15137, 5315),  /* row 3 */
    FDCT_ROW_COEFFS(16384, 21407,  8867, 22725, 19266, 12873, 4520),  /* row 4 */
    FDCT_ROW_COEFFS(19266, 25172, 10426, 26722, 22654, 15137, 5315),  /* row 5 */
    FDCT_ROW_COEFFS(21407, 27969, 11585, 29692, 25172, 16819, 5906),  /* row 6 */
    FDCT_ROW_COEFFS(22725, 29692, 12299, 31521, 26722, 17855, 6270),  /* row 7 */
};

#undef FDCT_ROW_COEFFS
125

    
126

    
127
static always_inline void fdct_col(const int16_t *in, int16_t *out, int offset)
128
{
129
    movq_m2r(*(in + offset + 1 * 8), mm0);
130
    movq_m2r(*(in + offset + 6 * 8), mm1);
131
    movq_r2r(mm0, mm2);
132
    movq_m2r(*(in + offset + 2 * 8), mm3);
133
    paddsw_r2r(mm1, mm0);
134
    movq_m2r(*(in + offset + 5 * 8), mm4);
135
    psllw_i2r(SHIFT_FRW_COL, mm0);
136
    movq_m2r(*(in + offset + 0 * 8), mm5);
137
    paddsw_r2r(mm3, mm4);
138
    paddsw_m2r(*(in + offset + 7 * 8), mm5);
139
    psllw_i2r(SHIFT_FRW_COL, mm4);
140
    movq_r2r(mm0, mm6);
141
    psubsw_r2r(mm1, mm2);
142
    movq_m2r(*(fdct_tg_all_16 + 4), mm1);
143
    psubsw_r2r(mm4, mm0);
144
    movq_m2r(*(in + offset + 3 * 8), mm7);
145
    pmulhw_r2r(mm0, mm1);
146
    paddsw_m2r(*(in + offset + 4 * 8), mm7);
147
    psllw_i2r(SHIFT_FRW_COL, mm5);
148
    paddsw_r2r(mm4, mm6);
149
    psllw_i2r(SHIFT_FRW_COL, mm7);
150
    movq_r2r(mm5, mm4);
151
    psubsw_r2r(mm7, mm5);
152
    paddsw_r2r(mm5, mm1);
153
    paddsw_r2r(mm7, mm4);
154
    por_m2r(fdct_one_corr, mm1);
155
    psllw_i2r(SHIFT_FRW_COL + 1, mm2);
156
    pmulhw_m2r(*(fdct_tg_all_16 + 4), mm5);
157
    movq_r2r(mm4, mm7);
158
    psubsw_m2r(*(in + offset + 5 * 8), mm3);
159
    psubsw_r2r(mm6, mm4);
160
    movq_r2m(mm1, *(out + offset + 2 * 8));
161
    paddsw_r2r(mm6, mm7);
162
    movq_m2r(*(in + offset + 3 * 8), mm1);
163
    psllw_i2r(SHIFT_FRW_COL + 1, mm3);
164
    psubsw_m2r(*(in + offset + 4 * 8), mm1);
165
    movq_r2r(mm2, mm6);
166
    movq_r2m(mm4, *(out + offset + 4 * 8));
167
    paddsw_r2r(mm3, mm2);
168
    pmulhw_m2r(*ocos_4_16, mm2);
169
    psubsw_r2r(mm3, mm6);
170
    pmulhw_m2r(*ocos_4_16, mm6);
171
    psubsw_r2r(mm0, mm5);
172
    por_m2r(fdct_one_corr, mm5);
173
    psllw_i2r(SHIFT_FRW_COL, mm1);
174
    por_m2r(fdct_one_corr, mm2);
175
    movq_r2r(mm1, mm4);
176
    movq_m2r(*(in + offset + 0 * 8), mm3);
177
    paddsw_r2r(mm6, mm1);
178
    psubsw_m2r(*(in + offset + 7 * 8), mm3);
179
    psubsw_r2r(mm6, mm4);
180
    movq_m2r(*(fdct_tg_all_16 + 0), mm0);
181
    psllw_i2r(SHIFT_FRW_COL, mm3);
182
    movq_m2r(*(fdct_tg_all_16 + 8), mm6);
183
    pmulhw_r2r(mm1, mm0);
184
    movq_r2m(mm7, *(out + offset + 0 * 8));
185
    pmulhw_r2r(mm4, mm6);
186
    movq_r2m(mm5, *(out + offset + 6 * 8));
187
    movq_r2r(mm3, mm7);
188
    movq_m2r(*(fdct_tg_all_16 + 8), mm5);
189
    psubsw_r2r(mm2, mm7);
190
    paddsw_r2r(mm2, mm3);
191
    pmulhw_r2r(mm7, mm5);
192
    paddsw_r2r(mm3, mm0);
193
    paddsw_r2r(mm4, mm6);
194
    pmulhw_m2r(*(fdct_tg_all_16 + 0), mm3);
195
    por_m2r(fdct_one_corr, mm0);
196
    paddsw_r2r(mm7, mm5);
197
    psubsw_r2r(mm6, mm7);
198
    movq_r2m(mm0, *(out + offset + 1 * 8));
199
    paddsw_r2r(mm4, mm5);
200
    movq_r2m(mm7, *(out + offset + 3 * 8));
201
    psubsw_r2r(mm1, mm3);
202
    movq_r2m(mm5, *(out + offset + 5 * 8));
203
    movq_r2m(mm3, *(out + offset + 7 * 8));
204
}
205

    
206
static always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table)
207
{ 
208
    pshufw_m2r(*(in + 4), mm5, 0x1B);
209
    movq_m2r(*(in + 0), mm0);
210
    movq_r2r(mm0, mm1);        
211
    paddsw_r2r(mm5, mm0);
212
    psubsw_r2r(mm5, mm1);
213
    pshufw_r2r(mm0, mm2, 0x4E);
214
    pshufw_r2r(mm1, mm3, 0x4E);
215
    movq_m2r(*(table + 0), mm4);
216
    movq_m2r(*(table + 4), mm6);
217
    movq_m2r(*(table + 16), mm5);
218
    movq_m2r(*(table + 20), mm7);
219
    pmaddwd_r2r(mm0, mm4);
220
    pmaddwd_r2r(mm1, mm5);
221
    pmaddwd_r2r(mm2, mm6);
222
    pmaddwd_r2r(mm3, mm7);
223
    pmaddwd_m2r(*(table + 8), mm0);
224
    pmaddwd_m2r(*(table + 12), mm2);
225
    pmaddwd_m2r(*(table + 24), mm1);
226
    pmaddwd_m2r(*(table + 28), mm3);
227
    paddd_r2r(mm6, mm4);
228
    paddd_r2r(mm7, mm5);
229
    paddd_r2r(mm2, mm0);
230
    paddd_r2r(mm3, mm1);
231
    movq_m2r(*fdct_r_row, mm7);
232
    paddd_r2r(mm7, mm4);
233
    paddd_r2r(mm7, mm5);
234
    paddd_r2r(mm7, mm0);
235
    paddd_r2r(mm7, mm1);
236
    psrad_i2r(SHIFT_FRW_ROW, mm4);
237
    psrad_i2r(SHIFT_FRW_ROW, mm5);
238
    psrad_i2r(SHIFT_FRW_ROW, mm0);
239
    psrad_i2r(SHIFT_FRW_ROW, mm1);
240
    packssdw_r2r(mm0, mm4);
241
    packssdw_r2r(mm1, mm5);
242
    movq_r2r(mm4, mm2);
243
    punpcklwd_r2r(mm5, mm4);
244
    punpckhwd_r2r(mm5, mm2);
245
    movq_r2m(mm4, *(out + 0));
246
    movq_r2m(mm2, *(out + 4));
247
}
248

    
249
static always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
250
{ 
251
    movd_m2r(*(in + 6), mm1);
252
    punpcklwd_m2r(*(in + 4), mm1);
253
    movq_r2r(mm1, mm2);
254
    psrlq_i2r(0x20, mm1);
255
    movq_m2r(*(in + 0), mm0);
256
    punpcklwd_r2r(mm2, mm1);
257
    movq_r2r(mm0, mm5);        
258
    paddsw_r2r(mm1, mm0);
259
    psubsw_r2r(mm1, mm5);
260
    movq_r2r(mm0, mm1);            
261
    movq_r2r(mm5, mm6);        
262
    punpckldq_r2r(mm5, mm3);
263
    punpckhdq_r2r(mm3, mm6);
264
    movq_m2r(*(table + 0), mm3);
265
    movq_m2r(*(table + 4), mm4);
266
    punpckldq_r2r(mm0, mm2);
267
    pmaddwd_r2r(mm0, mm3);
268
    punpckhdq_r2r(mm2, mm1);
269
    movq_m2r(*(table + 16), mm2);
270
    pmaddwd_r2r(mm1, mm4);
271
    pmaddwd_m2r(*(table + 8), mm0);
272
    movq_m2r(*(table + 20), mm7);
273
    pmaddwd_r2r(mm5, mm2);
274
    paddd_m2r(*fdct_r_row, mm3);
275
    pmaddwd_r2r(mm6, mm7);
276
    pmaddwd_m2r(*(table + 12), mm1);
277
    paddd_r2r(mm4, mm3);
278
    pmaddwd_m2r(*(table + 24), mm5);
279
    pmaddwd_m2r(*(table + 28), mm6);
280
    paddd_r2r(mm7, mm2);
281
    paddd_m2r(*fdct_r_row, mm0);
282
    psrad_i2r(SHIFT_FRW_ROW, mm3);
283
    paddd_m2r(*fdct_r_row, mm2);
284
    paddd_r2r(mm1, mm0);
285
    paddd_m2r(*fdct_r_row, mm5);
286
    psrad_i2r(SHIFT_FRW_ROW, mm2);
287
    paddd_r2r(mm6, mm5);
288
    psrad_i2r(SHIFT_FRW_ROW, mm0);
289
    psrad_i2r(SHIFT_FRW_ROW, mm5);
290
    packssdw_r2r(mm0, mm3);
291
    packssdw_r2r(mm5, mm2);
292
    movq_r2r(mm3, mm6);
293
    punpcklwd_r2r(mm2, mm3);
294
    punpckhwd_r2r(mm2, mm6);
295
    movq_r2m(mm3, *(out + 0));
296
    movq_r2m(mm6, *(out + 4));
297
}
298

    
299
void ff_fdct_mmx(int16_t *block)
300
{
301
    int64_t align_tmp[16] ATTR_ALIGN(8);
302
    int16_t * const block_tmp= (int16_t*)align_tmp;
303
    int16_t *block1, *out;
304
    const int16_t *table;
305
    int i;
306

    
307
    block1 = block_tmp;
308
    fdct_col(block, block1, 0);
309
    fdct_col(block, block1, 4);
310

    
311
    block1 = block_tmp;
312
    table = tab_frw_01234567;
313
    out = block;
314
    for(i=8;i>0;i--) {
315
        fdct_row_mmx(block1, out, table);
316
        block1 += 8;
317
        table += 32;
318
        out += 8;
319
    }
320
}
321

    
322
void ff_fdct_mmx2(int16_t *block)
323
{
324
    int64_t align_tmp[16] ATTR_ALIGN(8);
325
    int16_t * const block_tmp= (int16_t*)align_tmp;
326
    int16_t *block1, *out;
327
    const int16_t *table;
328
    int i;
329

    
330
    block1 = block_tmp;
331
    fdct_col(block, block1, 0);
332
    fdct_col(block, block1, 4);
333

    
334
    block1 = block_tmp;
335
    table = tab_frw_01234567;
336
    out = block;
337
    for(i=8;i>0;i--) {
338
        fdct_row_mmx2(block1, out, table);
339
        block1 += 8;
340
        table += 32;
341
        out += 8;
342
    }
343
}