Statistics
| Branch: | Revision:

ffmpeg / libavcodec / i386 / fdct_mmx.c @ 1745173b

History | View | Annotate | Download (10.2 KB)

1
/*
 * MMX optimized forward DCT
 * The gcc port is Copyright (c) 2001 Fabrice Bellard.
 *
 * from  fdctam32.c - AP922 MMX(3D-Now) forward-DCT
 *
 *  Intel Application Note AP-922 - fast, precise implementation of DCT
 *        http://developer.intel.com/vtune/cbts/appnotes.htm
 */
10
#include "../common.h"
11
#include "mmx.h"
12

    
13
#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))

//////////////////////////////////////////////////////////////////////
//
// constants for the forward DCT
// -----------------------------
//
// Be sure to check that your compiler is aligning all constants to QWORD
// (8-byte) memory boundaries!  Otherwise the unaligned memory access will
// severely stall MMX execution.
//
// Every constant below is read by the MMX code as one 64-bit operand, so
// each object must be laid out as exactly 4 x 16-bit or 2 x 32-bit lanes.
//
//////////////////////////////////////////////////////////////////////

#define BITS_FRW_ACC        3 //; 2 or 3 for accuracy
#define SHIFT_FRW_COL       BITS_FRW_ACC
#define SHIFT_FRW_ROW       (BITS_FRW_ACC + 17 - 3)
// rounding terms: half of the divisor implied by the matching shift
#define RND_FRW_ROW         (1 << (SHIFT_FRW_ROW-1))
#define RND_FRW_COL         (1 << (SHIFT_FRW_COL-1))

// concatenated table, for forward DCT transformation
// (one group of 4 identical words per constant; index with +0, +4, +8)
static const int16_t fdct_tg_all_16[] ATTR_ALIGN(8) = {
    13036, 13036, 13036, 13036,         // tan(1*pi/16) * (1<<16) + 0.5
    27146, 27146, 27146, 27146,         // tan(2*pi/16) * (1<<16) + 0.5
    -21746, -21746, -21746, -21746,     // tan(3*pi/16) * (1<<16) + 0.5, stored minus 65536
};
static const int16_t cos_4_16[4] ATTR_ALIGN(8) = {
    -19195, -19195, -19195, -19195,     // cos(pi/4) * (1<<16) + 0.5, stored minus 65536
};

static const int16_t ocos_4_16[4] ATTR_ALIGN(8) = {
    23170, 23170, 23170, 23170,         // cos(pi/4) * (1<<15) + 0.5
};

// 0x0001 in each of the four 16-bit lanes; OR-ed into results as a correction
static const int64_t fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;

// Row-pass rounding term, one copy per 32-bit lane.  The element type must be
// exactly 32 bits wide: the row pass adds this object with a single 64-bit
// paddd, and with a 64-bit 'long' the second lane would be rounded with 0.
static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = { RND_FRW_ROW, RND_FRW_ROW };
50

    
51
// Forward DCT row-pass coefficient table.
//
// Layout: 8 consecutive groups of 32 int16_t, one group per block row
// (row N starts at index 32 * N).  Each source line below is one 64-bit
// operand (4 words).  Within a group, words 0..15 are the "w00..w15"
// coefficients and words 16..31 the "w16..w31" coefficients; the trailing
// comment on each line names the coefficients in the order they are stored.
// Rows 4, 5, 6 and 7 hold the same values as rows 0, 3, 2 and 1 respectively.
static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = {  // forward_dct coeff table
    //row0
    16384, 16384, 21407, -8867,     //    w09 w01 w08 w00
    16384, 16384, 8867, -21407,     //    w13 w05 w12 w04
    16384, -16384, 8867, 21407,     //    w11 w03 w10 w02
    -16384, 16384, -21407, -8867,   //    w15 w07 w14 w06
    22725, 12873, 19266, -22725,    //    w22 w20 w18 w16
    19266, 4520, -4520, -12873,     //    w23 w21 w19 w17
    12873, 4520, 4520, 19266,       //    w30 w28 w26 w24
    -22725, 19266, -12873, -22725,  //    w31 w29 w27 w25

    //row1
    22725, 22725, 29692, -12299,    //    w09 w01 w08 w00
    22725, 22725, 12299, -29692,    //    w13 w05 w12 w04
    22725, -22725, 12299, 29692,    //    w11 w03 w10 w02
    -22725, 22725, -29692, -12299,  //    w15 w07 w14 w06
    31521, 17855, 26722, -31521,    //    w22 w20 w18 w16
    26722, 6270, -6270, -17855,     //    w23 w21 w19 w17
    17855, 6270, 6270, 26722,       //    w30 w28 w26 w24
    -31521, 26722, -17855, -31521,  //    w31 w29 w27 w25

    //row2
    21407, 21407, 27969, -11585,    //    w09 w01 w08 w00
    21407, 21407, 11585, -27969,    //    w13 w05 w12 w04
    21407, -21407, 11585, 27969,    //    w11 w03 w10 w02
    -21407, 21407, -27969, -11585,  //    w15 w07 w14 w06
    29692, 16819, 25172, -29692,    //    w22 w20 w18 w16
    25172, 5906, -5906, -16819,     //    w23 w21 w19 w17
    16819, 5906, 5906, 25172,       //    w30 w28 w26 w24
    -29692, 25172, -16819, -29692,  //    w31 w29 w27 w25

    //row3
    19266, 19266, 25172, -10426,    //    w09 w01 w08 w00
    19266, 19266, 10426, -25172,    //    w13 w05 w12 w04
    19266, -19266, 10426, 25172,    //    w11 w03 w10 w02
    -19266, 19266, -25172, -10426,  //    w15 w07 w14 w06
    26722, 15137, 22654, -26722,    //    w22 w20 w18 w16
    22654, 5315, -5315, -15137,     //    w23 w21 w19 w17
    15137, 5315, 5315, 22654,       //    w30 w28 w26 w24
    -26722, 22654, -15137, -26722,  //    w31 w29 w27 w25

    //row4 (same values as row0)
    16384, 16384, 21407, -8867,     //    w09 w01 w08 w00
    16384, 16384, 8867, -21407,     //    w13 w05 w12 w04
    16384, -16384, 8867, 21407,     //    w11 w03 w10 w02
    -16384, 16384, -21407, -8867,   //    w15 w07 w14 w06
    22725, 12873, 19266, -22725,    //    w22 w20 w18 w16
    19266, 4520, -4520, -12873,     //    w23 w21 w19 w17
    12873, 4520, 4520, 19266,       //    w30 w28 w26 w24
    -22725, 19266, -12873, -22725,  //    w31 w29 w27 w25

    //row5 (same values as row3)
    19266, 19266, 25172, -10426,    //    w09 w01 w08 w00
    19266, 19266, 10426, -25172,    //    w13 w05 w12 w04
    19266, -19266, 10426, 25172,    //    w11 w03 w10 w02
    -19266, 19266, -25172, -10426,  //    w15 w07 w14 w06
    26722, 15137, 22654, -26722,    //    w22 w20 w18 w16
    22654, 5315, -5315, -15137,     //    w23 w21 w19 w17
    15137, 5315, 5315, 22654,       //    w30 w28 w26 w24
    -26722, 22654, -15137, -26722,  //    w31 w29 w27 w25

    //row6 (same values as row2)
    21407, 21407, 27969, -11585,    //    w09 w01 w08 w00
    21407, 21407, 11585, -27969,    //    w13 w05 w12 w04
    21407, -21407, 11585, 27969,    //    w11 w03 w10 w02
    -21407, 21407, -27969, -11585,  //    w15 w07 w14 w06
    29692, 16819, 25172, -29692,    //    w22 w20 w18 w16
    25172, 5906, -5906, -16819,     //    w23 w21 w19 w17
    16819, 5906, 5906, 25172,       //    w30 w28 w26 w24
    -29692, 25172, -16819, -29692,  //    w31 w29 w27 w25

    //row7 (same values as row1)
    22725, 22725, 29692, -12299,    //    w09 w01 w08 w00
    22725, 22725, 12299, -29692,    //    w13 w05 w12 w04
    22725, -22725, 12299, 29692,    //    w11 w03 w10 w02
    -22725, 22725, -29692, -12299,  //    w15 w07 w14 w06
    31521, 17855, 26722, -31521,    //    w22 w20 w18 w16
    26722, 6270, -6270, -17855,     //    w23 w21 w19 w17
    17855, 6270, 6270, 26722,       //    w30 w28 w26 w24
    -31521, 26722, -17855, -31521   //    w31 w29 w27 w25
};
132

    
133

    
134
static always_inline void fdct_col(const int16_t *in, int16_t *out, int offset)
135
{
136
    movq_m2r(*(in + offset + 1 * 8), mm0);
137
    movq_m2r(*(in + offset + 6 * 8), mm1);
138
    movq_r2r(mm0, mm2);
139
    movq_m2r(*(in + offset + 2 * 8), mm3);
140
    paddsw_r2r(mm1, mm0);
141
    movq_m2r(*(in + offset + 5 * 8), mm4);
142
    psllw_i2r(SHIFT_FRW_COL, mm0);
143
    movq_m2r(*(in + offset + 0 * 8), mm5);
144
    paddsw_r2r(mm3, mm4);
145
    paddsw_m2r(*(in + offset + 7 * 8), mm5);
146
    psllw_i2r(SHIFT_FRW_COL, mm4);
147
    movq_r2r(mm0, mm6);
148
    psubsw_r2r(mm1, mm2);
149
    movq_m2r(*(fdct_tg_all_16 + 4), mm1);
150
    psubsw_r2r(mm4, mm0);
151
    movq_m2r(*(in + offset + 3 * 8), mm7);
152
    pmulhw_r2r(mm0, mm1);
153
    paddsw_m2r(*(in + offset + 4 * 8), mm7);
154
    psllw_i2r(SHIFT_FRW_COL, mm5);
155
    paddsw_r2r(mm4, mm6);
156
    psllw_i2r(SHIFT_FRW_COL, mm7);
157
    movq_r2r(mm5, mm4);
158
    psubsw_r2r(mm7, mm5);
159
    paddsw_r2r(mm5, mm1);
160
    paddsw_r2r(mm7, mm4);
161
    por_m2r(fdct_one_corr, mm1);
162
    psllw_i2r(SHIFT_FRW_COL + 1, mm2);
163
    pmulhw_m2r(*(fdct_tg_all_16 + 4), mm5);
164
    movq_r2r(mm4, mm7);
165
    psubsw_m2r(*(in + offset + 5 * 8), mm3);
166
    psubsw_r2r(mm6, mm4);
167
    movq_r2m(mm1, *(out + offset + 2 * 8));
168
    paddsw_r2r(mm6, mm7);
169
    movq_m2r(*(in + offset + 3 * 8), mm1);
170
    psllw_i2r(SHIFT_FRW_COL + 1, mm3);
171
    psubsw_m2r(*(in + offset + 4 * 8), mm1);
172
    movq_r2r(mm2, mm6);
173
    movq_r2m(mm4, *(out + offset + 4 * 8));
174
    paddsw_r2r(mm3, mm2);
175
    pmulhw_m2r(*ocos_4_16, mm2);
176
    psubsw_r2r(mm3, mm6);
177
    pmulhw_m2r(*ocos_4_16, mm6);
178
    psubsw_r2r(mm0, mm5);
179
    por_m2r(fdct_one_corr, mm5);
180
    psllw_i2r(SHIFT_FRW_COL, mm1);
181
    por_m2r(fdct_one_corr, mm2);
182
    movq_r2r(mm1, mm4);
183
    movq_m2r(*(in + offset + 0 * 8), mm3);
184
    paddsw_r2r(mm6, mm1);
185
    psubsw_m2r(*(in + offset + 7 * 8), mm3);
186
    psubsw_r2r(mm6, mm4);
187
    movq_m2r(*(fdct_tg_all_16 + 0), mm0);
188
    psllw_i2r(SHIFT_FRW_COL, mm3);
189
    movq_m2r(*(fdct_tg_all_16 + 8), mm6);
190
    pmulhw_r2r(mm1, mm0);
191
    movq_r2m(mm7, *(out + offset + 0 * 8));
192
    pmulhw_r2r(mm4, mm6);
193
    movq_r2m(mm5, *(out + offset + 6 * 8));
194
    movq_r2r(mm3, mm7);
195
    movq_m2r(*(fdct_tg_all_16 + 8), mm5);
196
    psubsw_r2r(mm2, mm7);
197
    paddsw_r2r(mm2, mm3);
198
    pmulhw_r2r(mm7, mm5);
199
    paddsw_r2r(mm3, mm0);
200
    paddsw_r2r(mm4, mm6);
201
    pmulhw_m2r(*(fdct_tg_all_16 + 0), mm3);
202
    por_m2r(fdct_one_corr, mm0);
203
    paddsw_r2r(mm7, mm5);
204
    psubsw_r2r(mm6, mm7);
205
    movq_r2m(mm0, *(out + offset + 1 * 8));
206
    paddsw_r2r(mm4, mm5);
207
    movq_r2m(mm7, *(out + offset + 3 * 8));
208
    psubsw_r2r(mm1, mm3);
209
    movq_r2m(mm5, *(out + offset + 5 * 8));
210
    movq_r2m(mm3, *(out + offset + 7 * 8));
211
}
212

    
213
static always_inline void fdct_row(const int16_t *in, int16_t *out, const int16_t *table)
214
{ 
215
    movd_m2r(*(in + 6), mm5);
216
    punpcklwd_m2r(*(in + 4), mm5);
217
    movq_r2r(mm5, mm2);
218
    psrlq_i2r(0x20, mm5);
219
    movq_m2r(*(in + 0), mm0);
220
    punpcklwd_r2r(mm2, mm5);
221
    movq_r2r(mm0, mm1);        
222
    paddsw_r2r(mm5, mm0);
223
    psubsw_r2r(mm5, mm1);
224
    movq_r2r(mm0, mm2);
225
    punpcklwd_r2r(mm1, mm0);
226
    punpckhwd_r2r(mm1, mm2);
227
    movq_r2r(mm2, mm1);
228
    movq_r2r(mm0, mm2);
229
    movq_m2r(*(table + 0), mm3);
230
    punpcklwd_r2r(mm1, mm0);
231
    movq_r2r(mm0, mm5);
232
    punpckldq_r2r(mm0, mm0);
233
    movq_m2r(*(table + 4), mm4);
234
    punpckhwd_r2r(mm1, mm2);
235
    pmaddwd_r2r(mm0, mm3);
236
    movq_r2r(mm2, mm6);
237
    movq_m2r(*(table + 16), mm1);
238
    punpckldq_r2r(mm2, mm2);
239
    pmaddwd_r2r(mm2, mm4);
240
    punpckhdq_r2r(mm5, mm5);
241
    pmaddwd_m2r(*(table + 8), mm0);
242
    punpckhdq_r2r(mm6, mm6);
243
    movq_m2r(*(table + 20), mm7);
244
    pmaddwd_r2r(mm5, mm1);
245
    paddd_m2r(*fdct_r_row, mm3);
246
    pmaddwd_r2r(mm6, mm7);
247
    pmaddwd_m2r(*(table + 12), mm2);
248
    paddd_r2r(mm4, mm3);
249
    pmaddwd_m2r(*(table + 24), mm5);
250
    pmaddwd_m2r(*(table + 28), mm6);
251
    paddd_r2r(mm7, mm1);
252
    paddd_m2r(*fdct_r_row, mm0);
253
    psrad_i2r(SHIFT_FRW_ROW, mm3);
254
    paddd_m2r(*fdct_r_row, mm1);
255
    paddd_r2r(mm2, mm0);
256
    paddd_m2r(*fdct_r_row, mm5);
257
    psrad_i2r(SHIFT_FRW_ROW, mm1);
258
    paddd_r2r(mm6, mm5);
259
    psrad_i2r(SHIFT_FRW_ROW, mm0);
260
    psrad_i2r(SHIFT_FRW_ROW, mm5);
261
    packssdw_r2r(mm0, mm3);
262
    packssdw_r2r(mm5, mm1);
263
    movq_r2r(mm3, mm6);
264
    punpcklwd_r2r(mm1, mm3);
265
    punpckhwd_r2r(mm1, mm6);
266
    movq_r2m(mm3, *(out + 0));
267
    movq_r2m(mm6, *(out + 4));
268
}
269

    
270
void ff_fdct_mmx(int16_t *block)
271
{
272
    int64_t align_tmp[16] ATTR_ALIGN(8);
273
    int16_t * const block_tmp= (int16_t*)align_tmp;
274
    int16_t *block1, *out;
275
    const int16_t *table;
276
    int i;
277

    
278
    block1 = block_tmp;
279
    fdct_col(block, block1, 0);
280
    fdct_col(block, block1, 4);
281

    
282
    block1 = block_tmp;
283
    table = tab_frw_01234567;
284
    out = block;
285
    for(i=8;i>0;i--) {
286
        fdct_row(block1, out, table);
287
        block1 += 8;
288
        table += 32;
289
        out += 8;
290
    }
291
}