Statistics
| Branch: | Revision:

ffmpeg / libavcodec / i386 / vp3dsp_mmx.c @ 5509bffa

History | View | Annotate | Download (11.4 KB)

1
/*
2
 * Copyright (C) 2004 the ffmpeg project
3
 *
4
 * This library is free software; you can redistribute it and/or
5
 * modify it under the terms of the GNU Lesser General Public
6
 * License as published by the Free Software Foundation; either
7
 * version 2 of the License, or (at your option) any later version.
8
 *
9
 * This library is distributed in the hope that it will be useful,
10
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12
 * Lesser General Public License for more details.
13
 *
14
 * You should have received a copy of the GNU Lesser General Public
15
 * License along with this library; if not, write to the Free Software
16
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
 */
18

    
19
/**
20
 * @file vp3dsp_mmx.c
21
 * MMX-optimized functions cribbed from the original VP3 source code.
22
 */
23

    
24
#include "../dsputil.h"
25
#include "mmx.h"
26

    
27
#define IdctAdjustBeforeShift 8
28

    
29
/* (12 * 4) 2-byte memory locations ( = 96 bytes total)
30
 * idct_constants[0..15] = Mask table (M(I))
31
 * idct_constants[16..43] = Cosine table (C(I))
32
 * idct_constants[44..47] = 8
33
 */
34
static uint16_t idct_constants[(4 + 7 + 1) * 4];
35
static const uint16_t idct_cosine_table[7] = {
36
    64277, 60547, 54491, 46341, 36410, 25080, 12785
37
};
38

    
39
#define r0 mm0
40
#define r1 mm1
41
#define r2 mm2
42
#define r3 mm3
43
#define r4 mm4
44
#define r5 mm5
45
#define r6 mm6
46
#define r7 mm7
47

    
48
/* from original comments: The Macro does IDct on 4 1-D Dcts */
49
#define BeginIDCT() { \
50
    movq_m2r(*I(3), r2); \
51
    movq_m2r(*C(3), r6); \
52
    movq_r2r(r2, r4); \
53
    movq_m2r(*J(5), r7); \
54
    pmulhw_r2r(r6, r4);       /* r4 = c3*i3 - i3 */ \
55
    movq_m2r(*C(5), r1); \
56
    pmulhw_r2r(r7, r6);       /* r6 = c3*i5 - i5 */ \
57
    movq_r2r(r1, r5); \
58
    pmulhw_r2r(r2, r1);       /* r1 = c5*i3 - i3 */ \
59
    movq_m2r(*I(1), r3); \
60
    pmulhw_r2r(r7, r5);       /* r5 = c5*i5 - i5 */ \
61
    movq_m2r(*C(1), r0);      /* (all registers are in use) */ \
62
    paddw_r2r(r2, r4);        /* r4 = c3*i3 */ \
63
    paddw_r2r(r7, r6);        /* r6 = c3*i5 */ \
64
    paddw_r2r(r1, r2);        /* r2 = c5*i3 */ \
65
    movq_m2r(*J(7), r1); \
66
    paddw_r2r(r5, r7);        /* r7 = c5*i5 */ \
67
    movq_r2r(r0, r5);         /* r5 = c1 */ \
68
    pmulhw_r2r(r3, r0);       /* r0 = c1*i1 - i1 */ \
69
    paddsw_r2r(r7, r4);       /* r4 = C = c3*i3 + c5*i5 */ \
70
    pmulhw_r2r(r1, r5);       /* r5 = c1*i7 - i7 */ \
71
    movq_m2r(*C(7), r7); \
72
    psubsw_r2r(r2, r6);       /* r6 = D = c3*i5 - c5*i3 */ \
73
    paddw_r2r(r3, r0);        /* r0 = c1*i1 */ \
74
    pmulhw_r2r(r7, r3);       /* r3 = c7*i1 */ \
75
    movq_m2r(*I(2), r2); \
76
    pmulhw_r2r(r1, r7);       /* r7 = c7*i7 */ \
77
    paddw_r2r(r1, r5);        /* r5 = c1*i7 */ \
78
    movq_r2r(r2, r1);         /* r1 = i2 */ \
79
    pmulhw_m2r(*C(2), r2);    /* r2 = c2*i2 - i2 */ \
80
    psubsw_r2r(r5, r3);       /* r3 = B = c7*i1 - c1*i7 */ \
81
    movq_m2r(*J(6), r5); \
82
    paddsw_r2r(r7, r0);       /* r0 = A = c1*i1 + c7*i7 */ \
83
    movq_r2r(r5, r7);         /* r7 = i6 */ \
84
    psubsw_r2r(r4, r0);       /* r0 = A - C */ \
85
    pmulhw_m2r(*C(2), r5);    /* r5 = c2*i6 - i6 */ \
86
    paddw_r2r(r1, r2);        /* r2 = c2*i2 */ \
87
    pmulhw_m2r(*C(6), r1);    /* r1 = c6*i2 */ \
88
    paddsw_r2r(r4, r4);       /* r4 = C + C */ \
89
    paddsw_r2r(r0, r4);       /* r4 = C. = A + C */ \
90
    psubsw_r2r(r6, r3);       /* r3 = B - D */ \
91
    paddw_r2r(r7, r5);        /* r5 = c2*i6 */ \
92
    paddsw_r2r(r6, r6);       /* r6 = D + D */ \
93
    pmulhw_m2r(*C(6), r7);    /* r7 = c6*i6 */ \
94
    paddsw_r2r(r3, r6);       /* r6 = D. = B + D */ \
95
    movq_r2m(r4, *I(1));      /* save C. at I(1) */ \
96
    psubsw_r2r(r5, r1);       /* r1 = H = c6*i2 - c2*i6 */ \
97
    movq_m2r(*C(4), r4); \
98
    movq_r2r(r3, r5);         /* r5 = B - D */ \
99
    pmulhw_r2r(r4, r3);       /* r3 = (c4 - 1) * (B - D) */ \
100
    paddsw_r2r(r2, r7);       /* r7 = G = c6*i6 + c2*i2 */ \
101
    movq_r2m(r6, *I(2));      /* save D. at I(2) */ \
102
    movq_r2r(r0, r2);         /* r2 = A - C */ \
103
    movq_m2r(*I(0), r6); \
104
    pmulhw_r2r(r4, r0);       /* r0 = (c4 - 1) * (A - C) */ \
105
    paddw_r2r(r3, r5);        /* r5 = B. = c4 * (B - D) */ \
106
    movq_m2r(*J(4), r3); \
107
    psubsw_r2r(r1, r5);       /* r5 = B.. = B. - H */ \
108
    paddw_r2r(r0, r2);        /* r0 = A. = c4 * (A - C) */ \
109
    psubsw_r2r(r3, r6);       /* r6 = i0 - i4 */ \
110
    movq_r2r(r6, r0); \
111
    pmulhw_r2r(r4, r6);       /* r6 = (c4 - 1) * (i0 - i4) */ \
112
    paddsw_r2r(r3, r3);       /* r3 = i4 + i4 */ \
113
    paddsw_r2r(r1, r1);       /* r1 = H + H */ \
114
    paddsw_r2r(r0, r3);       /* r3 = i0 + i4 */ \
115
    paddsw_r2r(r5, r1);       /* r1 = H. = B + H */ \
116
    pmulhw_r2r(r3, r4);       /* r4 = (c4 - 1) * (i0 + i4) */ \
117
    paddsw_r2r(r0, r6);       /* r6 = F = c4 * (i0 - i4) */ \
118
    psubsw_r2r(r2, r6);       /* r6 = F. = F - A. */ \
119
    paddsw_r2r(r2, r2);       /* r2 = A. + A. */ \
120
    movq_m2r(*I(1), r0);      /* r0 = C. */ \
121
    paddsw_r2r(r6, r2);       /* r2 = A.. = F + A. */ \
122
    paddw_r2r(r3, r4);        /* r4 = E = c4 * (i0 + i4) */ \
123
    psubsw_r2r(r1, r2);       /* r2 = R2 = A.. - H. */ \
124
}
125

    
126
/* RowIDCT gets ready to transpose */
127
#define RowIDCT() { \
128
    \
129
    BeginIDCT(); \
130
    \
131
    movq_m2r(*I(2), r3);   /* r3 = D. */ \
132
    psubsw_r2r(r7, r4);    /* r4 = E. = E - G */ \
133
    paddsw_r2r(r1, r1);    /* r1 = H. + H. */ \
134
    paddsw_r2r(r7, r7);    /* r7 = G + G */ \
135
    paddsw_r2r(r2, r1);    /* r1 = R1 = A.. + H. */ \
136
    paddsw_r2r(r4, r7);    /* r7 = G. = E + G */ \
137
    psubsw_r2r(r3, r4);    /* r4 = R4 = E. - D. */ \
138
    paddsw_r2r(r3, r3); \
139
    psubsw_r2r(r5, r6);    /* r6 = R6 = F. - B.. */ \
140
    paddsw_r2r(r5, r5); \
141
    paddsw_r2r(r4, r3);    /* r3 = R3 = E. + D. */ \
142
    paddsw_r2r(r6, r5);    /* r5 = R5 = F. + B.. */ \
143
    psubsw_r2r(r0, r7);    /* r7 = R7 = G. - C. */ \
144
    paddsw_r2r(r0, r0); \
145
    movq_r2m(r1, *I(1));   /* save R1 */ \
146
    paddsw_r2r(r7, r0);    /* r0 = R0 = G. + C. */ \
147
}
148

    
149
/* Column IDCT normalizes and stores final results */
150
#define ColumnIDCT() { \
151
    \
152
    BeginIDCT(); \
153
    \
154
    paddsw_m2r(*Eight, r2);    /* adjust R2 (and R1) for shift */ \
155
    paddsw_r2r(r1, r1);        /* r1 = H. + H. */ \
156
    paddsw_r2r(r2, r1);        /* r1 = R1 = A.. + H. */ \
157
    psraw_i2r(4, r2);          /* r2 = NR2 */ \
158
    psubsw_r2r(r7, r4);        /* r4 = E. = E - G */ \
159
    psraw_i2r(4, r1);          /* r1 = NR1 */ \
160
    movq_m2r(*I(2), r3);       /* r3 = D. */ \
161
    paddsw_r2r(r7, r7);        /* r7 = G + G */ \
162
    movq_r2m(r2, *I(2));       /* store NR2 at I2 */ \
163
    paddsw_r2r(r4, r7);        /* r7 = G. = E + G */ \
164
    movq_r2m(r1, *I(1));       /* store NR1 at I1 */ \
165
    psubsw_r2r(r3, r4);        /* r4 = R4 = E. - D. */ \
166
    paddsw_m2r(*Eight, r4);    /* adjust R4 (and R3) for shift */ \
167
    paddsw_r2r(r3, r3);        /* r3 = D. + D. */ \
168
    paddsw_r2r(r4, r3);        /* r3 = R3 = E. + D. */ \
169
    psraw_i2r(4, r4);          /* r4 = NR4 */ \
170
    psubsw_r2r(r5, r6);        /* r6 = R6 = F. - B.. */ \
171
    psraw_i2r(4, r3);          /* r3 = NR3 */ \
172
    paddsw_m2r(*Eight, r6);    /* adjust R6 (and R5) for shift */ \
173
    paddsw_r2r(r5, r5);        /* r5 = B.. + B.. */ \
174
    paddsw_r2r(r6, r5);        /* r5 = R5 = F. + B.. */ \
175
    psraw_i2r(4, r6);          /* r6 = NR6 */ \
176
    movq_r2m(r4, *J(4));       /* store NR4 at J4 */ \
177
    psraw_i2r(4, r5);          /* r5 = NR5 */ \
178
    movq_r2m(r3, *I(3));       /* store NR3 at I3 */ \
179
    psubsw_r2r(r0, r7);        /* r7 = R7 = G. - C. */ \
180
    paddsw_m2r(*Eight, r7);    /* adjust R7 (and R0) for shift */ \
181
    paddsw_r2r(r0, r0);        /* r0 = C. + C. */ \
182
    paddsw_r2r(r7, r0);        /* r0 = R0 = G. + C. */ \
183
    psraw_i2r(4, r7);          /* r7 = NR7 */ \
184
    movq_r2m(r6, *J(6));       /* store NR6 at J6 */ \
185
    psraw_i2r(4, r0);          /* r0 = NR0 */ \
186
    movq_r2m(r5, *J(5));       /* store NR5 at J5 */ \
187
    movq_r2m(r7, *J(7));       /* store NR7 at J7 */ \
188
    movq_r2m(r0, *I(0));       /* store NR0 at I0 */ \
189
}
190

    
191
/* Following macro does two 4x4 transposes in place.
192

193
  At entry (we assume):
194

195
    r0 = a3 a2 a1 a0
196
    I(1) = b3 b2 b1 b0
197
    r2 = c3 c2 c1 c0
198
    r3 = d3 d2 d1 d0
199

200
    r4 = e3 e2 e1 e0
201
    r5 = f3 f2 f1 f0
202
    r6 = g3 g2 g1 g0
203
    r7 = h3 h2 h1 h0
204

205
  At exit, we have:
206

207
    I(0) = d0 c0 b0 a0
208
    I(1) = d1 c1 b1 a1
209
    I(2) = d2 c2 b2 a2
210
    I(3) = d3 c3 b3 a3
211

212
    J(4) = h0 g0 f0 e0
213
    J(5) = h1 g1 f1 e1
214
    J(6) = h2 g2 f2 e2
215
    J(7) = h3 g3 f3 e3
216

217
   I(0) I(1) I(2) I(3)  is the transpose of r0 I(1) r2 r3.
218
   J(4) J(5) J(6) J(7)  is the transpose of r4 r5 r6 r7.
219

220
   Since r1 is free at entry, we calculate the Js first. */
221

    
222
#define Transpose() { \
223
    movq_r2r(r4, r1);         /* r1 = e3 e2 e1 e0 */ \
224
    punpcklwd_r2r(r5, r4);    /* r4 = f1 e1 f0 e0 */ \
225
    movq_r2m(r0, *I(0));      /* save a3 a2 a1 a0 */ \
226
    punpckhwd_r2r(r5, r1);    /* r1 = f3 e3 f2 e2 */ \
227
    movq_r2r(r6, r0);         /* r0 = g3 g2 g1 g0 */ \
228
    punpcklwd_r2r(r7, r6);    /* r6 = h1 g1 h0 g0 */ \
229
    movq_r2r(r4, r5);         /* r5 = f1 e1 f0 e0 */ \
230
    punpckldq_r2r(r6, r4);    /* r4 = h0 g0 f0 e0 = R4 */ \
231
    punpckhdq_r2r(r6, r5);    /* r5 = h1 g1 f1 e1 = R5 */ \
232
    movq_r2r(r1, r6);         /* r6 = f3 e3 f2 e2 */ \
233
    movq_r2m(r4, *J(4)); \
234
    punpckhwd_r2r(r7, r0);    /* r0 = h3 g3 h2 g2 */ \
235
    movq_r2m(r5, *J(5)); \
236
    punpckhdq_r2r(r0, r6);    /* r6 = h3 g3 f3 e3 = R7 */ \
237
    movq_m2r(*I(0), r4);      /* r4 = a3 a2 a1 a0 */ \
238
    punpckldq_r2r(r0, r1);    /* r1 = h2 g2 f2 e2 = R6 */ \
239
    movq_m2r(*I(1), r5);      /* r5 = b3 b2 b1 b0 */ \
240
    movq_r2r(r4, r0);         /* r0 = a3 a2 a1 a0 */ \
241
    movq_r2m(r6, *J(7)); \
242
    punpcklwd_r2r(r5, r0);    /* r0 = b1 a1 b0 a0 */ \
243
    movq_r2m(r1, *J(6)); \
244
    punpckhwd_r2r(r5, r4);    /* r4 = b3 a3 b2 a2 */ \
245
    movq_r2r(r2, r5);         /* r5 = c3 c2 c1 c0 */ \
246
    punpcklwd_r2r(r3, r2);    /* r2 = d1 c1 d0 c0 */ \
247
    movq_r2r(r0, r1);         /* r1 = b1 a1 b0 a0 */ \
248
    punpckldq_r2r(r2, r0);    /* r0 = d0 c0 b0 a0 = R0 */ \
249
    punpckhdq_r2r(r2, r1);    /* r1 = d1 c1 b1 a1 = R1 */ \
250
    movq_r2r(r4, r2);         /* r2 = b3 a3 b2 a2 */ \
251
    movq_r2m(r0, *I(0)); \
252
    punpckhwd_r2r(r3, r5);    /* r5 = d3 c3 d2 c2 */ \
253
    movq_r2m(r1, *I(1)); \
254
    punpckhdq_r2r(r5, r4);    /* r4 = d3 c3 b3 a3 = R3 */ \
255
    punpckldq_r2r(r5, r2);    /* r2 = d2 c2 b2 a2 = R2 */ \
256
    movq_r2m(r4, *I(3)); \
257
    movq_r2m(r2, *I(2)); \
258
}
259

    
260
void ff_vp3_dsp_init_mmx(void)
261
{
262
    int j = 16;
263
    uint16_t *p;
264

    
265
    j = 1;
266
    do {
267
        p = idct_constants + ((j + 3) << 2);
268
        p[0] = p[1] = p[2] = p[3] = idct_cosine_table[j - 1];
269
    } while (++j <= 7);
270

    
271
    idct_constants[44] = idct_constants[45] =
272
    idct_constants[46] = idct_constants[47] = IdctAdjustBeforeShift;
273
}
274

    
275
void ff_vp3_idct_mmx(int16_t *output_data)
276
{
277
    /* eax = quantized input
278
     * ebx = dequantizer matrix
279
     * ecx = IDCT constants
280
     *  M(I) = ecx + MaskOffset(0) + I * 8
281
     *  C(I) = ecx + CosineOffset(32) + (I-1) * 8
282
     * edx = output
283
     * r0..r7 = mm0..mm7
284
     */
285

    
286
#define C(x) (idct_constants + 16 + (x - 1) * 4)
287
#define Eight (idct_constants + 44)
288

    
289
    /* at this point, function has completed dequantization + dezigzag +
290
     * partial transposition; now do the idct itself */
291
#define I(K) (output_data + K * 8)
292
#define J(K) (output_data + ((K - 4) * 8) + 4)
293

    
294
    RowIDCT();
295
    Transpose();
296

    
297
#undef I
298
#undef J
299
#define I(K) (output_data + (K * 8) + 32)
300
#define J(K) (output_data + ((K - 4) * 8) + 36)
301

    
302
    RowIDCT();
303
    Transpose();
304

    
305
#undef I
306
#undef J
307
#define I(K) (output_data + K * 8)
308
#define J(K) (output_data + K * 8)
309

    
310
    ColumnIDCT();
311

    
312
#undef I
313
#undef J
314
#define I(K) (output_data + (K * 8) + 4)
315
#define J(K) (output_data + (K * 8) + 4)
316

    
317
    ColumnIDCT();
318

    
319
#undef I
320
#undef J
321

    
322
}