Statistics
| Branch: | Revision:

ffmpeg / libavcodec / i386 / vp3dsp_mmx.c @ 699b3f99

History | View | Annotate | Download (11.7 KB)

1 44cb64ee Mike Melanson
/*
2
 * Copyright (C) 2004 the ffmpeg project
3
 *
4 b78e7197 Diego Biurrun
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or
7 44cb64ee Mike Melanson
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9 b78e7197 Diego Biurrun
 * version 2.1 of the License, or (at your option) any later version.
10 44cb64ee Mike Melanson
 *
11 b78e7197 Diego Biurrun
 * FFmpeg is distributed in the hope that it will be useful,
12 44cb64ee Mike Melanson
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17 b78e7197 Diego Biurrun
 * License along with FFmpeg; if not, write to the Free Software
18 5509bffa Diego Biurrun
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 44cb64ee Mike Melanson
 */
20
21
/**
22
 * @file vp3dsp_mmx.c
23
 * MMX-optimized functions cribbed from the original VP3 source code.
24
 */
25
26 b550bfaa Ronald S. Bultje
#include "dsputil.h"
27 44cb64ee Mike Melanson
#include "mmx.h"
28
29
#define IdctAdjustBeforeShift 8
30
31
/* (12 * 4) 2-byte memory locations ( = 96 bytes total)
32
 * idct_constants[0..15] = Mask table (M(I))
33
 * idct_constants[16..43] = Cosine table (C(I))
34
 * idct_constants[44..47] = 8
35
 */
36
static uint16_t idct_constants[(4 + 7 + 1) * 4];
37 c26ae41d Michael Niedermayer
static const uint16_t idct_cosine_table[7] = {
38 44cb64ee Mike Melanson
    64277, 60547, 54491, 46341, 36410, 25080, 12785
39
};
40
41
#define r0 mm0
42
#define r1 mm1
43
#define r2 mm2
44
#define r3 mm3
45
#define r4 mm4
46
#define r5 mm5
47
#define r6 mm6
48
#define r7 mm7
49
50
/* from original comments: The Macro does IDct on 4 1-D Dcts */
51 daae8699 Mike Melanson
#define BeginIDCT() { \
52 44cb64ee Mike Melanson
    movq_m2r(*I(3), r2); \
53
    movq_m2r(*C(3), r6); \
54
    movq_r2r(r2, r4); \
55
    movq_m2r(*J(5), r7); \
56 daae8699 Mike Melanson
    pmulhw_r2r(r6, r4);       /* r4 = c3*i3 - i3 */ \
57 44cb64ee Mike Melanson
    movq_m2r(*C(5), r1); \
58 daae8699 Mike Melanson
    pmulhw_r2r(r7, r6);       /* r6 = c3*i5 - i5 */ \
59 44cb64ee Mike Melanson
    movq_r2r(r1, r5); \
60 daae8699 Mike Melanson
    pmulhw_r2r(r2, r1);       /* r1 = c5*i3 - i3 */ \
61 44cb64ee Mike Melanson
    movq_m2r(*I(1), r3); \
62 daae8699 Mike Melanson
    pmulhw_r2r(r7, r5);       /* r5 = c5*i5 - i5 */ \
63
    movq_m2r(*C(1), r0);      /* (all registers are in use) */ \
64
    paddw_r2r(r2, r4);        /* r4 = c3*i3 */ \
65
    paddw_r2r(r7, r6);        /* r6 = c3*i5 */ \
66
    paddw_r2r(r1, r2);        /* r2 = c5*i3 */ \
67 44cb64ee Mike Melanson
    movq_m2r(*J(7), r1); \
68 daae8699 Mike Melanson
    paddw_r2r(r5, r7);        /* r7 = c5*i5 */ \
69
    movq_r2r(r0, r5);         /* r5 = c1 */ \
70
    pmulhw_r2r(r3, r0);       /* r0 = c1*i1 - i1 */ \
71
    paddsw_r2r(r7, r4);       /* r4 = C = c3*i3 + c5*i5 */ \
72
    pmulhw_r2r(r1, r5);       /* r5 = c1*i7 - i7 */ \
73 44cb64ee Mike Melanson
    movq_m2r(*C(7), r7); \
74 daae8699 Mike Melanson
    psubsw_r2r(r2, r6);       /* r6 = D = c3*i5 - c5*i3 */ \
75
    paddw_r2r(r3, r0);        /* r0 = c1*i1 */ \
76
    pmulhw_r2r(r7, r3);       /* r3 = c7*i1 */ \
77 44cb64ee Mike Melanson
    movq_m2r(*I(2), r2); \
78 daae8699 Mike Melanson
    pmulhw_r2r(r1, r7);       /* r7 = c7*i7 */ \
79
    paddw_r2r(r1, r5);        /* r5 = c1*i7 */ \
80
    movq_r2r(r2, r1);         /* r1 = i2 */ \
81
    pmulhw_m2r(*C(2), r2);    /* r2 = c2*i2 - i2 */ \
82
    psubsw_r2r(r5, r3);       /* r3 = B = c7*i1 - c1*i7 */ \
83 44cb64ee Mike Melanson
    movq_m2r(*J(6), r5); \
84 daae8699 Mike Melanson
    paddsw_r2r(r7, r0);       /* r0 = A = c1*i1 + c7*i7 */ \
85
    movq_r2r(r5, r7);         /* r7 = i6 */ \
86
    psubsw_r2r(r4, r0);       /* r0 = A - C */ \
87
    pmulhw_m2r(*C(2), r5);    /* r5 = c2*i6 - i6 */ \
88
    paddw_r2r(r1, r2);        /* r2 = c2*i2 */ \
89
    pmulhw_m2r(*C(6), r1);    /* r1 = c6*i2 */ \
90
    paddsw_r2r(r4, r4);       /* r4 = C + C */ \
91
    paddsw_r2r(r0, r4);       /* r4 = C. = A + C */ \
92
    psubsw_r2r(r6, r3);       /* r3 = B - D */ \
93
    paddw_r2r(r7, r5);        /* r5 = c2*i6 */ \
94
    paddsw_r2r(r6, r6);       /* r6 = D + D */ \
95
    pmulhw_m2r(*C(6), r7);    /* r7 = c6*i6 */ \
96
    paddsw_r2r(r3, r6);       /* r6 = D. = B + D */ \
97
    movq_r2m(r4, *I(1));      /* save C. at I(1) */ \
98
    psubsw_r2r(r5, r1);       /* r1 = H = c6*i2 - c2*i6 */ \
99 44cb64ee Mike Melanson
    movq_m2r(*C(4), r4); \
100 daae8699 Mike Melanson
    movq_r2r(r3, r5);         /* r5 = B - D */ \
101
    pmulhw_r2r(r4, r3);       /* r3 = (c4 - 1) * (B - D) */ \
102
    paddsw_r2r(r2, r7);       /* r7 = G = c6*i6 + c2*i2 */ \
103
    movq_r2m(r6, *I(2));      /* save D. at I(2) */ \
104
    movq_r2r(r0, r2);         /* r2 = A - C */ \
105 44cb64ee Mike Melanson
    movq_m2r(*I(0), r6); \
106 daae8699 Mike Melanson
    pmulhw_r2r(r4, r0);       /* r0 = (c4 - 1) * (A - C) */ \
107
    paddw_r2r(r3, r5);        /* r5 = B. = c4 * (B - D) */ \
108 44cb64ee Mike Melanson
    movq_m2r(*J(4), r3); \
109 daae8699 Mike Melanson
    psubsw_r2r(r1, r5);       /* r5 = B.. = B. - H */ \
110
    paddw_r2r(r0, r2);        /* r0 = A. = c4 * (A - C) */ \
111
    psubsw_r2r(r3, r6);       /* r6 = i0 - i4 */ \
112 44cb64ee Mike Melanson
    movq_r2r(r6, r0); \
113 daae8699 Mike Melanson
    pmulhw_r2r(r4, r6);       /* r6 = (c4 - 1) * (i0 - i4) */ \
114
    paddsw_r2r(r3, r3);       /* r3 = i4 + i4 */ \
115
    paddsw_r2r(r1, r1);       /* r1 = H + H */ \
116
    paddsw_r2r(r0, r3);       /* r3 = i0 + i4 */ \
117
    paddsw_r2r(r5, r1);       /* r1 = H. = B + H */ \
118
    pmulhw_r2r(r3, r4);       /* r4 = (c4 - 1) * (i0 + i4) */ \
119
    paddsw_r2r(r0, r6);       /* r6 = F = c4 * (i0 - i4) */ \
120
    psubsw_r2r(r2, r6);       /* r6 = F. = F - A. */ \
121
    paddsw_r2r(r2, r2);       /* r2 = A. + A. */ \
122
    movq_m2r(*I(1), r0);      /* r0 = C. */ \
123
    paddsw_r2r(r6, r2);       /* r2 = A.. = F + A. */ \
124
    paddw_r2r(r3, r4);        /* r4 = E = c4 * (i0 + i4) */ \
125
    psubsw_r2r(r1, r2);       /* r2 = R2 = A.. - H. */ \
126
}
127 44cb64ee Mike Melanson
128
/* RowIDCT gets ready to transpose */
129 daae8699 Mike Melanson
#define RowIDCT() { \
130 44cb64ee Mike Melanson
    \
131 daae8699 Mike Melanson
    BeginIDCT(); \
132 44cb64ee Mike Melanson
    \
133 daae8699 Mike Melanson
    movq_m2r(*I(2), r3);   /* r3 = D. */ \
134
    psubsw_r2r(r7, r4);    /* r4 = E. = E - G */ \
135
    paddsw_r2r(r1, r1);    /* r1 = H. + H. */ \
136
    paddsw_r2r(r7, r7);    /* r7 = G + G */ \
137
    paddsw_r2r(r2, r1);    /* r1 = R1 = A.. + H. */ \
138
    paddsw_r2r(r4, r7);    /* r7 = G. = E + G */ \
139
    psubsw_r2r(r3, r4);    /* r4 = R4 = E. - D. */ \
140
    paddsw_r2r(r3, r3); \
141
    psubsw_r2r(r5, r6);    /* r6 = R6 = F. - B.. */ \
142 44cb64ee Mike Melanson
    paddsw_r2r(r5, r5); \
143 daae8699 Mike Melanson
    paddsw_r2r(r4, r3);    /* r3 = R3 = E. + D. */ \
144
    paddsw_r2r(r6, r5);    /* r5 = R5 = F. + B.. */ \
145
    psubsw_r2r(r0, r7);    /* r7 = R7 = G. - C. */ \
146 44cb64ee Mike Melanson
    paddsw_r2r(r0, r0); \
147 daae8699 Mike Melanson
    movq_r2m(r1, *I(1));   /* save R1 */ \
148
    paddsw_r2r(r7, r0);    /* r0 = R0 = G. + C. */ \
149
}
150 44cb64ee Mike Melanson
151
/* Column IDCT normalizes and stores final results */
152 daae8699 Mike Melanson
#define ColumnIDCT() { \
153 44cb64ee Mike Melanson
    \
154 daae8699 Mike Melanson
    BeginIDCT(); \
155 44cb64ee Mike Melanson
    \
156 daae8699 Mike Melanson
    paddsw_m2r(*Eight, r2);    /* adjust R2 (and R1) for shift */ \
157
    paddsw_r2r(r1, r1);        /* r1 = H. + H. */ \
158
    paddsw_r2r(r2, r1);        /* r1 = R1 = A.. + H. */ \
159
    psraw_i2r(4, r2);          /* r2 = NR2 */ \
160
    psubsw_r2r(r7, r4);        /* r4 = E. = E - G */ \
161
    psraw_i2r(4, r1);          /* r1 = NR1 */ \
162
    movq_m2r(*I(2), r3);       /* r3 = D. */ \
163
    paddsw_r2r(r7, r7);        /* r7 = G + G */ \
164
    movq_r2m(r2, *I(2));       /* store NR2 at I2 */ \
165
    paddsw_r2r(r4, r7);        /* r7 = G. = E + G */ \
166
    movq_r2m(r1, *I(1));       /* store NR1 at I1 */ \
167
    psubsw_r2r(r3, r4);        /* r4 = R4 = E. - D. */ \
168
    paddsw_m2r(*Eight, r4);    /* adjust R4 (and R3) for shift */ \
169
    paddsw_r2r(r3, r3);        /* r3 = D. + D. */ \
170
    paddsw_r2r(r4, r3);        /* r3 = R3 = E. + D. */ \
171
    psraw_i2r(4, r4);          /* r4 = NR4 */ \
172
    psubsw_r2r(r5, r6);        /* r6 = R6 = F. - B.. */ \
173
    psraw_i2r(4, r3);          /* r3 = NR3 */ \
174
    paddsw_m2r(*Eight, r6);    /* adjust R6 (and R5) for shift */ \
175
    paddsw_r2r(r5, r5);        /* r5 = B.. + B.. */ \
176
    paddsw_r2r(r6, r5);        /* r5 = R5 = F. + B.. */ \
177
    psraw_i2r(4, r6);          /* r6 = NR6 */ \
178
    movq_r2m(r4, *J(4));       /* store NR4 at J4 */ \
179
    psraw_i2r(4, r5);          /* r5 = NR5 */ \
180
    movq_r2m(r3, *I(3));       /* store NR3 at I3 */ \
181
    psubsw_r2r(r0, r7);        /* r7 = R7 = G. - C. */ \
182
    paddsw_m2r(*Eight, r7);    /* adjust R7 (and R0) for shift */ \
183
    paddsw_r2r(r0, r0);        /* r0 = C. + C. */ \
184
    paddsw_r2r(r7, r0);        /* r0 = R0 = G. + C. */ \
185
    psraw_i2r(4, r7);          /* r7 = NR7 */ \
186
    movq_r2m(r6, *J(6));       /* store NR6 at J6 */ \
187
    psraw_i2r(4, r0);          /* r0 = NR0 */ \
188
    movq_r2m(r5, *J(5));       /* store NR5 at J5 */ \
189
    movq_r2m(r7, *J(7));       /* store NR7 at J7 */ \
190
    movq_r2m(r0, *I(0));       /* store NR0 at I0 */ \
191
}
192 44cb64ee Mike Melanson
193
/* Following macro does two 4x4 transposes in place.
194

195
  At entry (we assume):
196

197 daae8699 Mike Melanson
    r0 = a3 a2 a1 a0
198
    I(1) = b3 b2 b1 b0
199
    r2 = c3 c2 c1 c0
200
    r3 = d3 d2 d1 d0
201 44cb64ee Mike Melanson

202 daae8699 Mike Melanson
    r4 = e3 e2 e1 e0
203
    r5 = f3 f2 f1 f0
204
    r6 = g3 g2 g1 g0
205
    r7 = h3 h2 h1 h0
206 44cb64ee Mike Melanson

207 daae8699 Mike Melanson
  At exit, we have:
208 44cb64ee Mike Melanson

209 daae8699 Mike Melanson
    I(0) = d0 c0 b0 a0
210
    I(1) = d1 c1 b1 a1
211
    I(2) = d2 c2 b2 a2
212
    I(3) = d3 c3 b3 a3
213 115329f1 Diego Biurrun

214 daae8699 Mike Melanson
    J(4) = h0 g0 f0 e0
215
    J(5) = h1 g1 f1 e1
216
    J(6) = h2 g2 f2 e2
217
    J(7) = h3 g3 f3 e3
218 44cb64ee Mike Melanson

219
   I(0) I(1) I(2) I(3)  is the transpose of r0 I(1) r2 r3.
220
   J(4) J(5) J(6) J(7)  is the transpose of r4 r5 r6 r7.
221

222
   Since r1 is free at entry, we calculate the Js first. */
223
224 daae8699 Mike Melanson
#define Transpose() { \
225
    movq_r2r(r4, r1);         /* r1 = e3 e2 e1 e0 */ \
226
    punpcklwd_r2r(r5, r4);    /* r4 = f1 e1 f0 e0 */ \
227
    movq_r2m(r0, *I(0));      /* save a3 a2 a1 a0 */ \
228
    punpckhwd_r2r(r5, r1);    /* r1 = f3 e3 f2 e2 */ \
229
    movq_r2r(r6, r0);         /* r0 = g3 g2 g1 g0 */ \
230
    punpcklwd_r2r(r7, r6);    /* r6 = h1 g1 h0 g0 */ \
231
    movq_r2r(r4, r5);         /* r5 = f1 e1 f0 e0 */ \
232
    punpckldq_r2r(r6, r4);    /* r4 = h0 g0 f0 e0 = R4 */ \
233
    punpckhdq_r2r(r6, r5);    /* r5 = h1 g1 f1 e1 = R5 */ \
234
    movq_r2r(r1, r6);         /* r6 = f3 e3 f2 e2 */ \
235 44cb64ee Mike Melanson
    movq_r2m(r4, *J(4)); \
236 daae8699 Mike Melanson
    punpckhwd_r2r(r7, r0);    /* r0 = h3 g3 h2 g2 */ \
237 44cb64ee Mike Melanson
    movq_r2m(r5, *J(5)); \
238 daae8699 Mike Melanson
    punpckhdq_r2r(r0, r6);    /* r6 = h3 g3 f3 e3 = R7 */ \
239
    movq_m2r(*I(0), r4);      /* r4 = a3 a2 a1 a0 */ \
240
    punpckldq_r2r(r0, r1);    /* r1 = h2 g2 f2 e2 = R6 */ \
241
    movq_m2r(*I(1), r5);      /* r5 = b3 b2 b1 b0 */ \
242
    movq_r2r(r4, r0);         /* r0 = a3 a2 a1 a0 */ \
243 44cb64ee Mike Melanson
    movq_r2m(r6, *J(7)); \
244 daae8699 Mike Melanson
    punpcklwd_r2r(r5, r0);    /* r0 = b1 a1 b0 a0 */ \
245 44cb64ee Mike Melanson
    movq_r2m(r1, *J(6)); \
246 daae8699 Mike Melanson
    punpckhwd_r2r(r5, r4);    /* r4 = b3 a3 b2 a2 */ \
247
    movq_r2r(r2, r5);         /* r5 = c3 c2 c1 c0 */ \
248
    punpcklwd_r2r(r3, r2);    /* r2 = d1 c1 d0 c0 */ \
249
    movq_r2r(r0, r1);         /* r1 = b1 a1 b0 a0 */ \
250
    punpckldq_r2r(r2, r0);    /* r0 = d0 c0 b0 a0 = R0 */ \
251
    punpckhdq_r2r(r2, r1);    /* r1 = d1 c1 b1 a1 = R1 */ \
252
    movq_r2r(r4, r2);         /* r2 = b3 a3 b2 a2 */ \
253 44cb64ee Mike Melanson
    movq_r2m(r0, *I(0)); \
254 daae8699 Mike Melanson
    punpckhwd_r2r(r3, r5);    /* r5 = d3 c3 d2 c2 */ \
255 44cb64ee Mike Melanson
    movq_r2m(r1, *I(1)); \
256 daae8699 Mike Melanson
    punpckhdq_r2r(r5, r4);    /* r4 = d3 c3 b3 a3 = R3 */ \
257
    punpckldq_r2r(r5, r2);    /* r2 = d2 c2 b2 a2 = R2 */ \
258 44cb64ee Mike Melanson
    movq_r2m(r4, *I(3)); \
259 daae8699 Mike Melanson
    movq_r2m(r2, *I(2)); \
260
}
261 44cb64ee Mike Melanson
262 5773a746 Michael Niedermayer
void ff_vp3_dsp_init_mmx(void)
263 44cb64ee Mike Melanson
{
264
    int j = 16;
265
    uint16_t *p;
266
267
    j = 1;
268
    do {
269
        p = idct_constants + ((j + 3) << 2);
270
        p[0] = p[1] = p[2] = p[3] = idct_cosine_table[j - 1];
271
    } while (++j <= 7);
272
273 daae8699 Mike Melanson
    idct_constants[44] = idct_constants[45] =
274 44cb64ee Mike Melanson
    idct_constants[46] = idct_constants[47] = IdctAdjustBeforeShift;
275
}
276
277 5773a746 Michael Niedermayer
void ff_vp3_idct_mmx(int16_t *output_data)
278 44cb64ee Mike Melanson
{
279
    /* eax = quantized input
280
     * ebx = dequantizer matrix
281
     * ecx = IDCT constants
282
     *  M(I) = ecx + MaskOffset(0) + I * 8
283
     *  C(I) = ecx + CosineOffset(32) + (I-1) * 8
284
     * edx = output
285
     * r0..r7 = mm0..mm7
286
     */
287
288
#define C(x) (idct_constants + 16 + (x - 1) * 4)
289
#define Eight (idct_constants + 44)
290
291 daae8699 Mike Melanson
    /* at this point, function has completed dequantization + dezigzag +
292 44cb64ee Mike Melanson
     * partial transposition; now do the idct itself */
293
#define I(K) (output_data + K * 8)
294
#define J(K) (output_data + ((K - 4) * 8) + 4)
295
296
    RowIDCT();
297
    Transpose();
298
299
#undef I
300
#undef J
301
#define I(K) (output_data + (K * 8) + 32)
302
#define J(K) (output_data + ((K - 4) * 8) + 36)
303
304
    RowIDCT();
305
    Transpose();
306
307
#undef I
308
#undef J
309
#define I(K) (output_data + K * 8)
310
#define J(K) (output_data + K * 8)
311
312
    ColumnIDCT();
313
314
#undef I
315
#undef J
316
#define I(K) (output_data + (K * 8) + 4)
317
#define J(K) (output_data + (K * 8) + 4)
318
319
    ColumnIDCT();
320
321
#undef I
322
#undef J
323
324
}
325 5b0b7054 Aurelien Jacobs
326
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
327
{
328
    ff_vp3_idct_mmx(block);
329
    put_signed_pixels_clamped_mmx(block, dest, line_size);
330
}
331
332
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
333
{
334
    ff_vp3_idct_mmx(block);
335
    add_pixels_clamped_mmx(block, dest, line_size);
336
}