Statistics
| Branch: | Revision:

ffmpeg / libavcodec / i386 / vp3dsp_mmx.c @ be449fca

History | View | Annotate | Download (12 KB)

1 44cb64ee Mike Melanson
/*
2
 * Copyright (C) 2004 the ffmpeg project
3
 *
4 b78e7197 Diego Biurrun
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or
7 44cb64ee Mike Melanson
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9 b78e7197 Diego Biurrun
 * version 2.1 of the License, or (at your option) any later version.
10 44cb64ee Mike Melanson
 *
11 b78e7197 Diego Biurrun
 * FFmpeg is distributed in the hope that it will be useful,
12 44cb64ee Mike Melanson
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17 b78e7197 Diego Biurrun
 * License along with FFmpeg; if not, write to the Free Software
18 5509bffa Diego Biurrun
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 44cb64ee Mike Melanson
 */
20
21
/**
22
 * @file vp3dsp_mmx.c
23
 * MMX-optimized functions cribbed from the original VP3 source code.
24
 */
25
26 245976da Diego Biurrun
#include "libavcodec/dsputil.h"
27 167029a7 David Conrad
#include "dsputil_mmx.h"
28 44cb64ee Mike Melanson
29 b4c3d835 David Conrad
/* Constant table for the IDCT, defined outside this file.  ff_vp3_idct_mmx()
 * passes it to the inline asm as operand %1 and addresses constant C(k) at
 * byte offset 16*(k-1) from its start. */
extern const uint16_t ff_vp3_idct_data[];
30 44cb64ee Mike Melanson
31
/* from original comments: The Macro does IDct on 4 1-D Dcts
 *
 * Common first stage shared by RowIDCT() and ColumnIDCT().  In the comments
 * r0..r7 stand for mm0..mm7 and iN for the input word vector number N.
 *
 * Reads:   I(0) I(1) I(2) I(3) J(4) J(5) J(6) J(7) and the constants C(1)..C(7).
 * Scratch: I(1) is overwritten with C. and I(2) with D. .
 * On exit: r0 = C.   r1 = H.   r2 = R2   r4 = E
 *          r5 = B..  r6 = F.   r7 = G    I(2) = D.
 *          (r3 holds i0 + i4, which is no longer needed.)
 *
 * I(), J() and C() must be defined by the code that expands this macro. */
32 437e3f4d David Conrad
#define BeginIDCT() \
33
    "movq   "I(3)", %%mm2 \n\t"    /* r2 = i3 */ \
34
    "movq   "C(3)", %%mm6 \n\t"    /* r6 = c3 */ \
35
    "movq    %%mm2, %%mm4 \n\t"    /* r4 = i3 */ \
36
    "movq   "J(5)", %%mm7 \n\t"    /* r7 = i5 */ \
37
    "pmulhw  %%mm6, %%mm4 \n\t"    /* r4 = c3*i3 - i3 */ \
38
    "movq   "C(5)", %%mm1 \n\t"    /* r1 = c5 */ \
39
    "pmulhw  %%mm7, %%mm6 \n\t"    /* r6 = c3*i5 - i5 */ \
40
    "movq    %%mm1, %%mm5 \n\t"    /* r5 = c5 */ \
41
    "pmulhw  %%mm2, %%mm1 \n\t"    /* r1 = c5*i3 - i3 */ \
42
    "movq   "I(1)", %%mm3 \n\t"    /* r3 = i1 */ \
43
    "pmulhw  %%mm7, %%mm5 \n\t"    /* r5 = c5*i5 - i5 */ \
44
    "movq   "C(1)", %%mm0 \n\t"    /* r0 = c1 */ \
45
    "paddw   %%mm2, %%mm4 \n\t"    /* r4 = c3*i3 */ \
46
    "paddw   %%mm7, %%mm6 \n\t"    /* r6 = c3*i5 */ \
47
    "paddw   %%mm1, %%mm2 \n\t"    /* r2 = c5*i3 */ \
48
    "movq   "J(7)", %%mm1 \n\t"    /* r1 = i7 */ \
49
    "paddw   %%mm5, %%mm7 \n\t"    /* r7 = c5*i5 */ \
50
    "movq    %%mm0, %%mm5 \n\t"    /* r5 = c1 */ \
51
    "pmulhw  %%mm3, %%mm0 \n\t"    /* r0 = c1*i1 - i1 */ \
52
    "paddsw  %%mm7, %%mm4 \n\t"    /* r4 = C = c3*i3 + c5*i5 */ \
53
    "pmulhw  %%mm1, %%mm5 \n\t"    /* r5 = c1*i7 - i7 */ \
54
    "movq   "C(7)", %%mm7 \n\t"    /* r7 = c7 */ \
55
    "psubsw  %%mm2, %%mm6 \n\t"    /* r6 = D = c3*i5 - c5*i3 */ \
56
    "paddw   %%mm3, %%mm0 \n\t"    /* r0 = c1*i1 */ \
57
    "pmulhw  %%mm7, %%mm3 \n\t"    /* r3 = c7*i1 */ \
58
    "movq   "I(2)", %%mm2 \n\t"    /* r2 = i2 */ \
59
    "pmulhw  %%mm1, %%mm7 \n\t"    /* r7 = c7*i7 */ \
60
    "paddw   %%mm1, %%mm5 \n\t"    /* r5 = c1*i7 */ \
61
    "movq    %%mm2, %%mm1 \n\t"    /* r1 = i2 */ \
62
    "pmulhw "C(2)", %%mm2 \n\t"    /* r2 = c2*i2 - i2 */ \
63
    "psubsw  %%mm5, %%mm3 \n\t"    /* r3 = B = c7*i1 - c1*i7 */ \
64
    "movq   "J(6)", %%mm5 \n\t"    /* r5 = i6 */ \
65
    "paddsw  %%mm7, %%mm0 \n\t"    /* r0 = A = c1*i1 + c7*i7 */ \
66
    "movq    %%mm5, %%mm7 \n\t"    /* r7 = i6 */ \
67
    "psubsw  %%mm4, %%mm0 \n\t"    /* r0 = A - C */ \
68
    "pmulhw "C(2)", %%mm5 \n\t"    /* r5 = c2*i6 - i6 */ \
69
    "paddw   %%mm1, %%mm2 \n\t"    /* r2 = c2*i2 */ \
70
    "pmulhw "C(6)", %%mm1 \n\t"    /* r1 = c6*i2 */ \
71
    "paddsw  %%mm4, %%mm4 \n\t"    /* r4 = C + C */ \
72
    "paddsw  %%mm0, %%mm4 \n\t"    /* r4 = C. = A + C */ \
73
    "psubsw  %%mm6, %%mm3 \n\t"    /* r3 = B - D */ \
74
    "paddw   %%mm7, %%mm5 \n\t"    /* r5 = c2*i6 */ \
75
    "paddsw  %%mm6, %%mm6 \n\t"    /* r6 = D + D */ \
76
    "pmulhw "C(6)", %%mm7 \n\t"    /* r7 = c6*i6 */ \
77
    "paddsw  %%mm3, %%mm6 \n\t"    /* r6 = D. = B + D */ \
78
    "movq    %%mm4, "I(1)"\n\t"    /* save C. at I(1) */ \
79
    "psubsw  %%mm5, %%mm1 \n\t"    /* r1 = H = c6*i2 - c2*i6 */ \
80
    "movq   "C(4)", %%mm4 \n\t"    /* r4 = c4 */ \
81
    "movq    %%mm3, %%mm5 \n\t"    /* r5 = B - D */ \
82
    "pmulhw  %%mm4, %%mm3 \n\t"    /* r3 = (c4 - 1) * (B - D) */ \
83
    "paddsw  %%mm2, %%mm7 \n\t"    /* r7 = G = c6*i6 + c2*i2 */ \
84
    "movq    %%mm6, "I(2)"\n\t"    /* save D. at I(2) */ \
85
    "movq    %%mm0, %%mm2 \n\t"    /* r2 = A - C */ \
86
    "movq   "I(0)", %%mm6 \n\t"    /* r6 = i0 */ \
87
    "pmulhw  %%mm4, %%mm0 \n\t"    /* r0 = (c4 - 1) * (A - C) */ \
88
    "paddw   %%mm3, %%mm5 \n\t"    /* r5 = B. = c4 * (B - D) */ \
89
    "movq   "J(4)", %%mm3 \n\t"    /* r3 = i4 */ \
90
    "psubsw  %%mm1, %%mm5 \n\t"    /* r5 = B.. = B. - H */ \
91
    "paddw   %%mm0, %%mm2 \n\t"    /* r2 = A. = c4 * (A - C) */ \
92
    "psubsw  %%mm3, %%mm6 \n\t"    /* r6 = i0 - i4 */ \
93
    "movq    %%mm6, %%mm0 \n\t"    /* r0 = i0 - i4 */ \
94
    "pmulhw  %%mm4, %%mm6 \n\t"    /* r6 = (c4 - 1) * (i0 - i4) */ \
95
    "paddsw  %%mm3, %%mm3 \n\t"    /* r3 = i4 + i4 */ \
96
    "paddsw  %%mm1, %%mm1 \n\t"    /* r1 = H + H */ \
97
    "paddsw  %%mm0, %%mm3 \n\t"    /* r3 = i0 + i4 */ \
98
    "paddsw  %%mm5, %%mm1 \n\t"    /* r1 = H. = B. + H */ \
99
    "pmulhw  %%mm3, %%mm4 \n\t"    /* r4 = (c4 - 1) * (i0 + i4) */ \
100
    "paddsw  %%mm0, %%mm6 \n\t"    /* r6 = F = c4 * (i0 - i4) */ \
101
    "psubsw  %%mm2, %%mm6 \n\t"    /* r6 = F. = F - A. */ \
102
    "paddsw  %%mm2, %%mm2 \n\t"    /* r2 = A. + A. */ \
103
    "movq   "I(1)", %%mm0 \n\t"    /* r0 = C. */ \
104
    "paddsw  %%mm6, %%mm2 \n\t"    /* r2 = A.. = F + A. */ \
105
    "paddw   %%mm3, %%mm4 \n\t"    /* r4 = E = c4 * (i0 + i4) */ \
106
    "psubsw  %%mm1, %%mm2 \n\t"    /* r2 = R2 = A.. - H. */
107 44cb64ee Mike Melanson
108
/* RowIDCT gets ready to transpose.
 *
 * Runs BeginIDCT() and finishes the butterflies without any rounding or
 * shifting.  On exit the eight result vectors are laid out the way
 * Transpose() expects them:
 *   r0 = R0, I(1) = R1, r2 = R2, r3 = R3, r4 = R4, r5 = R5, r6 = R6, r7 = R7 */
109 437e3f4d David Conrad
#define RowIDCT() \
110
    BeginIDCT() \
111
    "movq   "I(2)", %%mm3 \n\t"    /* r3 = D. */ \
112
    "psubsw  %%mm7, %%mm4 \n\t"    /* r4 = E. = E - G */ \
113
    "paddsw  %%mm1, %%mm1 \n\t"    /* r1 = H. + H. */ \
114
    "paddsw  %%mm7, %%mm7 \n\t"    /* r7 = G + G */ \
115
    "paddsw  %%mm2, %%mm1 \n\t"    /* r1 = R1 = A.. + H. */ \
116
    "paddsw  %%mm4, %%mm7 \n\t"    /* r7 = G. = E + G */ \
117
    "psubsw  %%mm3, %%mm4 \n\t"    /* r4 = R4 = E. - D. */ \
118
    "paddsw  %%mm3, %%mm3 \n\t"    /* r3 = D. + D. */ \
119
    "psubsw  %%mm5, %%mm6 \n\t"    /* r6 = R6 = F. - B.. */ \
120
    "paddsw  %%mm5, %%mm5 \n\t"    /* r5 = B.. + B.. */ \
121
    "paddsw  %%mm4, %%mm3 \n\t"    /* r3 = R3 = E. + D. */ \
122
    "paddsw  %%mm6, %%mm5 \n\t"    /* r5 = R5 = F. + B.. */ \
123
    "psubsw  %%mm0, %%mm7 \n\t"    /* r7 = R7 = G. - C. */ \
124
    "paddsw  %%mm0, %%mm0 \n\t"    /* r0 = C. + C. */ \
125
    "movq    %%mm1, "I(1)"\n\t"    /* save R1 */ \
126
    "paddsw  %%mm7, %%mm0 \n\t"    /* r0 = R0 = G. + C. */
127 44cb64ee Mike Melanson
128
/* Column IDCT normalizes and stores final results.
 *
 * Runs BeginIDCT(), then for each pair of outputs adds OC_8 to the difference
 * term before forming the sum term from it (so both carry the bias), shifts
 * both right arithmetically by 4 and stores the normalized results NR0..NR7
 * to I(0) I(1) I(2) I(3) J(4) J(5) J(6) J(7).
 *
 * OC_8 must be defined by the code that expands this macro. */
129 437e3f4d David Conrad
#define ColumnIDCT() \
130
    BeginIDCT() \
131
    "paddsw "OC_8", %%mm2 \n\t"    /* adjust R2 (and R1) for shift */ \
132
    "paddsw  %%mm1, %%mm1 \n\t"    /* r1 = H. + H. */ \
133
    "paddsw  %%mm2, %%mm1 \n\t"    /* r1 = R1 = A.. + H. */ \
134
    "psraw      $4, %%mm2 \n\t"    /* r2 = NR2 */ \
135
    "psubsw  %%mm7, %%mm4 \n\t"    /* r4 = E. = E - G */ \
136
    "psraw      $4, %%mm1 \n\t"    /* r1 = NR1 */ \
137
    "movq   "I(2)", %%mm3 \n\t"    /* r3 = D. */ \
138
    "paddsw  %%mm7, %%mm7 \n\t"    /* r7 = G + G */ \
139
    "movq    %%mm2, "I(2)"\n\t"    /* store NR2 at I2 */ \
140
    "paddsw  %%mm4, %%mm7 \n\t"    /* r7 = G. = E + G */ \
141
    "movq    %%mm1, "I(1)"\n\t"    /* store NR1 at I1 */ \
142
    "psubsw  %%mm3, %%mm4 \n\t"    /* r4 = R4 = E. - D. */ \
143
    "paddsw "OC_8", %%mm4 \n\t"    /* adjust R4 (and R3) for shift */ \
144
    "paddsw  %%mm3, %%mm3 \n\t"    /* r3 = D. + D. */ \
145
    "paddsw  %%mm4, %%mm3 \n\t"    /* r3 = R3 = E. + D. */ \
146
    "psraw      $4, %%mm4 \n\t"    /* r4 = NR4 */ \
147
    "psubsw  %%mm5, %%mm6 \n\t"    /* r6 = R6 = F. - B.. */ \
148
    "psraw      $4, %%mm3 \n\t"    /* r3 = NR3 */ \
149
    "paddsw "OC_8", %%mm6 \n\t"    /* adjust R6 (and R5) for shift */ \
150
    "paddsw  %%mm5, %%mm5 \n\t"    /* r5 = B.. + B.. */ \
151
    "paddsw  %%mm6, %%mm5 \n\t"    /* r5 = R5 = F. + B.. */ \
152
    "psraw      $4, %%mm6 \n\t"    /* r6 = NR6 */ \
153
    "movq    %%mm4, "J(4)"\n\t"    /* store NR4 at J4 */ \
154
    "psraw      $4, %%mm5 \n\t"    /* r5 = NR5 */ \
155
    "movq    %%mm3, "I(3)"\n\t"    /* store NR3 at I3 */ \
156
    "psubsw  %%mm0, %%mm7 \n\t"    /* r7 = R7 = G. - C. */ \
157
    "paddsw "OC_8", %%mm7 \n\t"    /* adjust R7 (and R0) for shift */ \
158
    "paddsw  %%mm0, %%mm0 \n\t"    /* r0 = C. + C. */ \
159
    "paddsw  %%mm7, %%mm0 \n\t"    /* r0 = R0 = G. + C. */ \
160
    "psraw      $4, %%mm7 \n\t"    /* r7 = NR7 */ \
161
    "movq    %%mm6, "J(6)"\n\t"    /* store NR6 at J6 */ \
162
    "psraw      $4, %%mm0 \n\t"    /* r0 = NR0 */ \
163
    "movq    %%mm5, "J(5)"\n\t"    /* store NR5 at J5 */ \
164
    "movq    %%mm7, "J(7)"\n\t"    /* store NR7 at J7 */ \
165
    "movq    %%mm0, "I(0)"\n\t"    /* store NR0 at I0 */
166 44cb64ee Mike Melanson
167
/* Following macro does two 4x4 transposes in place.
 *
 * At entry (we assume):
 *
 *   r0   = a3 a2 a1 a0
 *   I(1) = b3 b2 b1 b0
 *   r2   = c3 c2 c1 c0
 *   r3   = d3 d2 d1 d0
 *
 *   r4   = e3 e2 e1 e0
 *   r5   = f3 f2 f1 f0
 *   r6   = g3 g2 g1 g0
 *   r7   = h3 h2 h1 h0
 *
 * At exit, we have:
 *
 *   I(0) = d0 c0 b0 a0
 *   I(1) = d1 c1 b1 a1
 *   I(2) = d2 c2 b2 a2
 *   I(3) = d3 c3 b3 a3
 *
 *   J(4) = h0 g0 f0 e0
 *   J(5) = h1 g1 f1 e1
 *   J(6) = h2 g2 f2 e2
 *   J(7) = h3 g3 f3 e3
 *
 * I(0) I(1) I(2) I(3)  is the transpose of r0 I(1) r2 r3.
 * J(4) J(5) J(6) J(7)  is the transpose of r4 r5 r6 r7.
 *
 * Since r1 is free at entry, we calculate the Js first.  I(0) is used as a
 * temporary home for r0 while the Js are being computed. */
197 437e3f4d David Conrad
#define Transpose() \
198
    "movq       %%mm4, %%mm1 \n\t"    /* r1 = e3 e2 e1 e0 */ \
199
    "punpcklwd  %%mm5, %%mm4 \n\t"    /* r4 = f1 e1 f0 e0 */ \
200
    "movq       %%mm0, "I(0)"\n\t"    /* save a3 a2 a1 a0 */ \
201
    "punpckhwd  %%mm5, %%mm1 \n\t"    /* r1 = f3 e3 f2 e2 */ \
202
    "movq       %%mm6, %%mm0 \n\t"    /* r0 = g3 g2 g1 g0 */ \
203
    "punpcklwd  %%mm7, %%mm6 \n\t"    /* r6 = h1 g1 h0 g0 */ \
204
    "movq       %%mm4, %%mm5 \n\t"    /* r5 = f1 e1 f0 e0 */ \
205
    "punpckldq  %%mm6, %%mm4 \n\t"    /* r4 = h0 g0 f0 e0 = R4 */ \
206
    "punpckhdq  %%mm6, %%mm5 \n\t"    /* r5 = h1 g1 f1 e1 = R5 */ \
207
    "movq       %%mm1, %%mm6 \n\t"    /* r6 = f3 e3 f2 e2 */ \
208
    "movq       %%mm4, "J(4)"\n\t"    /* store R4 at J(4) */ \
209
    "punpckhwd  %%mm7, %%mm0 \n\t"    /* r0 = h3 g3 h2 g2 */ \
210
    "movq       %%mm5, "J(5)"\n\t"    /* store R5 at J(5) */ \
211
    "punpckhdq  %%mm0, %%mm6 \n\t"    /* r6 = h3 g3 f3 e3 = R7 */ \
212
    "movq      "I(0)", %%mm4 \n\t"    /* r4 = a3 a2 a1 a0 */ \
213
    "punpckldq  %%mm0, %%mm1 \n\t"    /* r1 = h2 g2 f2 e2 = R6 */ \
214
    "movq      "I(1)", %%mm5 \n\t"    /* r5 = b3 b2 b1 b0 */ \
215
    "movq       %%mm4, %%mm0 \n\t"    /* r0 = a3 a2 a1 a0 */ \
216
    "movq       %%mm6, "J(7)"\n\t"    /* store R7 at J(7) */ \
217
    "punpcklwd  %%mm5, %%mm0 \n\t"    /* r0 = b1 a1 b0 a0 */ \
218
    "movq       %%mm1, "J(6)"\n\t"    /* store R6 at J(6) */ \
219
    "punpckhwd  %%mm5, %%mm4 \n\t"    /* r4 = b3 a3 b2 a2 */ \
220
    "movq       %%mm2, %%mm5 \n\t"    /* r5 = c3 c2 c1 c0 */ \
221
    "punpcklwd  %%mm3, %%mm2 \n\t"    /* r2 = d1 c1 d0 c0 */ \
222
    "movq       %%mm0, %%mm1 \n\t"    /* r1 = b1 a1 b0 a0 */ \
223
    "punpckldq  %%mm2, %%mm0 \n\t"    /* r0 = d0 c0 b0 a0 = R0 */ \
224
    "punpckhdq  %%mm2, %%mm1 \n\t"    /* r1 = d1 c1 b1 a1 = R1 */ \
225
    "movq       %%mm4, %%mm2 \n\t"    /* r2 = b3 a3 b2 a2 */ \
226
    "movq       %%mm0, "I(0)"\n\t"    /* store R0 at I(0) */ \
227
    "punpckhwd  %%mm3, %%mm5 \n\t"    /* r5 = d3 c3 d2 c2 */ \
228
    "movq       %%mm1, "I(1)"\n\t"    /* store R1 at I(1) */ \
229
    "punpckhdq  %%mm5, %%mm4 \n\t"    /* r4 = d3 c3 b3 a3 = R3 */ \
230
    "punpckldq  %%mm5, %%mm2 \n\t"    /* r2 = d2 c2 b2 a2 = R2 */ \
231
    "movq       %%mm4, "I(3)"\n\t"    /* store R3 at I(3) */ \
232
    "movq       %%mm2, "I(2)"\n\t"    /* store R2 at I(2) */
233 44cb64ee Mike Melanson
234 5773a746 Michael Niedermayer
void ff_vp3_idct_mmx(int16_t *output_data)
235 44cb64ee Mike Melanson
{
236
    /* eax = quantized input
237
     * ebx = dequantizer matrix
238
     * ecx = IDCT constants
239
     *  M(I) = ecx + MaskOffset(0) + I * 8
240
     *  C(I) = ecx + CosineOffset(32) + (I-1) * 8
241
     * edx = output
242
     * r0..r7 = mm0..mm7
243
     */
244
245 437e3f4d David Conrad
#define C(x) AV_STRINGIFY(16*(x-1))"(%1)"
246
#define OC_8 "%2"
247 44cb64ee Mike Melanson
248 daae8699 Mike Melanson
    /* at this point, function has completed dequantization + dezigzag +
249 44cb64ee Mike Melanson
     * partial transposition; now do the idct itself */
250 437e3f4d David Conrad
#define I(x) AV_STRINGIFY(16* x       )"(%0)"
251
#define J(x) AV_STRINGIFY(16*(x-4) + 8)"(%0)"
252 44cb64ee Mike Melanson
253 be449fca Diego Pettenò
    __asm__ volatile (
254 c3c5bba1 David Conrad
        RowIDCT()
255
        Transpose()
256 44cb64ee Mike Melanson
257
#undef I
258
#undef J
259 437e3f4d David Conrad
#define I(x) AV_STRINGIFY(16* x    + 64)"(%0)"
260
#define J(x) AV_STRINGIFY(16*(x-4) + 72)"(%0)"
261 44cb64ee Mike Melanson
262 c3c5bba1 David Conrad
        RowIDCT()
263
        Transpose()
264 44cb64ee Mike Melanson
265
#undef I
266
#undef J
267 437e3f4d David Conrad
#define I(x) AV_STRINGIFY(16*x)"(%0)"
268
#define J(x) AV_STRINGIFY(16*x)"(%0)"
269 44cb64ee Mike Melanson
270 c3c5bba1 David Conrad
        ColumnIDCT()
271 44cb64ee Mike Melanson
272
#undef I
273
#undef J
274 437e3f4d David Conrad
#define I(x) AV_STRINGIFY(16*x + 8)"(%0)"
275
#define J(x) AV_STRINGIFY(16*x + 8)"(%0)"
276 44cb64ee Mike Melanson
277 c3c5bba1 David Conrad
        ColumnIDCT()
278 437e3f4d David Conrad
        :: "r"(output_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8)
279
    );
280 44cb64ee Mike Melanson
#undef I
281
#undef J
282
283
}
284 5b0b7054 Aurelien Jacobs
285
/**
 * Inverse-transform block in place with ff_vp3_idct_mmx(), then hand the
 * result to put_signed_pixels_clamped_mmx() together with dest and line_size.
 *
 * @param dest      destination passed through to put_signed_pixels_clamped_mmx()
 * @param line_size stride passed through to put_signed_pixels_clamped_mmx()
 * @param block     coefficient block; overwritten by the IDCT
 */
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    /* Step 1: transform the coefficients in place. */
    ff_vp3_idct_mmx(block);

    /* Step 2: write the transformed block out to dest. */
    put_signed_pixels_clamped_mmx(block, dest, line_size);
}
290
291
/**
 * Inverse-transform block in place with ff_vp3_idct_mmx(), then hand the
 * result to add_pixels_clamped_mmx() together with dest and line_size.
 *
 * @param dest      destination passed through to add_pixels_clamped_mmx()
 * @param line_size stride passed through to add_pixels_clamped_mmx()
 * @param block     coefficient block; overwritten by the IDCT
 */
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    /* Step 1: transform the coefficients in place. */
    ff_vp3_idct_mmx(block);

    /* Step 2: combine the transformed block with what dest already holds. */
    add_pixels_clamped_mmx(block, dest, line_size);
}