/*
 * Copyright (C) 2004 the ffmpeg project
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file libavcodec/x86/vp3dsp_mmx.c
 * MMX-optimized functions cribbed from the original VP3 source code.
 */

#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "dsputil_mmx.h"
#include "vp3dsp_mmx.h"

extern const uint16_t ff_vp3_idct_data[];

// this is off by one or two for some cases when filter_limit is greater than 63
// in:  p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
// out: p1 in mm4, p2 in mm3
#define VP3_LOOP_FILTER(flim) \
    "movq       %%mm6, %%mm7 \n\t" \
    "pand    "MANGLE(ff_pb_7 )", %%mm6 \n\t" /* p0&7 */ \
    "psrlw         $3, %%mm7 \n\t" \
    "pand    "MANGLE(ff_pb_1F)", %%mm7 \n\t" /* p0>>3 */ \
    "movq       %%mm2, %%mm3 \n\t" /* mm3 = p2 */ \
    "pxor       %%mm4, %%mm2 \n\t" \
    "pand    "MANGLE(ff_pb_1 )", %%mm2 \n\t" /* (p2^p1)&1 */ \
    "movq       %%mm2, %%mm5 \n\t" \
    "paddb      %%mm2, %%mm2 \n\t" \
    "paddb      %%mm5, %%mm2 \n\t" /* 3*(p2^p1)&1 */ \
    "paddb      %%mm6, %%mm2 \n\t" /* extra bits lost in shifts */ \
    "pcmpeqb    %%mm0, %%mm0 \n\t" \
    "pxor       %%mm0, %%mm1 \n\t" /* 255 - p3 */ \
    "pavgb      %%mm2, %%mm1 \n\t" /* (256 - p3 + extrabits) >> 1 */ \
    "pxor       %%mm4, %%mm0 \n\t" /* 255 - p1 */ \
    "pavgb      %%mm3, %%mm0 \n\t" /* (256 + p2-p1) >> 1 */ \
    "paddb   "MANGLE(ff_pb_3 )", %%mm1 \n\t" \
    "pavgb      %%mm0, %%mm1 \n\t" /* 128+2+(   p2-p1  - p3) >> 2 */ \
    "pavgb      %%mm0, %%mm1 \n\t" /* 128+1+(3*(p2-p1) - p3) >> 3 */ \
    "paddusb    %%mm1, %%mm7 \n\t" /* d+128+1 */ \
    "movq    "MANGLE(ff_pb_81)", %%mm6 \n\t" \
    "psubusb    %%mm7, %%mm6 \n\t" \
    "psubusb "MANGLE(ff_pb_81)", %%mm7 \n\t" \
\
    "movq     "#flim", %%mm5 \n\t" \
    "pminub     %%mm5, %%mm6 \n\t" \
    "pminub     %%mm5, %%mm7 \n\t" \
    "movq       %%mm6, %%mm0 \n\t" \
    "movq       %%mm7, %%mm1 \n\t" \
    "paddb      %%mm6, %%mm6 \n\t" \
    "paddb      %%mm7, %%mm7 \n\t" \
    "pminub     %%mm5, %%mm6 \n\t" \
    "pminub     %%mm5, %%mm7 \n\t" \
    "psubb      %%mm0, %%mm6 \n\t" \
    "psubb      %%mm1, %%mm7 \n\t" \
    "paddusb    %%mm7, %%mm4 \n\t" \
    "psubusb    %%mm6, %%mm4 \n\t" \
    "psubusb    %%mm7, %%mm3 \n\t" \
    "paddusb    %%mm6, %%mm3 \n\t"
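
/* For reference, a scalar sketch of what VP3_LOOP_FILTER computes for each
 * of the 8 pixels across a horizontal edge, modeled on the C loop filter
 * elsewhere in libavcodec (vp3_v_loop_filter_ref is a hypothetical name;
 * illustrative only, hence not compiled): */
#if 0
static void vp3_v_loop_filter_ref(uint8_t *src, int stride,
                                  int *bounding_values)
{
    int x;
    for (x = 0; x < 8; x++) {
        int p0 = src[x - 2*stride], p1 = src[x - stride];
        int p2 = src[x           ], p3 = src[x + stride];
        /* the bounding_values table both clamps the filter value and
         * encodes the filter_limit saturation */
        int d  = bounding_values[(p0 - p3 + 3*(p2 - p1) + 4) >> 3];
        src[x - stride] = av_clip_uint8(p1 + d);
        src[x         ] = av_clip_uint8(p2 - d);
    }
}
#endif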

#define STORE_4_WORDS(dst0, dst1, dst2, dst3, mm) \
    "movd "#mm", %0        \n\t" \
    "movw   %w0, -1"#dst0" \n\t" \
    "psrlq  $32, "#mm"     \n\t" \
    "shr    $16, %0        \n\t" \
    "movw   %w0, -1"#dst1" \n\t" \
    "movd "#mm", %0        \n\t" \
    "movw   %w0, -1"#dst2" \n\t" \
    "shr    $16, %0        \n\t" \
    "movw   %w0, -1"#dst3" \n\t"
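
/* A scalar sketch of STORE_4_WORDS: each 16-bit lane of "mm" holds the two
 * filtered pixels of one row and is scattered to that row's address at
 * offset -1. AV_WN16() from libavutil/intreadwrite.h (not included here)
 * and the helper name are assumptions; illustrative only, not compiled: */
#if 0
static void store_4_words_ref(uint8_t *dst0, uint8_t *dst1,
                              uint8_t *dst2, uint8_t *dst3,
                              const uint16_t w[4])
{
    AV_WN16(dst0 - 1, w[0]);
    AV_WN16(dst1 - 1, w[1]);
    AV_WN16(dst2 - 1, w[2]);
    AV_WN16(dst3 - 1, w[3]);
}
#endif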

void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values)
{
    __asm__ volatile(
        "movq          %0, %%mm6 \n\t"
        "movq          %1, %%mm4 \n\t"
        "movq          %2, %%mm2 \n\t"
        "movq          %3, %%mm1 \n\t"

        VP3_LOOP_FILTER(%4)

        "movq       %%mm4, %1    \n\t"
        "movq       %%mm3, %2    \n\t"

        : "+m" (*(uint64_t*)(src - 2*stride)),
          "+m" (*(uint64_t*)(src - 1*stride)),
          "+m" (*(uint64_t*)(src + 0*stride)),
          "+m" (*(uint64_t*)(src + 1*stride))
        : "m"(*(uint64_t*)(bounding_values+129))
    );
}

void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values)
{
    x86_reg tmp;

    /* For a vertical edge the four pixels straddling it lie within each row,
     * so transpose an 8x4 region into registers, run the same filter, then
     * interleave the two modified columns and scatter them back with
     * STORE_4_WORDS. */
    __asm__ volatile(
        "movd -2(%1),      %%mm6 \n\t"
        "movd -2(%1,%3),   %%mm0 \n\t"
        "movd -2(%1,%3,2), %%mm1 \n\t"
        "movd -2(%1,%4),   %%mm4 \n\t"

        TRANSPOSE8x4(%%mm6, %%mm0, %%mm1, %%mm4, -2(%2), -2(%2,%3), -2(%2,%3,2), -2(%2,%4), %%mm2)
        VP3_LOOP_FILTER(%5)
        SBUTTERFLY(%%mm4, %%mm3, %%mm5, bw, q)

        STORE_4_WORDS((%1), (%1,%3), (%1,%3,2), (%1,%4), %%mm4)
        STORE_4_WORDS((%2), (%2,%3), (%2,%3,2), (%2,%4), %%mm5)

        : "=&r"(tmp)
        : "r"(src), "r"(src+4*stride), "r"((x86_reg)stride), "r"((x86_reg)3*stride),
          "m"(*(uint64_t*)(bounding_values+129))
        : "memory"
    );
}

/* from original comments: The Macro does IDct on 4 1-D Dcts */
#define BeginIDCT() \
    "movq   "I(3)", %%mm2 \n\t" \
    "movq   "C(3)", %%mm6 \n\t" \
    "movq    %%mm2, %%mm4 \n\t" \
    "movq   "J(5)", %%mm7 \n\t" \
    "pmulhw  %%mm6, %%mm4 \n\t"    /* r4 = c3*i3 - i3 */ \
    "movq   "C(5)", %%mm1 \n\t" \
    "pmulhw  %%mm7, %%mm6 \n\t"    /* r6 = c3*i5 - i5 */ \
    "movq    %%mm1, %%mm5 \n\t" \
    "pmulhw  %%mm2, %%mm1 \n\t"    /* r1 = c5*i3 - i3 */ \
    "movq   "I(1)", %%mm3 \n\t" \
    "pmulhw  %%mm7, %%mm5 \n\t"    /* r5 = c5*i5 - i5 */ \
    "movq   "C(1)", %%mm0 \n\t" \
    "paddw   %%mm2, %%mm4 \n\t"    /* r4 = c3*i3 */ \
    "paddw   %%mm7, %%mm6 \n\t"    /* r6 = c3*i5 */ \
    "paddw   %%mm1, %%mm2 \n\t"    /* r2 = c5*i3 */ \
    "movq   "J(7)", %%mm1 \n\t" \
    "paddw   %%mm5, %%mm7 \n\t"    /* r7 = c5*i5 */ \
    "movq    %%mm0, %%mm5 \n\t"    /* r5 = c1 */ \
    "pmulhw  %%mm3, %%mm0 \n\t"    /* r0 = c1*i1 - i1 */ \
    "paddsw  %%mm7, %%mm4 \n\t"    /* r4 = C = c3*i3 + c5*i5 */ \
    "pmulhw  %%mm1, %%mm5 \n\t"    /* r5 = c1*i7 - i7 */ \
    "movq   "C(7)", %%mm7 \n\t" \
    "psubsw  %%mm2, %%mm6 \n\t"    /* r6 = D = c3*i5 - c5*i3 */ \
    "paddw   %%mm3, %%mm0 \n\t"    /* r0 = c1*i1 */ \
    "pmulhw  %%mm7, %%mm3 \n\t"    /* r3 = c7*i1 */ \
    "movq   "I(2)", %%mm2 \n\t" \
    "pmulhw  %%mm1, %%mm7 \n\t"    /* r7 = c7*i7 */ \
    "paddw   %%mm1, %%mm5 \n\t"    /* r5 = c1*i7 */ \
    "movq    %%mm2, %%mm1 \n\t"    /* r1 = i2 */ \
    "pmulhw "C(2)", %%mm2 \n\t"    /* r2 = c2*i2 - i2 */ \
    "psubsw  %%mm5, %%mm3 \n\t"    /* r3 = B = c7*i1 - c1*i7 */ \
    "movq   "J(6)", %%mm5 \n\t" \
    "paddsw  %%mm7, %%mm0 \n\t"    /* r0 = A = c1*i1 + c7*i7 */ \
    "movq    %%mm5, %%mm7 \n\t"    /* r7 = i6 */ \
    "psubsw  %%mm4, %%mm0 \n\t"    /* r0 = A - C */ \
    "pmulhw "C(2)", %%mm5 \n\t"    /* r5 = c2*i6 - i6 */ \
    "paddw   %%mm1, %%mm2 \n\t"    /* r2 = c2*i2 */ \
    "pmulhw "C(6)", %%mm1 \n\t"    /* r1 = c6*i2 */ \
    "paddsw  %%mm4, %%mm4 \n\t"    /* r4 = C + C */ \
    "paddsw  %%mm0, %%mm4 \n\t"    /* r4 = C. = A + C */ \
    "psubsw  %%mm6, %%mm3 \n\t"    /* r3 = B - D */ \
    "paddw   %%mm7, %%mm5 \n\t"    /* r5 = c2*i6 */ \
    "paddsw  %%mm6, %%mm6 \n\t"    /* r6 = D + D */ \
    "pmulhw "C(6)", %%mm7 \n\t"    /* r7 = c6*i6 */ \
    "paddsw  %%mm3, %%mm6 \n\t"    /* r6 = D. = B + D */ \
    "movq    %%mm4, "I(1)"\n\t"    /* save C. at I(1) */ \
    "psubsw  %%mm5, %%mm1 \n\t"    /* r1 = H = c6*i2 - c2*i6 */ \
    "movq   "C(4)", %%mm4 \n\t" \
    "movq    %%mm3, %%mm5 \n\t"    /* r5 = B - D */ \
    "pmulhw  %%mm4, %%mm3 \n\t"    /* r3 = (c4 - 1) * (B - D) */ \
    "paddsw  %%mm2, %%mm7 \n\t"    /* r7 = G = c2*i2 + c6*i6 */ \
    "movq    %%mm6, "I(2)"\n\t"    /* save D. at I(2) */ \
    "movq    %%mm0, %%mm2 \n\t"    /* r2 = A - C */ \
    "movq   "I(0)", %%mm6 \n\t" \
    "pmulhw  %%mm4, %%mm0 \n\t"    /* r0 = (c4 - 1) * (A - C) */ \
    "paddw   %%mm3, %%mm5 \n\t"    /* r5 = B. = c4 * (B - D) */ \
    "movq   "J(4)", %%mm3 \n\t" \
    "psubsw  %%mm1, %%mm5 \n\t"    /* r5 = B.. = B. - H */ \
    "paddw   %%mm0, %%mm2 \n\t"    /* r2 = A. = c4 * (A - C) */ \
    "psubsw  %%mm3, %%mm6 \n\t"    /* r6 = i0 - i4 */ \
    "movq    %%mm6, %%mm0 \n\t" \
    "pmulhw  %%mm4, %%mm6 \n\t"    /* r6 = (c4 - 1) * (i0 - i4) */ \
    "paddsw  %%mm3, %%mm3 \n\t"    /* r3 = i4 + i4 */ \
    "paddsw  %%mm1, %%mm1 \n\t"    /* r1 = H + H */ \
    "paddsw  %%mm0, %%mm3 \n\t"    /* r3 = i0 + i4 */ \
    "paddsw  %%mm5, %%mm1 \n\t"    /* r1 = H. = B. + H */ \
    "pmulhw  %%mm3, %%mm4 \n\t"    /* r4 = (c4 - 1) * (i0 + i4) */ \
    "paddsw  %%mm0, %%mm6 \n\t"    /* r6 = F = c4 * (i0 - i4) */ \
    "psubsw  %%mm2, %%mm6 \n\t"    /* r6 = F. = F - A. */ \
    "paddsw  %%mm2, %%mm2 \n\t"    /* r2 = A. + A. */ \
    "movq   "I(1)", %%mm0 \n\t"    /* r0 = C. */ \
    "paddsw  %%mm6, %%mm2 \n\t"    /* r2 = A.. = F + A. */ \
    "paddw   %%mm3, %%mm4 \n\t"    /* r4 = E = c4 * (i0 + i4) */ \
    "psubsw  %%mm1, %%mm2 \n\t"    /* r2 = R2 = A.. - H. */

/* RowIDCT gets ready to transpose */
#define RowIDCT() \
    BeginIDCT() \
    "movq   "I(2)", %%mm3 \n\t"    /* r3 = D. */ \
    "psubsw  %%mm7, %%mm4 \n\t"    /* r4 = E. = E - G */ \
    "paddsw  %%mm1, %%mm1 \n\t"    /* r1 = H. + H. */ \
    "paddsw  %%mm7, %%mm7 \n\t"    /* r7 = G + G */ \
    "paddsw  %%mm2, %%mm1 \n\t"    /* r1 = R1 = A.. + H. */ \
    "paddsw  %%mm4, %%mm7 \n\t"    /* r7 = G. = E + G */ \
    "psubsw  %%mm3, %%mm4 \n\t"    /* r4 = R4 = E. - D. */ \
    "paddsw  %%mm3, %%mm3 \n\t" \
    "psubsw  %%mm5, %%mm6 \n\t"    /* r6 = R6 = F. - B.. */ \
    "paddsw  %%mm5, %%mm5 \n\t" \
    "paddsw  %%mm4, %%mm3 \n\t"    /* r3 = R3 = E. + D. */ \
    "paddsw  %%mm6, %%mm5 \n\t"    /* r5 = R5 = F. + B.. */ \
    "psubsw  %%mm0, %%mm7 \n\t"    /* r7 = R7 = G. - C. */ \
    "paddsw  %%mm0, %%mm0 \n\t" \
    "movq    %%mm1, "I(1)"\n\t"    /* save R1 */ \
    "paddsw  %%mm7, %%mm0 \n\t"    /* r0 = R0 = G. + C. */
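
/* A scalar sketch of the 1-D transform that BeginIDCT()/RowIDCT() implement,
 * using the A/B/.../H names from the comments above. c1..c7 stand for the
 * ff_vp3_idct_data cosine constants, and MUL16 is a hypothetical stand-in
 * for the pmulhw-based fixed-point product (the asm computes (a*c) >> 16 and
 * adds a back in where the constant's top bit is set); illustrative only,
 * hence not compiled: */
#if 0
static void vp3_idct_1d_ref(const int16_t in[8], int16_t out[8])
{
    int A   = MUL16(c1, in[1]) + MUL16(c7, in[7]);
    int B   = MUL16(c7, in[1]) - MUL16(c1, in[7]);
    int C   = MUL16(c3, in[3]) + MUL16(c5, in[5]);
    int D   = MUL16(c3, in[5]) - MUL16(c5, in[3]);
    int Ad  = MUL16(c4, A - C);           /* A. */
    int Bd  = MUL16(c4, B - D);           /* B. */
    int Cd  = A + C;                      /* C. */
    int Dd  = B + D;                      /* D. */
    int E   = MUL16(c4, in[0] + in[4]);
    int F   = MUL16(c4, in[0] - in[4]);
    int G   = MUL16(c2, in[2]) + MUL16(c6, in[6]);
    int H   = MUL16(c6, in[2]) - MUL16(c2, in[6]);
    int Ed  = E - G,   Gd  = E + G;       /* E., G.  */
    int Add = F + Ad,  Fd  = F - Ad;      /* A.., F. */
    int Hd  = Bd + H,  Bdd = Bd - H;      /* H., B.. */

    out[0] = Gd  + Cd;   out[7] = Gd  - Cd;
    out[1] = Add + Hd;   out[2] = Add - Hd;
    out[3] = Ed  + Dd;   out[4] = Ed  - Dd;
    out[5] = Fd  + Bdd;  out[6] = Fd  - Bdd;
}
#endif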

/* Column IDCT normalizes and stores final results */
#define ColumnIDCT() \
    BeginIDCT() \
    "paddsw "OC_8", %%mm2 \n\t"    /* adjust R2 (and R1) for shift */ \
    "paddsw  %%mm1, %%mm1 \n\t"    /* r1 = H. + H. */ \
    "paddsw  %%mm2, %%mm1 \n\t"    /* r1 = R1 = A.. + H. */ \
    "psraw      $4, %%mm2 \n\t"    /* r2 = NR2 */ \
    "psubsw  %%mm7, %%mm4 \n\t"    /* r4 = E. = E - G */ \
    "psraw      $4, %%mm1 \n\t"    /* r1 = NR1 */ \
    "movq   "I(2)", %%mm3 \n\t"    /* r3 = D. */ \
    "paddsw  %%mm7, %%mm7 \n\t"    /* r7 = G + G */ \
    "movq    %%mm2, "I(2)"\n\t"    /* store NR2 at I2 */ \
    "paddsw  %%mm4, %%mm7 \n\t"    /* r7 = G. = E + G */ \
    "movq    %%mm1, "I(1)"\n\t"    /* store NR1 at I1 */ \
    "psubsw  %%mm3, %%mm4 \n\t"    /* r4 = R4 = E. - D. */ \
    "paddsw "OC_8", %%mm4 \n\t"    /* adjust R4 (and R3) for shift */ \
    "paddsw  %%mm3, %%mm3 \n\t"    /* r3 = D. + D. */ \
    "paddsw  %%mm4, %%mm3 \n\t"    /* r3 = R3 = E. + D. */ \
    "psraw      $4, %%mm4 \n\t"    /* r4 = NR4 */ \
    "psubsw  %%mm5, %%mm6 \n\t"    /* r6 = R6 = F. - B.. */ \
    "psraw      $4, %%mm3 \n\t"    /* r3 = NR3 */ \
    "paddsw "OC_8", %%mm6 \n\t"    /* adjust R6 (and R5) for shift */ \
    "paddsw  %%mm5, %%mm5 \n\t"    /* r5 = B.. + B.. */ \
    "paddsw  %%mm6, %%mm5 \n\t"    /* r5 = R5 = F. + B.. */ \
    "psraw      $4, %%mm6 \n\t"    /* r6 = NR6 */ \
    "movq    %%mm4, "J(4)"\n\t"    /* store NR4 at J4 */ \
    "psraw      $4, %%mm5 \n\t"    /* r5 = NR5 */ \
    "movq    %%mm3, "I(3)"\n\t"    /* store NR3 at I3 */ \
    "psubsw  %%mm0, %%mm7 \n\t"    /* r7 = R7 = G. - C. */ \
    "paddsw "OC_8", %%mm7 \n\t"    /* adjust R7 (and R0) for shift */ \
    "paddsw  %%mm0, %%mm0 \n\t"    /* r0 = C. + C. */ \
    "paddsw  %%mm7, %%mm0 \n\t"    /* r0 = R0 = G. + C. */ \
    "psraw      $4, %%mm7 \n\t"    /* r7 = NR7 */ \
    "movq    %%mm6, "J(6)"\n\t"    /* store NR6 at J6 */ \
    "psraw      $4, %%mm0 \n\t"    /* r0 = NR0 */ \
    "movq    %%mm5, "J(5)"\n\t"    /* store NR5 at J5 */ \
    "movq    %%mm7, "J(7)"\n\t"    /* store NR7 at J7 */ \
    "movq    %%mm0, "I(0)"\n\t"    /* store NR0 at I0 */

/* The following macro does two 4x4 transposes in place.

   At entry (we assume):

     r0 = a3 a2 a1 a0
     I(1) = b3 b2 b1 b0
     r2 = c3 c2 c1 c0
     r3 = d3 d2 d1 d0

     r4 = e3 e2 e1 e0
     r5 = f3 f2 f1 f0
     r6 = g3 g2 g1 g0
     r7 = h3 h2 h1 h0

   At exit, we have:

     I(0) = d0 c0 b0 a0
     I(1) = d1 c1 b1 a1
     I(2) = d2 c2 b2 a2
     I(3) = d3 c3 b3 a3

     J(4) = h0 g0 f0 e0
     J(5) = h1 g1 f1 e1
     J(6) = h2 g2 f2 e2
     J(7) = h3 g3 f3 e3

   I(0) I(1) I(2) I(3)  is the transpose of r0 I(1) r2 r3.
   J(4) J(5) J(6) J(7)  is the transpose of r4 r5 r6 r7.

   Since r1 is free at entry, we calculate the Js first. */
#define Transpose() \
    "movq       %%mm4, %%mm1 \n\t"    /* r1 = e3 e2 e1 e0 */ \
    "punpcklwd  %%mm5, %%mm4 \n\t"    /* r4 = f1 e1 f0 e0 */ \
    "movq       %%mm0, "I(0)"\n\t"    /* save a3 a2 a1 a0 */ \
    "punpckhwd  %%mm5, %%mm1 \n\t"    /* r1 = f3 e3 f2 e2 */ \
    "movq       %%mm6, %%mm0 \n\t"    /* r0 = g3 g2 g1 g0 */ \
    "punpcklwd  %%mm7, %%mm6 \n\t"    /* r6 = h1 g1 h0 g0 */ \
    "movq       %%mm4, %%mm5 \n\t"    /* r5 = f1 e1 f0 e0 */ \
    "punpckldq  %%mm6, %%mm4 \n\t"    /* r4 = h0 g0 f0 e0 = R4 */ \
    "punpckhdq  %%mm6, %%mm5 \n\t"    /* r5 = h1 g1 f1 e1 = R5 */ \
    "movq       %%mm1, %%mm6 \n\t"    /* r6 = f3 e3 f2 e2 */ \
    "movq       %%mm4, "J(4)"\n\t" \
    "punpckhwd  %%mm7, %%mm0 \n\t"    /* r0 = h3 g3 h2 g2 */ \
    "movq       %%mm5, "J(5)"\n\t" \
    "punpckhdq  %%mm0, %%mm6 \n\t"    /* r6 = h3 g3 f3 e3 = R7 */ \
    "movq      "I(0)", %%mm4 \n\t"    /* r4 = a3 a2 a1 a0 */ \
    "punpckldq  %%mm0, %%mm1 \n\t"    /* r1 = h2 g2 f2 e2 = R6 */ \
    "movq      "I(1)", %%mm5 \n\t"    /* r5 = b3 b2 b1 b0 */ \
    "movq       %%mm4, %%mm0 \n\t"    /* r0 = a3 a2 a1 a0 */ \
    "movq       %%mm6, "J(7)"\n\t" \
    "punpcklwd  %%mm5, %%mm0 \n\t"    /* r0 = b1 a1 b0 a0 */ \
    "movq       %%mm1, "J(6)"\n\t" \
    "punpckhwd  %%mm5, %%mm4 \n\t"    /* r4 = b3 a3 b2 a2 */ \
    "movq       %%mm2, %%mm5 \n\t"    /* r5 = c3 c2 c1 c0 */ \
    "punpcklwd  %%mm3, %%mm2 \n\t"    /* r2 = d1 c1 d0 c0 */ \
    "movq       %%mm0, %%mm1 \n\t"    /* r1 = b1 a1 b0 a0 */ \
    "punpckldq  %%mm2, %%mm0 \n\t"    /* r0 = d0 c0 b0 a0 = R0 */ \
    "punpckhdq  %%mm2, %%mm1 \n\t"    /* r1 = d1 c1 b1 a1 = R1 */ \
    "movq       %%mm4, %%mm2 \n\t"    /* r2 = b3 a3 b2 a2 */ \
    "movq       %%mm0, "I(0)"\n\t" \
    "punpckhwd  %%mm3, %%mm5 \n\t"    /* r5 = d3 c3 d2 c2 */ \
    "movq       %%mm1, "I(1)"\n\t" \
    "punpckhdq  %%mm5, %%mm4 \n\t"    /* r4 = d3 c3 b3 a3 = R3 */ \
    "punpckldq  %%mm5, %%mm2 \n\t"    /* r2 = d2 c2 b2 a2 = R2 */ \
    "movq       %%mm4, "I(3)"\n\t" \
    "movq       %%mm2, "I(2)"\n\t"
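
/* In scalar terms, the punpck sequence above amounts to two independent
 * in-place 4x4 transposes of 16-bit elements; a sketch (hypothetical helper,
 * illustrative only, hence not compiled), with stride counted in int16_t
 * units: */
#if 0
static void transpose4x4_ref(int16_t *block, int stride)
{
    int i, j;
    for (i = 0; i < 4; i++)
        for (j = i + 1; j < 4; j++)
            FFSWAP(int16_t, block[i*stride + j], block[j*stride + i]);
}
#endif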

void ff_vp3_idct_mmx(int16_t *output_data)
{
    /* In the original VP3 code:
     *   eax = quantized input
     *   ebx = dequantizer matrix
     *   ecx = IDCT constants
     *     M(I) = ecx + MaskOffset(0) + I * 8
     *     C(I) = ecx + CosineOffset(32) + (I-1) * 8
     *   edx = output
     * Here %0 is output_data (transformed in place), %1 is the cosine
     * table ff_vp3_idct_data, %2 is the rounding constant ff_pw_8, and
     * r0..r7 = mm0..mm7.
     */

#define C(x) AV_STRINGIFY(16*(x-1))"(%1)"
#define OC_8 "%2"

    /* at this point, the function has completed dequantization + dezigzag +
     * partial transposition; now do the idct itself */
#define I(x) AV_STRINGIFY(16* x       )"(%0)"
#define J(x) AV_STRINGIFY(16*(x-4) + 8)"(%0)"
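
    /* With these definitions, I(x) addresses the left half of row x and
     * J(x) the right half of row x-4: e.g. I(2) expands to the operand
     * string "32(%0)" and J(5) to "24(%0)", so I(0)..I(3) and J(4)..J(7)
     * together supply the 8 coefficients of rows 0-3 for the first
     * RowIDCT() below. */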

    __asm__ volatile (
        /* four 1-D row transforms on the top half (rows 0-3), then
         * transpose the two 4x4 quadrants in place */
        RowIDCT()
        Transpose()

#undef I
#undef J
#define I(x) AV_STRINGIFY(16* x    + 64)"(%0)"
#define J(x) AV_STRINGIFY(16*(x-4) + 72)"(%0)"

        /* same for the bottom half (rows 4-7) */
        RowIDCT()
        Transpose()

#undef I
#undef J
#define I(x) AV_STRINGIFY(16*x)"(%0)"
#define J(x) AV_STRINGIFY(16*x)"(%0)"

        /* 1-D column transforms on the left half */
        ColumnIDCT()

#undef I
#undef J
#define I(x) AV_STRINGIFY(16*x + 8)"(%0)"
#define J(x) AV_STRINGIFY(16*x + 8)"(%0)"

        /* ... and on the right half */
        ColumnIDCT()
        :: "r"(output_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8)
    );
#undef I
#undef J

}

void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_vp3_idct_mmx(block);
    put_signed_pixels_clamped_mmx(block, dest, line_size);
}

void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_vp3_idct_mmx(block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
}