Statistics
| Branch: | Revision:

ffmpeg / libavcodec / x86 / vp3dsp_mmx.c @ 179655b6

History | View | Annotate | Download (16.8 KB)

1
/*
2
 * Copyright (C) 2004 the ffmpeg project
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2.1 of the License, or (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 */
20

    
21
/**
22
 * @file
23
 * MMX-optimized functions cribbed from the original VP3 source code.
24
 */
25

    
26
#include "libavutil/x86_cpu.h"
27
#include "libavcodec/dsputil.h"
28
#include "dsputil_mmx.h"
29
#include "vp3dsp_mmx.h"
30

    
31
extern const uint16_t ff_vp3_idct_data[];
32

    
33
/*
 * VP3_LOOP_FILTER(flim): asm fragment implementing the VP3 loop-filter core
 * on eight pixels at once.
 *
 * flim is an asm operand that is loaded with movq into mm5 and used as a
 * per-byte unsigned upper bound (pminub).
 * The fragment overwrites mm0..mm7 and references the constants ff_pb_1,
 * ff_pb_3, ff_pb_7, ff_pb_1F and ff_pb_81 through MANGLE().
 */
// this is off by one or two for some cases when filter_limit is greater than 63
// in:  p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
// out: p1 in mm4, p2 in mm3
#define VP3_LOOP_FILTER(flim) \
    "movq       %%mm6, %%mm7 \n\t" \
    "pand    "MANGLE(ff_pb_7 )", %%mm6 \n\t" /* p0&7 */ \
    "psrlw         $3, %%mm7 \n\t" \
    "pand    "MANGLE(ff_pb_1F)", %%mm7 \n\t" /* p0>>3 */ \
    "movq       %%mm2, %%mm3 \n\t" /* mm3 = p2 */ \
    "pxor       %%mm4, %%mm2 \n\t" \
    "pand    "MANGLE(ff_pb_1 )", %%mm2 \n\t" /* (p2^p1)&1 */ \
    "movq       %%mm2, %%mm5 \n\t" \
    "paddb      %%mm2, %%mm2 \n\t" \
    "paddb      %%mm5, %%mm2 \n\t" /* 3*(p2^p1)&1 */ \
    "paddb      %%mm6, %%mm2 \n\t" /* extra bits lost in shifts */ \
    "pcmpeqb    %%mm0, %%mm0 \n\t" \
    "pxor       %%mm0, %%mm1 \n\t" /* 255 - p3 */ \
    "pavgb      %%mm2, %%mm1 \n\t" /* (256 - p3 + extrabits) >> 1 */ \
    "pxor       %%mm4, %%mm0 \n\t" /* 255 - p1 */ \
    "pavgb      %%mm3, %%mm0 \n\t" /* (256 + p2-p1) >> 1 */ \
    "paddb   "MANGLE(ff_pb_3 )", %%mm1 \n\t" \
    "pavgb      %%mm0, %%mm1 \n\t" /* 128+2+(   p2-p1  - p3) >> 2 */ \
    "pavgb      %%mm0, %%mm1 \n\t" /* 128+1+(3*(p2-p1) - p3) >> 3 */ \
    "paddusb    %%mm1, %%mm7 \n\t" /* d+128+1 */ \
    "movq    "MANGLE(ff_pb_81)", %%mm6 \n\t" \
    "psubusb    %%mm7, %%mm6 \n\t" /* mm6 = max(ff_pb_81 - mm7, 0) */ \
    "psubusb "MANGLE(ff_pb_81)", %%mm7 \n\t" /* mm7 = max(mm7 - ff_pb_81, 0) */ \
\
    /* for x in {mm6, mm7}: x = min(x, flim); x = min(x + x, flim) - x */ \
    "movq     "#flim", %%mm5 \n\t" \
    "pminub     %%mm5, %%mm6 \n\t" \
    "pminub     %%mm5, %%mm7 \n\t" \
    "movq       %%mm6, %%mm0 \n\t" \
    "movq       %%mm7, %%mm1 \n\t" \
    "paddb      %%mm6, %%mm6 \n\t" \
    "paddb      %%mm7, %%mm7 \n\t" \
    "pminub     %%mm5, %%mm6 \n\t" \
    "pminub     %%mm5, %%mm7 \n\t" \
    "psubb      %%mm0, %%mm6 \n\t" \
    "psubb      %%mm1, %%mm7 \n\t" \
    "paddusb    %%mm7, %%mm4 \n\t" /* p1 += mm7 (saturating) */ \
    "psubusb    %%mm6, %%mm4 \n\t" /* p1 -= mm6 (saturating) */ \
    "psubusb    %%mm7, %%mm3 \n\t" /* p2 -= mm7 (saturating) */ \
    "paddusb    %%mm6, %%mm3 \n\t" /* p2 += mm6 (saturating) */
76

    
77
/*
 * STORE_4_WORDS(dst0, dst1, dst2, dst3, mm): asm fragment that stores the
 * four 16-bit words of MMX register mm, lowest word first, to the address
 * expressions dst0..dst3, each at displacement -1.
 *
 * Asm operand %0 is used as a scratch general-purpose register.
 * mm is modified: it is shifted right by 32 bits.
 */
#define STORE_4_WORDS(dst0, dst1, dst2, dst3, mm) \
    "movd "#mm", %0        \n\t" /* %0 = words 1:0 */ \
    "movw   %w0, -1"#dst0" \n\t" \
    "psrlq  $32, "#mm"     \n\t" /* bring words 3:2 down */ \
    "shr    $16, %0        \n\t" \
    "movw   %w0, -1"#dst1" \n\t" \
    "movd "#mm", %0        \n\t" /* %0 = words 3:2 */ \
    "movw   %w0, -1"#dst2" \n\t" \
    "shr    $16, %0        \n\t" \
    "movw   %w0, -1"#dst3" \n\t"
87

    
88
void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values)
89
{
90
    __asm__ volatile(
91
        "movq          %0, %%mm6 \n\t"
92
        "movq          %1, %%mm4 \n\t"
93
        "movq          %2, %%mm2 \n\t"
94
        "movq          %3, %%mm1 \n\t"
95

    
96
        VP3_LOOP_FILTER(%4)
97

    
98
        "movq       %%mm4, %1    \n\t"
99
        "movq       %%mm3, %2    \n\t"
100

    
101
        : "+m" (*(uint64_t*)(src - 2*stride)),
102
          "+m" (*(uint64_t*)(src - 1*stride)),
103
          "+m" (*(uint64_t*)(src + 0*stride)),
104
          "+m" (*(uint64_t*)(src + 1*stride))
105
        : "m"(*(uint64_t*)(bounding_values+129))
106
    );
107
}
108

    
109
void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values)
110
{
111
    x86_reg tmp;
112

    
113
    __asm__ volatile(
114
        "movd -2(%1),      %%mm6 \n\t"
115
        "movd -2(%1,%3),   %%mm0 \n\t"
116
        "movd -2(%1,%3,2), %%mm1 \n\t"
117
        "movd -2(%1,%4),   %%mm4 \n\t"
118

    
119
        TRANSPOSE8x4(%%mm6, %%mm0, %%mm1, %%mm4, -2(%2), -2(%2,%3), -2(%2,%3,2), -2(%2,%4), %%mm2)
120
        VP3_LOOP_FILTER(%5)
121
        SBUTTERFLY(%%mm4, %%mm3, %%mm5, bw, q)
122

    
123
        STORE_4_WORDS((%1), (%1,%3), (%1,%3,2), (%1,%4), %%mm4)
124
        STORE_4_WORDS((%2), (%2,%3), (%2,%3,2), (%2,%4), %%mm5)
125

    
126
        : "=&r"(tmp)
127
        : "r"(src), "r"(src+4*stride), "r"((x86_reg)stride), "r"((x86_reg)3*stride),
128
          "m"(*(uint64_t*)(bounding_values+129))
129
        : "memory"
130
    );
131
}
132

    
133
/* from original comments: The Macro does IDct on 4 1-D Dcts */
/*
 * BeginIDCT(): common first part of RowIDCT() and ColumnIDCT().
 *
 * Requires the macros I(x), J(x) and C(x) to be defined at the point of
 * expansion; each must expand to an asm memory-operand string.
 * I(1) and I(2) are overwritten with the intermediates C. and D.
 *
 * Register contents on exit, following the inline comments:
 *   r0 = C.   r1 = H.   r2 = R2   r4 = E   r5 = B..   r6 = F.   r7 = G
 * (rN denotes mmN; r3 holds a temporary.)
 */
#define BeginIDCT() \
    "movq   "I(3)", %%mm2 \n\t" \
    "movq   "C(3)", %%mm6 \n\t" \
    "movq    %%mm2, %%mm4 \n\t" \
    "movq   "J(5)", %%mm7 \n\t" \
    "pmulhw  %%mm6, %%mm4 \n\t"    /* r4 = c3*i3 - i3 */ \
    "movq   "C(5)", %%mm1 \n\t" \
    "pmulhw  %%mm7, %%mm6 \n\t"    /* r6 = c3*i5 - i5 */ \
    "movq    %%mm1, %%mm5 \n\t" \
    "pmulhw  %%mm2, %%mm1 \n\t"    /* r1 = c5*i3 - i3 */ \
    "movq   "I(1)", %%mm3 \n\t" \
    "pmulhw  %%mm7, %%mm5 \n\t"    /* r5 = c5*i5 - i5 */ \
    "movq   "C(1)", %%mm0 \n\t" \
    "paddw   %%mm2, %%mm4 \n\t"    /* r4 = c3*i3 */ \
    "paddw   %%mm7, %%mm6 \n\t"    /* r6 = c3*i5 */ \
    "paddw   %%mm1, %%mm2 \n\t"    /* r2 = c5*i3 */ \
    "movq   "J(7)", %%mm1 \n\t" \
    "paddw   %%mm5, %%mm7 \n\t"    /* r7 = c5*i5 */ \
    "movq    %%mm0, %%mm5 \n\t"    /* r5 = c1 */ \
    "pmulhw  %%mm3, %%mm0 \n\t"    /* r0 = c1*i1 - i1 */ \
    "paddsw  %%mm7, %%mm4 \n\t"    /* r4 = C = c3*i3 + c5*i5 */ \
    "pmulhw  %%mm1, %%mm5 \n\t"    /* r5 = c1*i7 - i7 */ \
    "movq   "C(7)", %%mm7 \n\t" \
    "psubsw  %%mm2, %%mm6 \n\t"    /* r6 = D = c3*i5 - c5*i3 */ \
    "paddw   %%mm3, %%mm0 \n\t"    /* r0 = c1*i1 */ \
    "pmulhw  %%mm7, %%mm3 \n\t"    /* r3 = c7*i1 */ \
    "movq   "I(2)", %%mm2 \n\t" \
    "pmulhw  %%mm1, %%mm7 \n\t"    /* r7 = c7*i7 */ \
    "paddw   %%mm1, %%mm5 \n\t"    /* r5 = c1*i7 */ \
    "movq    %%mm2, %%mm1 \n\t"    /* r1 = i2 */ \
    "pmulhw "C(2)", %%mm2 \n\t"    /* r2 = c2*i2 - i2 */ \
    "psubsw  %%mm5, %%mm3 \n\t"    /* r3 = B = c7*i1 - c1*i7 */ \
    "movq   "J(6)", %%mm5 \n\t" \
    "paddsw  %%mm7, %%mm0 \n\t"    /* r0 = A = c1*i1 + c7*i7 */ \
    "movq    %%mm5, %%mm7 \n\t"    /* r7 = i6 */ \
    "psubsw  %%mm4, %%mm0 \n\t"    /* r0 = A - C */ \
    "pmulhw "C(2)", %%mm5 \n\t"    /* r5 = c2*i6 - i6 */ \
    "paddw   %%mm1, %%mm2 \n\t"    /* r2 = c2*i2 */ \
    "pmulhw "C(6)", %%mm1 \n\t"    /* r1 = c6*i2 */ \
    "paddsw  %%mm4, %%mm4 \n\t"    /* r4 = C + C */ \
    "paddsw  %%mm0, %%mm4 \n\t"    /* r4 = C. = A + C */ \
    "psubsw  %%mm6, %%mm3 \n\t"    /* r3 = B - D */ \
    "paddw   %%mm7, %%mm5 \n\t"    /* r5 = c2*i6 */ \
    "paddsw  %%mm6, %%mm6 \n\t"    /* r6 = D + D */ \
    "pmulhw "C(6)", %%mm7 \n\t"    /* r7 = c6*i6 */ \
    "paddsw  %%mm3, %%mm6 \n\t"    /* r6 = D. = B + D */ \
    "movq    %%mm4, "I(1)"\n\t"    /* save C. at I(1) */ \
    "psubsw  %%mm5, %%mm1 \n\t"    /* r1 = H = c6*i2 - c2*i6 */ \
    "movq   "C(4)", %%mm4 \n\t" \
    "movq    %%mm3, %%mm5 \n\t"    /* r5 = B - D */ \
    "pmulhw  %%mm4, %%mm3 \n\t"    /* r3 = (c4 - 1) * (B - D) */ \
    "paddsw  %%mm2, %%mm7 \n\t"    /* r7 = G = c6*i6 + c2*i2 */ \
    "movq    %%mm6, "I(2)"\n\t"    /* save D. at I(2) */ \
    "movq    %%mm0, %%mm2 \n\t"    /* r2 = A - C */ \
    "movq   "I(0)", %%mm6 \n\t" \
    "pmulhw  %%mm4, %%mm0 \n\t"    /* r0 = (c4 - 1) * (A - C) */ \
    "paddw   %%mm3, %%mm5 \n\t"    /* r5 = B. = c4 * (B - D) */ \
    "movq   "J(4)", %%mm3 \n\t" \
    "psubsw  %%mm1, %%mm5 \n\t"    /* r5 = B.. = B. - H */ \
    "paddw   %%mm0, %%mm2 \n\t"    /* r2 = A. = c4 * (A - C) */ \
    "psubsw  %%mm3, %%mm6 \n\t"    /* r6 = i0 - i4 */ \
    "movq    %%mm6, %%mm0 \n\t" \
    "pmulhw  %%mm4, %%mm6 \n\t"    /* r6 = (c4 - 1) * (i0 - i4) */ \
    "paddsw  %%mm3, %%mm3 \n\t"    /* r3 = i4 + i4 */ \
    "paddsw  %%mm1, %%mm1 \n\t"    /* r1 = H + H */ \
    "paddsw  %%mm0, %%mm3 \n\t"    /* r3 = i0 + i4 */ \
    "paddsw  %%mm5, %%mm1 \n\t"    /* r1 = H. = B. + H */ \
    "pmulhw  %%mm3, %%mm4 \n\t"    /* r4 = (c4 - 1) * (i0 + i4) */ \
    "paddsw  %%mm0, %%mm6 \n\t"    /* r6 = F = c4 * (i0 - i4) */ \
    "psubsw  %%mm2, %%mm6 \n\t"    /* r6 = F. = F - A. */ \
    "paddsw  %%mm2, %%mm2 \n\t"    /* r2 = A. + A. */ \
    "movq   "I(1)", %%mm0 \n\t"    /* r0 = C. */ \
    "paddsw  %%mm6, %%mm2 \n\t"    /* r2 = A.. = F + A. */ \
    "paddw   %%mm3, %%mm4 \n\t"    /* r4 = E = c4 * (i0 + i4) */ \
    "psubsw  %%mm1, %%mm2 \n\t"    /* r2 = R2 = A.. - H. */
209

    
210
/* RowIDCT gets ready to transpose */
/*
 * Completes the 1-D IDCT started by BeginIDCT() without normalizing.
 * On exit R0 and R2..R7 are in mm0 and mm2..mm7, and R1 has been stored to
 * I(1); this is the entry state Transpose() expects.
 */
#define RowIDCT() \
    BeginIDCT() \
    "movq   "I(2)", %%mm3 \n\t"    /* r3 = D. */ \
    "psubsw  %%mm7, %%mm4 \n\t"    /* r4 = E. = E - G */ \
    "paddsw  %%mm1, %%mm1 \n\t"    /* r1 = H. + H. */ \
    "paddsw  %%mm7, %%mm7 \n\t"    /* r7 = G + G */ \
    "paddsw  %%mm2, %%mm1 \n\t"    /* r1 = R1 = A.. + H. */ \
    "paddsw  %%mm4, %%mm7 \n\t"    /* r7 = G. = E + G */ \
    "psubsw  %%mm3, %%mm4 \n\t"    /* r4 = R4 = E. - D. */ \
    "paddsw  %%mm3, %%mm3 \n\t"    /* r3 = D. + D. */ \
    "psubsw  %%mm5, %%mm6 \n\t"    /* r6 = R6 = F. - B.. */ \
    "paddsw  %%mm5, %%mm5 \n\t"    /* r5 = B.. + B.. */ \
    "paddsw  %%mm4, %%mm3 \n\t"    /* r3 = R3 = E. + D. */ \
    "paddsw  %%mm6, %%mm5 \n\t"    /* r5 = R5 = F. + B.. */ \
    "psubsw  %%mm0, %%mm7 \n\t"    /* r7 = R7 = G. - C. */ \
    "paddsw  %%mm0, %%mm0 \n\t"    /* r0 = C. + C. */ \
    "movq    %%mm1, "I(1)"\n\t"    /* save R1 */ \
    "paddsw  %%mm7, %%mm0 \n\t"    /* r0 = R0 = G. + C. */
229

    
230
/* Column IDCT normalizes and stores final results */
/*
 * Completes the 1-D IDCT started by BeginIDCT(). Each result has OC_8 added
 * and is then arithmetically shifted right by 4 before being stored.
 * Results 0..3 are stored through I(0..3) and results 4..7 through J(4..7).
 * Requires OC_8 to expand to an asm memory-operand string.
 */
#define ColumnIDCT() \
    BeginIDCT() \
    "paddsw "OC_8", %%mm2 \n\t"    /* adjust R2 (and R1) for shift */ \
    "paddsw  %%mm1, %%mm1 \n\t"    /* r1 = H. + H. */ \
    "paddsw  %%mm2, %%mm1 \n\t"    /* r1 = R1 = A.. + H. */ \
    "psraw      $4, %%mm2 \n\t"    /* r2 = NR2 */ \
    "psubsw  %%mm7, %%mm4 \n\t"    /* r4 = E. = E - G */ \
    "psraw      $4, %%mm1 \n\t"    /* r1 = NR1 */ \
    "movq   "I(2)", %%mm3 \n\t"    /* r3 = D. */ \
    "paddsw  %%mm7, %%mm7 \n\t"    /* r7 = G + G */ \
    "movq    %%mm2, "I(2)"\n\t"    /* store NR2 at I2 */ \
    "paddsw  %%mm4, %%mm7 \n\t"    /* r7 = G. = E + G */ \
    "movq    %%mm1, "I(1)"\n\t"    /* store NR1 at I1 */ \
    "psubsw  %%mm3, %%mm4 \n\t"    /* r4 = R4 = E. - D. */ \
    "paddsw "OC_8", %%mm4 \n\t"    /* adjust R4 (and R3) for shift */ \
    "paddsw  %%mm3, %%mm3 \n\t"    /* r3 = D. + D. */ \
    "paddsw  %%mm4, %%mm3 \n\t"    /* r3 = R3 = E. + D. */ \
    "psraw      $4, %%mm4 \n\t"    /* r4 = NR4 */ \
    "psubsw  %%mm5, %%mm6 \n\t"    /* r6 = R6 = F. - B.. */ \
    "psraw      $4, %%mm3 \n\t"    /* r3 = NR3 */ \
    "paddsw "OC_8", %%mm6 \n\t"    /* adjust R6 (and R5) for shift */ \
    "paddsw  %%mm5, %%mm5 \n\t"    /* r5 = B.. + B.. */ \
    "paddsw  %%mm6, %%mm5 \n\t"    /* r5 = R5 = F. + B.. */ \
    "psraw      $4, %%mm6 \n\t"    /* r6 = NR6 */ \
    "movq    %%mm4, "J(4)"\n\t"    /* store NR4 at J4 */ \
    "psraw      $4, %%mm5 \n\t"    /* r5 = NR5 */ \
    "movq    %%mm3, "I(3)"\n\t"    /* store NR3 at I3 */ \
    "psubsw  %%mm0, %%mm7 \n\t"    /* r7 = R7 = G. - C. */ \
    "paddsw "OC_8", %%mm7 \n\t"    /* adjust R7 (and R0) for shift */ \
    "paddsw  %%mm0, %%mm0 \n\t"    /* r0 = C. + C. */ \
    "paddsw  %%mm7, %%mm0 \n\t"    /* r0 = R0 = G. + C. */ \
    "psraw      $4, %%mm7 \n\t"    /* r7 = NR7 */ \
    "movq    %%mm6, "J(6)"\n\t"    /* store NR6 at J6 */ \
    "psraw      $4, %%mm0 \n\t"    /* r0 = NR0 */ \
    "movq    %%mm5, "J(5)"\n\t"    /* store NR5 at J5 */ \
    "movq    %%mm7, "J(7)"\n\t"    /* store NR7 at J7 */ \
    "movq    %%mm0, "I(0)"\n\t"    /* store NR0 at I0 */
268

    
269
/* Following macro does two 4x4 transposes in place.

  At entry (we assume):

    r0 = a3 a2 a1 a0
    I(1) = b3 b2 b1 b0
    r2 = c3 c2 c1 c0
    r3 = d3 d2 d1 d0

    r4 = e3 e2 e1 e0
    r5 = f3 f2 f1 f0
    r6 = g3 g2 g1 g0
    r7 = h3 h2 h1 h0

  At exit, we have:

    I(0) = d0 c0 b0 a0
    I(1) = d1 c1 b1 a1
    I(2) = d2 c2 b2 a2
    I(3) = d3 c3 b3 a3

    J(4) = h0 g0 f0 e0
    J(5) = h1 g1 f1 e1
    J(6) = h2 g2 f2 e2
    J(7) = h3 g3 f3 e3

   I(0) I(1) I(2) I(3)  is the transpose of r0 I(1) r2 r3.
   J(4) J(5) J(6) J(7)  is the transpose of r4 r5 r6 r7.

   Since r1 is free at entry, we calculate the Js first.

   I(0) doubles as a spill slot for r0 while the Js are being computed. */
#define Transpose() \
    "movq       %%mm4, %%mm1 \n\t"    /* r1 = e3 e2 e1 e0 */ \
    "punpcklwd  %%mm5, %%mm4 \n\t"    /* r4 = f1 e1 f0 e0 */ \
    "movq       %%mm0, "I(0)"\n\t"    /* save a3 a2 a1 a0 */ \
    "punpckhwd  %%mm5, %%mm1 \n\t"    /* r1 = f3 e3 f2 e2 */ \
    "movq       %%mm6, %%mm0 \n\t"    /* r0 = g3 g2 g1 g0 */ \
    "punpcklwd  %%mm7, %%mm6 \n\t"    /* r6 = h1 g1 h0 g0 */ \
    "movq       %%mm4, %%mm5 \n\t"    /* r5 = f1 e1 f0 e0 */ \
    "punpckldq  %%mm6, %%mm4 \n\t"    /* r4 = h0 g0 f0 e0 = R4 */ \
    "punpckhdq  %%mm6, %%mm5 \n\t"    /* r5 = h1 g1 f1 e1 = R5 */ \
    "movq       %%mm1, %%mm6 \n\t"    /* r6 = f3 e3 f2 e2 */ \
    "movq       %%mm4, "J(4)"\n\t" \
    "punpckhwd  %%mm7, %%mm0 \n\t"    /* r0 = h3 g3 h2 g2 */ \
    "movq       %%mm5, "J(5)"\n\t" \
    "punpckhdq  %%mm0, %%mm6 \n\t"    /* r6 = h3 g3 f3 e3 = R7 */ \
    "movq      "I(0)", %%mm4 \n\t"    /* r4 = a3 a2 a1 a0 */ \
    "punpckldq  %%mm0, %%mm1 \n\t"    /* r1 = h2 g2 f2 e2 = R6 */ \
    "movq      "I(1)", %%mm5 \n\t"    /* r5 = b3 b2 b1 b0 */ \
    "movq       %%mm4, %%mm0 \n\t"    /* r0 = a3 a2 a1 a0 */ \
    "movq       %%mm6, "J(7)"\n\t" \
    "punpcklwd  %%mm5, %%mm0 \n\t"    /* r0 = b1 a1 b0 a0 */ \
    "movq       %%mm1, "J(6)"\n\t" \
    "punpckhwd  %%mm5, %%mm4 \n\t"    /* r4 = b3 a3 b2 a2 */ \
    "movq       %%mm2, %%mm5 \n\t"    /* r5 = c3 c2 c1 c0 */ \
    "punpcklwd  %%mm3, %%mm2 \n\t"    /* r2 = d1 c1 d0 c0 */ \
    "movq       %%mm0, %%mm1 \n\t"    /* r1 = b1 a1 b0 a0 */ \
    "punpckldq  %%mm2, %%mm0 \n\t"    /* r0 = d0 c0 b0 a0 = R0 */ \
    "punpckhdq  %%mm2, %%mm1 \n\t"    /* r1 = d1 c1 b1 a1 = R1 */ \
    "movq       %%mm4, %%mm2 \n\t"    /* r2 = b3 a3 b2 a2 */ \
    "movq       %%mm0, "I(0)"\n\t" \
    "punpckhwd  %%mm3, %%mm5 \n\t"    /* r5 = d3 c3 d2 c2 */ \
    "movq       %%mm1, "I(1)"\n\t" \
    "punpckhdq  %%mm5, %%mm4 \n\t"    /* r4 = d3 c3 b3 a3 = R3 */ \
    "punpckldq  %%mm5, %%mm2 \n\t"    /* r2 = d2 c2 b2 a2 = R2 */ \
    "movq       %%mm4, "I(3)"\n\t" \
    "movq       %%mm2, "I(2)"\n\t"
335

    
336
void ff_vp3_idct_mmx(int16_t *output_data)
337
{
338
    /* eax = quantized input
339
     * ebx = dequantizer matrix
340
     * ecx = IDCT constants
341
     *  M(I) = ecx + MaskOffset(0) + I * 8
342
     *  C(I) = ecx + CosineOffset(32) + (I-1) * 8
343
     * edx = output
344
     * r0..r7 = mm0..mm7
345
     */
346

    
347
#define C(x) AV_STRINGIFY(16*(x-1))"(%1)"
348
#define OC_8 "%2"
349

    
350
    /* at this point, function has completed dequantization + dezigzag +
351
     * partial transposition; now do the idct itself */
352
#define I(x) AV_STRINGIFY(16* x       )"(%0)"
353
#define J(x) AV_STRINGIFY(16*(x-4) + 8)"(%0)"
354

    
355
    __asm__ volatile (
356
        RowIDCT()
357
        Transpose()
358

    
359
#undef I
360
#undef J
361
#define I(x) AV_STRINGIFY(16* x    + 64)"(%0)"
362
#define J(x) AV_STRINGIFY(16*(x-4) + 72)"(%0)"
363

    
364
        RowIDCT()
365
        Transpose()
366

    
367
#undef I
368
#undef J
369
#define I(x) AV_STRINGIFY(16*x)"(%0)"
370
#define J(x) AV_STRINGIFY(16*x)"(%0)"
371

    
372
        ColumnIDCT()
373

    
374
#undef I
375
#undef J
376
#define I(x) AV_STRINGIFY(16*x + 8)"(%0)"
377
#define J(x) AV_STRINGIFY(16*x + 8)"(%0)"
378

    
379
        ColumnIDCT()
380
        :: "r"(output_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8)
381
    );
382
#undef I
383
#undef J
384

    
385
}
386

    
387
/**
 * Run ff_vp3_idct_mmx() on block in place, then pass the result to
 * put_signed_pixels_clamped_mmx() to write it to dest.
 *
 * @param dest      destination pixels
 * @param line_size distance in bytes between destination rows
 * @param block     coefficient block; overwritten by the IDCT
 */
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_vp3_idct_mmx(block);
    put_signed_pixels_clamped_mmx(block, dest, line_size);
}
392

    
393
/**
 * Run ff_vp3_idct_mmx() on block in place, then pass the result to
 * add_pixels_clamped_mmx() to combine it with dest.
 *
 * @param dest      destination pixels
 * @param line_size distance in bytes between destination rows
 * @param block     coefficient block; overwritten by the IDCT
 */
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_vp3_idct_mmx(block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
398

    
399
void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int linesize, const DCTELEM *block)
400
{
401
    int dc = (block[0] + 15) >> 5;
402

    
403
    __asm__ volatile(
404
        "movd          %3, %%mm0 \n\t"
405
        "pshufw $0, %%mm0, %%mm0 \n\t"
406
        "pxor       %%mm1, %%mm1 \n\t"
407
        "psubw      %%mm0, %%mm1 \n\t"
408
        "packuswb   %%mm0, %%mm0 \n\t"
409
        "packuswb   %%mm1, %%mm1 \n\t"
410

    
411
#define DC_ADD \
412
        "movq        (%0), %%mm2 \n\t" \
413
        "movq     (%0,%1), %%mm3 \n\t" \
414
        "paddusb    %%mm0, %%mm2 \n\t" \
415
        "movq   (%0,%1,2), %%mm4 \n\t" \
416
        "paddusb    %%mm0, %%mm3 \n\t" \
417
        "movq     (%0,%2), %%mm5 \n\t" \
418
        "paddusb    %%mm0, %%mm4 \n\t" \
419
        "paddusb    %%mm0, %%mm5 \n\t" \
420
        "psubusb    %%mm1, %%mm2 \n\t" \
421
        "psubusb    %%mm1, %%mm3 \n\t" \
422
        "movq       %%mm2, (%0)  \n\t" \
423
        "psubusb    %%mm1, %%mm4 \n\t" \
424
        "movq       %%mm3, (%0,%1) \n\t" \
425
        "psubusb    %%mm1, %%mm5 \n\t" \
426
        "movq       %%mm4, (%0,%1,2) \n\t" \
427
        "movq       %%mm5, (%0,%2) \n\t"
428

    
429
        DC_ADD
430
        "lea    (%0,%1,4), %0 \n\t"
431
        DC_ADD
432

    
433
        : "+r"(dest)
434
        : "r"((x86_reg)linesize), "r"((x86_reg)3*linesize), "r"(dc)
435
    );
436
}