Statistics
| Branch: | Revision:

ffmpeg / libavcodec / x86 / simple_idct_mmx.c @ c47d146b

History | View | Annotate | Download (71 KB)

1
/*
2
 * Simple IDCT MMX
3
 *
4
 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * This file is part of FFmpeg.
7
 *
8
 * FFmpeg is free software; you can redistribute it and/or
9
 * modify it under the terms of the GNU Lesser General Public
10
 * License as published by the Free Software Foundation; either
11
 * version 2.1 of the License, or (at your option) any later version.
12
 *
13
 * FFmpeg is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
 * Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public
19
 * License along with FFmpeg; if not, write to the Free Software
20
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
 */
22
#include "libavcodec/dsputil.h"
23
#include "libavcodec/simple_idct.h"
24

    
25
/*
 * Fixed-point cosine factors used by the IDCT:
 *     cos(i*M_PI/16) * sqrt(2) * (1<<14)      for i = 0..7
 *
 *     i = 0: 23170.475006
 *     i = 1: 22725.260826
 *     i = 2: 21406.727617
 *     i = 3: 19265.545870
 *     i = 4: 16384.000000
 *     i = 5: 12872.826198
 *     i = 6:  8866.956905
 *     i = 7:  4520.335430
 *
 * Each Ci below is the value for that i with 0.5 added and the fraction
 * dropped, except C4, where 0.5 is subtracted (16383 instead of 16384).
 */
#define C0 23170 // cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C1 22725 // cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C2 21407 // cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C3 19266 // cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#if 0
#define C4 16384 // cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#else
#define C4 16383 // cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5
#endif
#define C5 12873 // cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C6 8867  // cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C7 4520  // cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5

/* Right shifts applied to the 32-bit sums of the row and column passes. */
#define ROW_SHIFT 11
#define COL_SHIFT 20 // 6
50

    
51
DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
52
DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
53

    
54
/*
 * Constant table for the IDCT: 14 rows of four int16_t (8 bytes per row).
 *
 * The number at the end of each row is its byte offset from the start of the
 * table.  NOTE(review): the asm macros in idct() load from "N(%2)" with
 * exactly these offsets and their register comments agree with the rows
 * below, so %2 is presumably this table -- the asm operand list is not
 * visible from here, confirm before relying on it.
 *
 * Row order and element order must not change: consumers address the table
 * by fixed byte offset.
 */
DECLARE_ALIGNED(8, static const int16_t, coeffs[])= {
        /* rounding constants */
        1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,       /*   0 */
//        1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
//        0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
        1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,       /*   8 */
        // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
//        0, 0, 0, 0,
//        0, 0, 0, 0,

        /* even-part factors */
         C4,  C4,  C4,  C4,                             /*  16 */
         C4, -C4,  C4, -C4,                             /*  24 */

         C2,  C6,  C2,  C6,                             /*  32 */
         C6, -C2,  C6, -C2,                             /*  40 */

        /* odd-part factors */
         C1,  C3,  C1,  C3,                             /*  48 */
         C5,  C7,  C5,  C7,                             /*  56 */

         C3, -C7,  C3, -C7,                             /*  64 */
        -C1, -C5, -C1, -C5,                             /*  72 */

         C5, -C1,  C5, -C1,                             /*  80 */
         C7,  C3,  C7,  C3,                             /*  88 */

         C7, -C5,  C7, -C5,                             /*  96 */
         C3, -C1,  C3, -C1                              /* 104 */
};
81

    
82
#if 0
83
static void unused_var_killer(void)
84
{
85
        int a= wm1010 + d40000;
86
        temp[0]=a;
87
}
88

89
static void inline idctCol (int16_t * col, int16_t *input)
90
{
91
#undef C0
92
#undef C1
93
#undef C2
94
#undef C3
95
#undef C4
96
#undef C5
97
#undef C6
98
#undef C7
99
        int a0, a1, a2, a3, b0, b1, b2, b3;
100
        const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
101
        const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
102
        const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
103
        const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
104
        const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
105
        const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
106
        const int C6 = 8867;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
107
        const int C7 = 4520;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
108
/*
109
        if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
110
                col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
111
                        col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
112
                return;
113
        }*/
114

115
col[8*0] = input[8*0 + 0];
116
col[8*1] = input[8*2 + 0];
117
col[8*2] = input[8*0 + 1];
118
col[8*3] = input[8*2 + 1];
119
col[8*4] = input[8*4 + 0];
120
col[8*5] = input[8*6 + 0];
121
col[8*6] = input[8*4 + 1];
122
col[8*7] = input[8*6 + 1];
123

124
        a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
125
        a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
126
        a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
127
        a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
128

129
        b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
130
        b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
131
        b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
132
        b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
133

134
        col[8*0] = (a0 + b0) >> COL_SHIFT;
135
        col[8*1] = (a1 + b1) >> COL_SHIFT;
136
        col[8*2] = (a2 + b2) >> COL_SHIFT;
137
        col[8*3] = (a3 + b3) >> COL_SHIFT;
138
        col[8*4] = (a3 - b3) >> COL_SHIFT;
139
        col[8*5] = (a2 - b2) >> COL_SHIFT;
140
        col[8*6] = (a1 - b1) >> COL_SHIFT;
141
        col[8*7] = (a0 - b0) >> COL_SHIFT;
142
}
143

144
static void inline idctRow (int16_t * output, int16_t * input)
145
{
146
        int16_t row[8];
147

148
        int a0, a1, a2, a3, b0, b1, b2, b3;
149
        const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
150
        const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
151
        const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
152
        const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
153
        const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
154
        const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
155
        const int C6 = 8867;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
156
        const int C7 = 4520;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
157

158
row[0] = input[0];
159
row[2] = input[1];
160
row[4] = input[4];
161
row[6] = input[5];
162
row[1] = input[8];
163
row[3] = input[9];
164
row[5] = input[12];
165
row[7] = input[13];
166

167
        if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
168
                row[0] = row[1] = row[2] = row[3] = row[4] =
169
                        row[5] = row[6] = row[7] = row[0]<<3;
170
        output[0]  = row[0];
171
        output[2]  = row[1];
172
        output[4]  = row[2];
173
        output[6]  = row[3];
174
        output[8]  = row[4];
175
        output[10] = row[5];
176
        output[12] = row[6];
177
        output[14] = row[7];
178
                return;
179
        }
180

181
        a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
182
        a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
183
        a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
184
        a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
185

186
        b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
187
        b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
188
        b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
189
        b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
190

191
        row[0] = (a0 + b0) >> ROW_SHIFT;
192
        row[1] = (a1 + b1) >> ROW_SHIFT;
193
        row[2] = (a2 + b2) >> ROW_SHIFT;
194
        row[3] = (a3 + b3) >> ROW_SHIFT;
195
        row[4] = (a3 - b3) >> ROW_SHIFT;
196
        row[5] = (a2 - b2) >> ROW_SHIFT;
197
        row[6] = (a1 - b1) >> ROW_SHIFT;
198
        row[7] = (a0 - b0) >> ROW_SHIFT;
199

200
        output[0]  = row[0];
201
        output[2]  = row[1];
202
        output[4]  = row[2];
203
        output[6]  = row[3];
204
        output[8]  = row[4];
205
        output[10] = row[5];
206
        output[12] = row[6];
207
        output[14] = row[7];
208
}
209
#endif
210

    
211
static inline void idct(int16_t *block)
212
{
213
        DECLARE_ALIGNED(8, int64_t, align_tmp[16]);
214
        int16_t * const temp= (int16_t*)align_tmp;
215

    
216
        __asm__ volatile(
217
#if 0 //Alternative, simpler variant
218

219
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
220
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
221
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
222
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
223
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
224
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
225
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
226
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
227
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
228
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
229
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
230
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
231
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
232
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
233
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
234
        #rounder ", %%mm4               \n\t"\
235
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
236
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
237
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
238
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
239
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
240
        #rounder ", %%mm0               \n\t"\
241
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
242
        "paddd %%mm0, %%mm0             \n\t" \
243
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
244
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
245
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
246
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
247
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
248
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
249
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
250
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
251
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
252
        "psrad $" #shift ", %%mm7       \n\t"\
253
        "psrad $" #shift ", %%mm4       \n\t"\
254
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
255
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
256
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
257
        "psrad $" #shift ", %%mm1       \n\t"\
258
        "psrad $" #shift ", %%mm2       \n\t"\
259
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
260
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
261
        "movq %%mm7, " #dst "           \n\t"\
262
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
263
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
264
        "movq %%mm2, 24+" #dst "        \n\t"\
265
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
266
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
267
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
268
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
269
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
270
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
271
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
272
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
273
        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
274
        "psrad $" #shift ", %%mm2       \n\t"\
275
        "psrad $" #shift ", %%mm0       \n\t"\
276
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
277
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
278
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
279
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
280
        "psrad $" #shift ", %%mm6       \n\t"\
281
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
282
        "movq %%mm2, 8+" #dst "         \n\t"\
283
        "psrad $" #shift ", %%mm4       \n\t"\
284
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
285
        "movq %%mm4, 16+" #dst "        \n\t"\
286

287
#define COL_IDCT(src0, src4, src1, src5, dst, shift) \
288
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
289
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
290
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
291
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
292
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
293
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
294
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
295
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
296
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
297
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
298
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
299
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
300
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
301
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
302
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
303
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
304
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
305
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
306
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
307
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
308
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
309
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
310
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
311
        "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
312
        "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
313
        "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
314
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
315
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
316
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
317
        "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
318
        "psrad $" #shift ", %%mm7       \n\t"\
319
        "psrad $" #shift ", %%mm4       \n\t"\
320
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
321
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
322
        "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
323
        "psrad $" #shift ", %%mm0       \n\t"\
324
        "psrad $" #shift ", %%mm2       \n\t"\
325
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
326
        "movd %%mm7, " #dst "           \n\t"\
327
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
328
        "movd %%mm0, 16+" #dst "        \n\t"\
329
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
330
        "movd %%mm2, 96+" #dst "        \n\t"\
331
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
332
        "movd %%mm4, 112+" #dst "       \n\t"\
333
        "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
334
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
335
        "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
336
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
337
        "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
338
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
339
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
340
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
341
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
342
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
343
        "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
344
        "psrad $" #shift ", %%mm2       \n\t"\
345
        "psrad $" #shift ", %%mm5       \n\t"\
346
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
347
        "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
348
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
349
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
350
        "psrad $" #shift ", %%mm6       \n\t"\
351
        "psrad $" #shift ", %%mm4       \n\t"\
352
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
353
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
354
        "movd %%mm2, 32+" #dst "        \n\t"\
355
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
356
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
357
        "movd %%mm6, 48+" #dst "        \n\t"\
358
        "movd %%mm4, 64+" #dst "        \n\t"\
359
        "movd %%mm5, 80+" #dst "        \n\t"\
360

361

362
#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
363
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
364
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
365
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
366
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
367
        "movq "MANGLE(wm1010)", %%mm4   \n\t"\
368
        "pand %%mm0, %%mm4              \n\t"\
369
        "por %%mm1, %%mm4               \n\t"\
370
        "por %%mm2, %%mm4               \n\t"\
371
        "por %%mm3, %%mm4               \n\t"\
372
        "packssdw %%mm4,%%mm4           \n\t"\
373
        "movd %%mm4, %%eax              \n\t"\
374
        "orl %%eax, %%eax               \n\t"\
375
        "jz 1f                          \n\t"\
376
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
377
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
378
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
379
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
380
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
381
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
382
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
383
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
384
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
385
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
386
        #rounder ", %%mm4               \n\t"\
387
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
388
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
389
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
390
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
391
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
392
        #rounder ", %%mm0               \n\t"\
393
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
394
        "paddd %%mm0, %%mm0             \n\t" \
395
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
396
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
397
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
398
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
399
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
400
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
401
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
402
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
403
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
404
        "psrad $" #shift ", %%mm7       \n\t"\
405
        "psrad $" #shift ", %%mm4       \n\t"\
406
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
407
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
408
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
409
        "psrad $" #shift ", %%mm1       \n\t"\
410
        "psrad $" #shift ", %%mm2       \n\t"\
411
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
412
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
413
        "movq %%mm7, " #dst "           \n\t"\
414
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
415
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
416
        "movq %%mm2, 24+" #dst "        \n\t"\
417
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
418
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
419
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
420
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
421
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
422
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
423
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
424
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
425
        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
426
        "psrad $" #shift ", %%mm2       \n\t"\
427
        "psrad $" #shift ", %%mm0       \n\t"\
428
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
429
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
430
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
431
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
432
        "psrad $" #shift ", %%mm6       \n\t"\
433
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
434
        "movq %%mm2, 8+" #dst "         \n\t"\
435
        "psrad $" #shift ", %%mm4       \n\t"\
436
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
437
        "movq %%mm4, 16+" #dst "        \n\t"\
438
        "jmp 2f                         \n\t"\
439
        "1:                             \n\t"\
440
        "pslld $16, %%mm0               \n\t"\
441
        "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
442
        "psrad $13, %%mm0               \n\t"\
443
        "packssdw %%mm0, %%mm0          \n\t"\
444
        "movq %%mm0, " #dst "           \n\t"\
445
        "movq %%mm0, 8+" #dst "         \n\t"\
446
        "movq %%mm0, 16+" #dst "        \n\t"\
447
        "movq %%mm0, 24+" #dst "        \n\t"\
448
        "2:                             \n\t"
449

450

451
//IDCT(      src0,   src4,   src1,   src5,    dst,    rounder, shift)
452
ROW_IDCT(    (%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
453
/*ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
454
ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
455
ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
456

457
DC_COND_ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
458
DC_COND_ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
459
DC_COND_ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
460

461

462
//IDCT(      src0,   src4,   src1,    src5,    dst, shift)
463
COL_IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
464
COL_IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
465
COL_IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
466
COL_IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
467

468
#else
469

    
470
#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
471
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
472
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
473
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
474
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
475
        "movq "MANGLE(wm1010)", %%mm4   \n\t"\
476
        "pand %%mm0, %%mm4              \n\t"\
477
        "por %%mm1, %%mm4               \n\t"\
478
        "por %%mm2, %%mm4               \n\t"\
479
        "por %%mm3, %%mm4               \n\t"\
480
        "packssdw %%mm4,%%mm4           \n\t"\
481
        "movd %%mm4, %%eax              \n\t"\
482
        "orl %%eax, %%eax               \n\t"\
483
        "jz 1f                          \n\t"\
484
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
485
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
486
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
487
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
488
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
489
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
490
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
491
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
492
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
493
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
494
        #rounder ", %%mm4               \n\t"\
495
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
496
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
497
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
498
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
499
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
500
        #rounder ", %%mm0               \n\t"\
501
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
502
        "paddd %%mm0, %%mm0             \n\t" \
503
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
504
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
505
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
506
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
507
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
508
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
509
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
510
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
511
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
512
        "psrad $" #shift ", %%mm7       \n\t"\
513
        "psrad $" #shift ", %%mm4       \n\t"\
514
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
515
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
516
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
517
        "psrad $" #shift ", %%mm1       \n\t"\
518
        "psrad $" #shift ", %%mm2       \n\t"\
519
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
520
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
521
        "movq %%mm7, " #dst "           \n\t"\
522
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
523
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
524
        "movq %%mm2, 24+" #dst "        \n\t"\
525
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
526
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
527
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
528
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
529
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
530
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
531
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
532
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
533
        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
534
        "psrad $" #shift ", %%mm2       \n\t"\
535
        "psrad $" #shift ", %%mm0       \n\t"\
536
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
537
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
538
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
539
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
540
        "psrad $" #shift ", %%mm6       \n\t"\
541
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
542
        "movq %%mm2, 8+" #dst "         \n\t"\
543
        "psrad $" #shift ", %%mm4       \n\t"\
544
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
545
        "movq %%mm4, 16+" #dst "        \n\t"\
546
        "jmp 2f                         \n\t"\
547
        "1:                             \n\t"\
548
        "pslld $16, %%mm0               \n\t"\
549
        "paddd "MANGLE(d40000)", %%mm0  \n\t"\
550
        "psrad $13, %%mm0               \n\t"\
551
        "packssdw %%mm0, %%mm0          \n\t"\
552
        "movq %%mm0, " #dst "           \n\t"\
553
        "movq %%mm0, 8+" #dst "         \n\t"\
554
        "movq %%mm0, 16+" #dst "        \n\t"\
555
        "movq %%mm0, 24+" #dst "        \n\t"\
556
        "2:                             \n\t"
557

    
558
/*
 * Z_COND_IDCT: emit the asm text for one IDCT pass over four input quadwords,
 * skipping the whole computation when every input quadword is zero.
 *
 *   src0, src4, src1, src5  memory operands of the inputs; per the lane
 *                           comments they hold (R4 R0 r4 r0), (R6 R2 r6 r2),
 *                           (R3 R1 r3 r1) and (R7 R5 r7 r5)
 *   dst                     memory operand of the output; four quadwords are
 *                           stored at dst, 8+dst, 16+dst and 24+dst
 *   rounder                 instruction text without its destination (the
 *                           invocations pass "paddd (%2)"); it is applied to
 *                           %%mm4 and to %%mm0, i.e. to both C4 sums
 *   shift                   arithmetic right shift applied to every 32-bit sum
 *   bt                      label to jump to when the OR of all four inputs is
 *                           zero; in that case nothing is stored to dst
 *
 * Coefficient pairs are read from the table addressed by %2 (offsets 16..104).
 * Sums are packed to 16 bits with signed saturation (packssdw) before storing.
 * Uses %%mm0-%%mm7 and %%eax as scratch.
 */
#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
559
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
560
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
561
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
562
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
563
        "movq %%mm0, %%mm4              \n\t" /* start of zero test: OR all inputs */\
564
        "por %%mm1, %%mm4               \n\t"\
565
        "por %%mm2, %%mm4               \n\t"\
566
        "por %%mm3, %%mm4               \n\t"\
567
        "packssdw %%mm4,%%mm4           \n\t" /* saturating pack: a nonzero dword stays nonzero */\
568
        "movd %%mm4, %%eax              \n\t"\
569
        "orl %%eax, %%eax               \n\t" /* sets ZF when all inputs were zero */\
570
        "jz " #bt "                     \n\t" /* all zero: skip, dst is left unwritten */\
571
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
572
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
573
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
574
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
575
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
576
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
577
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
578
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
579
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
580
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
581
        #rounder ", %%mm4               \n\t"\
582
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
583
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
584
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
585
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
586
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
587
        #rounder ", %%mm0               \n\t"\
588
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
589
        "paddd %%mm0, %%mm0             \n\t" /* doubled so that the psubd below leaves A2 = 2x - A1 */\
590
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
591
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
592
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
593
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
594
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
595
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
596
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
597
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
598
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
599
        "psrad $" #shift ", %%mm7       \n\t"\
600
        "psrad $" #shift ", %%mm4       \n\t"\
601
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
602
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
603
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
604
        "psrad $" #shift ", %%mm1       \n\t"\
605
        "psrad $" #shift ", %%mm2       \n\t"\
606
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
607
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
608
        "movq %%mm7, " #dst "           \n\t"\
609
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
610
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
611
        "movq %%mm2, 24+" #dst "        \n\t"\
612
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
613
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
614
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
615
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
616
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
617
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
618
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
619
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
620
        "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
621
        "psrad $" #shift ", %%mm2       \n\t"\
622
        "psrad $" #shift ", %%mm0       \n\t"\
623
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
624
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
625
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
626
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
627
        "psrad $" #shift ", %%mm6       \n\t"\
628
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
629
        "movq %%mm2, 8+" #dst "         \n\t"\
630
        "psrad $" #shift ", %%mm4       \n\t"\
631
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
632
        "movq %%mm4, 16+" #dst "        \n\t"\
633

    
634
/*
 * ROW_IDCT: emit the asm text for one unconditional IDCT pass over four input
 * quadwords (no zero test, no branch).
 *
 *   src0, src4, src1, src5  memory operands of the inputs; per the lane
 *                           comments they hold (R4 R0 r4 r0), (R6 R2 r6 r2),
 *                           (R3 R1 r3 r1) and (R7 R5 r7 r5)
 *   dst                     memory operand of the output; four quadwords are
 *                           stored at dst, 8+dst, 16+dst and 24+dst
 *   rounder                 instruction text without its destination; it is
 *                           applied to %%mm4 and to %%mm0, i.e. to both C4 sums
 *   shift                   arithmetic right shift applied to every 32-bit sum
 *
 * Coefficient pairs are read from the table addressed by %2 (offsets 16..104).
 * Sums are packed to 16 bits with signed saturation (packssdw) before storing.
 * Uses %%mm0-%%mm7 as scratch.
 */
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
635
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
636
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
637
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
638
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
639
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
640
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
641
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
642
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
643
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
644
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
645
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
646
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
647
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
648
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
649
        #rounder ", %%mm4               \n\t"\
650
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
651
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
652
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
653
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
654
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
655
        #rounder ", %%mm0               \n\t"\
656
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
657
        "paddd %%mm0, %%mm0             \n\t" /* doubled so that the psubd below leaves A2 = 2x - A1 */\
658
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
659
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
660
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
661
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
662
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
663
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
664
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
665
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
666
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
667
        "psrad $" #shift ", %%mm7       \n\t"\
668
        "psrad $" #shift ", %%mm4       \n\t"\
669
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
670
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
671
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
672
        "psrad $" #shift ", %%mm1       \n\t"\
673
        "psrad $" #shift ", %%mm2       \n\t"\
674
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
675
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
676
        "movq %%mm7, " #dst "           \n\t"\
677
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
678
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
679
        "movq %%mm2, 24+" #dst "        \n\t"\
680
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
681
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
682
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
683
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
684
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
685
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
686
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
687
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
688
        "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
689
        "psrad $" #shift ", %%mm2       \n\t"\
690
        "psrad $" #shift ", %%mm0       \n\t"\
691
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
692
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
693
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
694
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
695
        "psrad $" #shift ", %%mm6       \n\t"\
696
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
697
        "movq %%mm2, 8+" #dst "         \n\t"\
698
        "psrad $" #shift ", %%mm4       \n\t"\
699
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
700
        "movq %%mm4, 16+" #dst "        \n\t"\
701

    
702
// DC_COND_IDCT / Z_COND_IDCT( src0, src4, src1, src5, dst, rounder, shift [, bt -- Z_COND_IDCT only] )
703
DC_COND_IDCT(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
704
Z_COND_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
705
Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
706
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
707

    
708
#undef IDCT
709
/*
 * IDCT (variant reading all four inputs): emit the asm text for one IDCT pass
 * with no rounder and no zero test.
 *
 *   src0, src4, src1, src5  memory operands of the inputs; per the lane
 *                           comments they hold (R4 R0 r4 r0), (R6 R2 r6 r2),
 *                           (R3 R1 r3 r1) and (R7 R5 r7 r5)
 *   dst                     memory operand of the output; eight 32-bit stores
 *                           (movd), each carrying two saturated 16-bit results:
 *                             dst    A0+B0     16+dst A1+B1
 *                             32+dst A2+B2     48+dst A3+B3
 *                             64+dst A3-B3     80+dst A2-B2
 *                             96+dst A1-B1    112+dst A0-B0
 *   shift                   arithmetic right shift applied to every 32-bit sum
 *
 * Coefficient pairs are read from the table addressed by %2 (offsets 16..104).
 * Uses %%mm0-%%mm7 as scratch.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
710
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
711
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
712
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
713
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
714
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
715
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
716
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
717
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
718
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
719
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
720
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
721
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
722
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
723
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
724
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
725
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
726
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
727
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
728
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
729
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
730
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
731
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
732
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
733
        "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
734
        "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
735
        "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
736
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
737
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
738
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
739
        "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
740
        "psrad $" #shift ", %%mm7       \n\t"\
741
        "psrad $" #shift ", %%mm4       \n\t"\
742
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
743
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
744
        "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
745
        "psrad $" #shift ", %%mm0       \n\t"\
746
        "psrad $" #shift ", %%mm2       \n\t"\
747
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
748
        "movd %%mm7, " #dst "           \n\t"\
749
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
750
        "movd %%mm0, 16+" #dst "        \n\t"\
751
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
752
        "movd %%mm2, 96+" #dst "        \n\t"\
753
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
754
        "movd %%mm4, 112+" #dst "       \n\t"\
755
        "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
756
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
757
        "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
758
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
759
        "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
760
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
761
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
762
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
763
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
764
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
765
        "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
766
        "psrad $" #shift ", %%mm2       \n\t"\
767
        "psrad $" #shift ", %%mm5       \n\t"\
768
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
769
        "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
770
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
771
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
772
        "psrad $" #shift ", %%mm6       \n\t"\
773
        "psrad $" #shift ", %%mm4       \n\t"\
774
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
775
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
776
        "movd %%mm2, 32+" #dst "        \n\t"\
777
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
778
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
779
        "movd %%mm6, 48+" #dst "        \n\t"\
780
        "movd %%mm4, 64+" #dst "        \n\t"\
781
        "movd %%mm5, 80+" #dst "        \n\t"
782

    
783

    
784
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
785
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
786
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
787
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
788
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
789
        "jmp 9f                         \n\t"
790

    
791
        "#" ASMALIGN(4)                      \
792
        "4:                             \n\t"
793
Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
794
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
795

    
796
#undef IDCT
797
/*
 * IDCT (variant that never reads src1): emit the asm text for one IDCT pass
 * with no rounder and no zero test.
 *
 *   src0, src4, src5  memory operands of the inputs; per the lane comments
 *                     they hold (R4 R0 r4 r0), (R6 R2 r6 r2), (R7 R5 r7 r5)
 *   src1              accepted so the argument list matches the other IDCT
 *                     definitions, but it does not appear in the expansion;
 *                     the B terms are therefore built from src5 alone
 *   dst               memory operand of the output; eight 32-bit stores (movd),
 *                     each carrying two saturated 16-bit results:
 *                       dst    A0+B0     16+dst A1+B1
 *                       32+dst A2+B2     48+dst A3+B3
 *                       64+dst A3-B3     80+dst A2-B2
 *                       96+dst A1-B1    112+dst A0-B0
 *   shift             arithmetic right shift applied to every 32-bit sum
 *
 * Coefficient pairs are read from the table addressed by %2.
 * Uses %%mm0-%%mm7 as scratch.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
798
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
799
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
800
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
801
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
802
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
803
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
804
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
805
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
806
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
807
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
808
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
809
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
810
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
811
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
812
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
813
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
814
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
815
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
816
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
817
        "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
818
        "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
819
        "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
820
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
821
        "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
822
        "psrad $" #shift ", %%mm1       \n\t"\
823
        "psrad $" #shift ", %%mm4       \n\t"\
824
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
825
        "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
826
        "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
827
        "psrad $" #shift ", %%mm0       \n\t"\
828
        "psrad $" #shift ", %%mm2       \n\t"\
829
        "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
830
        "movd %%mm1, " #dst "           \n\t"\
831
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
832
        "movd %%mm0, 16+" #dst "        \n\t"\
833
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
834
        "movd %%mm2, 96+" #dst "        \n\t"\
835
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
836
        "movd %%mm4, 112+" #dst "       \n\t"\
837
        "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
838
        "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
839
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
840
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
841
        "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
842
        "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
843
        "psrad $" #shift ", %%mm2       \n\t"\
844
        "psrad $" #shift ", %%mm5       \n\t"\
845
        "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
846
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
847
        "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
848
        "psrad $" #shift ", %%mm6       \n\t"\
849
        "psrad $" #shift ", %%mm1       \n\t"\
850
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
851
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
852
        "movd %%mm2, 32+" #dst "        \n\t"\
853
        "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
854
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
855
        "movd %%mm6, 48+" #dst "        \n\t"\
856
        "movd %%mm1, 64+" #dst "        \n\t"\
857
        "movd %%mm5, 80+" #dst "        \n\t"
858

    
859
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
860
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
861
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
862
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
863
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
864
        "jmp 9f                         \n\t"
865

    
866
        "#" ASMALIGN(4)                      \
867
        "6:                             \n\t"
868
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
869

    
870
#undef IDCT
871
/*
 * IDCT (variant that reads only src0 and src5): emit the asm text for one IDCT
 * pass with no rounder and no zero test.
 *
 *   src0, src5   memory operands of the inputs; per the lane comments they
 *                hold (R4 R0 r4 r0) and (R7 R5 r7 r5)
 *   src4, src1   accepted so the argument list matches the other IDCT
 *                definitions, but they do not appear in the expansion.
 *                Consequently A0 and A3 are both the plain C4R4+C4R0 sum,
 *                A1 and A2 are both -C4R4+C4R0, and the B terms are built
 *                from src5 alone
 *   dst          memory operand of the output; eight 32-bit stores (movd),
 *                each carrying two saturated 16-bit results:
 *                  dst    A0+B0     16+dst A1+B1
 *                  32+dst A2+B2     48+dst A3+B3
 *                  64+dst A3-B3     80+dst A2-B2
 *                  96+dst A1-B1    112+dst A0-B0
 *   shift        arithmetic right shift applied to every 32-bit sum
 *
 * Coefficient pairs are read from the table addressed by %2.
 * Uses %%mm0-%%mm7 as scratch.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
872
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
873
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
874
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
875
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
876
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
877
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
878
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
879
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
880
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
881
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
882
        "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
883
        "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
884
        "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
885
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
886
        "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
887
        "psrad $" #shift ", %%mm1       \n\t"\
888
        "psrad $" #shift ", %%mm4       \n\t"\
889
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
890
        "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
891
        "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
892
        "psrad $" #shift ", %%mm0       \n\t"\
893
        "psrad $" #shift ", %%mm2       \n\t"\
894
        "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
895
        "movd %%mm1, " #dst "           \n\t"\
896
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
897
        "movd %%mm0, 16+" #dst "        \n\t"\
898
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
899
        "movd %%mm2, 96+" #dst "        \n\t"\
900
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
901
        "movd %%mm4, 112+" #dst "       \n\t"\
902
        "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
903
        "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
904
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
905
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
906
        "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
907
        "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
908
        "psrad $" #shift ", %%mm2       \n\t"\
909
        "psrad $" #shift ", %%mm5       \n\t"\
910
        "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
911
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
912
        "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
913
        "psrad $" #shift ", %%mm6       \n\t"\
914
        "psrad $" #shift ", %%mm1       \n\t"\
915
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
916
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
917
        "movd %%mm2, 32+" #dst "        \n\t"\
918
        "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
919
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
920
        "movd %%mm6, 48+" #dst "        \n\t"\
921
        "movd %%mm1, 64+" #dst "        \n\t"\
922
        "movd %%mm5, 80+" #dst "        \n\t"
923

    
924

    
925
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
926
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
927
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
928
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
929
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
930
        "jmp 9f                         \n\t"
931

    
932
        "#" ASMALIGN(4)                      \
933
        "2:                             \n\t"
934
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
935

    
936
#undef IDCT
937
/*
 * IDCT (variant that never reads src4): emit the asm text for one IDCT pass
 * with no rounder and no zero test.
 *
 *   src0, src1, src5  memory operands of the inputs; per the lane comments
 *                     they hold (R4 R0 r4 r0), (R3 R1 r3 r1), (R7 R5 r7 r5)
 *   src4              accepted so the argument list matches the other IDCT
 *                     definitions, but it does not appear in the expansion.
 *                     Consequently A0 and A3 are both the plain C4R4+C4R0 sum
 *                     and A1 and A2 are both -C4R4+C4R0
 *   dst               memory operand of the output; eight 32-bit stores (movd),
 *                     each carrying two saturated 16-bit results:
 *                       dst    A0+B0     16+dst A1+B1
 *                       32+dst A2+B2     48+dst A3+B3
 *                       64+dst A3-B3     80+dst A2-B2
 *                       96+dst A1-B1    112+dst A0-B0
 *   shift             arithmetic right shift applied to every 32-bit sum
 *
 * Coefficient pairs are read from the table addressed by %2.
 * Uses %%mm0-%%mm7 as scratch.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
938
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
939
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
940
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
941
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
942
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
943
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
944
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
945
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
946
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
947
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
948
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
949
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
950
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
951
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
952
        "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
953
        "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
954
        "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
955
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
956
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
957
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
958
        "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
959
        "psrad $" #shift ", %%mm7       \n\t"\
960
        "psrad $" #shift ", %%mm4       \n\t"\
961
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
962
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
963
        "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
964
        "psrad $" #shift ", %%mm0       \n\t"\
965
        "psrad $" #shift ", %%mm2       \n\t"\
966
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
967
        "movd %%mm7, " #dst "           \n\t"\
968
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
969
        "movd %%mm0, 16+" #dst "        \n\t"\
970
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
971
        "movd %%mm2, 96+" #dst "        \n\t"\
972
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
973
        "movd %%mm4, 112+" #dst "       \n\t"\
974
        "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
975
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
976
        "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
977
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
978
        "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
979
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
980
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
981
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
982
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
983
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
984
        "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
985
        "psrad $" #shift ", %%mm2       \n\t"\
986
        "psrad $" #shift ", %%mm5       \n\t"\
987
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
988
        "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
989
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
990
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
991
        "psrad $" #shift ", %%mm6       \n\t"\
992
        "psrad $" #shift ", %%mm4       \n\t"\
993
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
994
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
995
        "movd %%mm2, 32+" #dst "        \n\t"\
996
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
997
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
998
        "movd %%mm6, 48+" #dst "        \n\t"\
999
        "movd %%mm4, 64+" #dst "        \n\t"\
1000
        "movd %%mm5, 80+" #dst "        \n\t"
1001

    
1002
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1003
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1004
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1005
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1006
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1007
        "jmp 9f                         \n\t"
1008

    
1009
        "#" ASMALIGN(4)                      \
1010
        "3:                             \n\t"
1011
#undef IDCT
1012
#define IDCT(src0, src4, src1, src5, dst, shift) \
1013
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1014
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
1015
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1016
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1017
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1018
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1019
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1020
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
1021
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1022
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1023
        "movq 64(%2), %%mm3             \n\t"\
1024
        "pmaddwd %%mm2, %%mm3           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1025
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
1026
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
1027
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
1028
        "psrad $" #shift ", %%mm7       \n\t"\
1029
        "psrad $" #shift ", %%mm4       \n\t"\
1030
        "movq %%mm0, %%mm1              \n\t" /* A1             a1 */\
1031
        "paddd %%mm3, %%mm0             \n\t" /* A1+B1          a1+b1 */\
1032
        "psubd %%mm3, %%mm1             \n\t" /* A1-B1          a1-b1 */\
1033
        "psrad $" #shift ", %%mm0       \n\t"\
1034
        "psrad $" #shift ", %%mm1       \n\t"\
1035
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
1036
        "movd %%mm7, " #dst "           \n\t"\
1037
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
1038
        "movd %%mm0, 16+" #dst "        \n\t"\
1039
        "packssdw %%mm1, %%mm1          \n\t" /* A1-B1  a1-b1 */\
1040
        "movd %%mm1, 96+" #dst "        \n\t"\
1041
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
1042
        "movd %%mm4, 112+" #dst "       \n\t"\
1043
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
1044
        "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1045
        "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1046
        "movq %%mm5, %%mm1              \n\t" /* A2             a2 */\
1047
        "paddd %%mm4, %%mm1             \n\t" /* A2+B2          a2+b2 */\
1048
        "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
1049
        "psrad $" #shift ", %%mm1       \n\t"\
1050
        "psrad $" #shift ", %%mm5       \n\t"\
1051
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
1052
        "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
1053
        "psubd %%mm2, %%mm4             \n\t" /* a3-B3          a3-b3 */\
1054
        "psrad $" #shift ", %%mm6       \n\t"\
1055
        "psrad $" #shift ", %%mm4       \n\t"\
1056
        "packssdw %%mm1, %%mm1          \n\t" /* A2+B2  a2+b2 */\
1057
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1058
        "movd %%mm1, 32+" #dst "        \n\t"\
1059
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
1060
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1061
        "movd %%mm6, 48+" #dst "        \n\t"\
1062
        "movd %%mm4, 64+" #dst "        \n\t"\
1063
        "movd %%mm5, 80+" #dst "        \n\t"
1064

    
1065

    
1066
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1067
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1068
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1069
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1070
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1071
        "jmp 9f                         \n\t"
1072

    
1073
        "#" ASMALIGN(4)                      \
1074
        "5:                             \n\t"
1075
#undef IDCT
1076
#define IDCT(src0, src4, src1, src5, dst, shift) \
1077
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1078
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
1079
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1080
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1081
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1082
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1083
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
1084
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1085
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
1086
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1087
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1088
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
1089
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
1090
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1091
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
1092
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
1093
        "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
1094
        "movq 8+" #src4 ", %%mm3        \n\t" /* R6     R2      r6      r2 */\
1095
        "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
1096
        "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1097
        "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
1098
        "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1099
        "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 */\
1100
        "pmaddwd %%mm3, %%mm7           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1101
        "pmaddwd 40(%2), %%mm3          \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1102
        "paddd %%mm1, %%mm7             \n\t" /* A0             a0 */\
1103
        "paddd %%mm1, %%mm1             \n\t" /* 2C0            2c0 */\
1104
        "psubd %%mm7, %%mm1             \n\t" /* A3             a3 */\
1105
        "paddd %%mm2, %%mm3             \n\t" /* A1             a1 */\
1106
        "paddd %%mm2, %%mm2             \n\t" /* 2C1            2c1 */\
1107
        "psubd %%mm3, %%mm2             \n\t" /* A2             a2 */\
1108
        "psrad $" #shift ", %%mm4       \n\t"\
1109
        "psrad $" #shift ", %%mm7       \n\t"\
1110
        "psrad $" #shift ", %%mm3       \n\t"\
1111
        "packssdw %%mm7, %%mm4          \n\t" /* A0     a0 */\
1112
        "movq %%mm4, " #dst "           \n\t"\
1113
        "psrad $" #shift ", %%mm0       \n\t"\
1114
        "packssdw %%mm3, %%mm0          \n\t" /* A1     a1 */\
1115
        "movq %%mm0, 16+" #dst "        \n\t"\
1116
        "movq %%mm0, 96+" #dst "        \n\t"\
1117
        "movq %%mm4, 112+" #dst "       \n\t"\
1118
        "psrad $" #shift ", %%mm5       \n\t"\
1119
        "psrad $" #shift ", %%mm6       \n\t"\
1120
        "psrad $" #shift ", %%mm2       \n\t"\
1121
        "packssdw %%mm2, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1122
        "movq %%mm5, 32+" #dst "        \n\t"\
1123
        "psrad $" #shift ", %%mm1       \n\t"\
1124
        "packssdw %%mm1, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1125
        "movq %%mm6, 48+" #dst "        \n\t"\
1126
        "movq %%mm6, 64+" #dst "        \n\t"\
1127
        "movq %%mm5, 80+" #dst "        \n\t"
1128

    
1129

    
1130
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1131
IDCT(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1132
//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1133
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1134
//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1135
        "jmp 9f                         \n\t"
1136

    
1137

    
1138
        "#" ASMALIGN(4)                      \
1139
        "1:                             \n\t"
1140
#undef IDCT
1141
#define IDCT(src0, src4, src1, src5, dst, shift) \
1142
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1143
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
1144
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
1145
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1146
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1147
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1148
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1149
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
1150
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1151
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
1152
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1153
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1154
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
1155
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1156
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
1157
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
1158
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1159
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
1160
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
1161
        "movq 64(%2), %%mm1             \n\t"\
1162
        "pmaddwd %%mm2, %%mm1           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1163
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
1164
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
1165
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
1166
        "psrad $" #shift ", %%mm7       \n\t"\
1167
        "psrad $" #shift ", %%mm4       \n\t"\
1168
        "movq %%mm0, %%mm3              \n\t" /* A1             a1 */\
1169
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
1170
        "psubd %%mm1, %%mm3             \n\t" /* A1-B1          a1-b1 */\
1171
        "psrad $" #shift ", %%mm0       \n\t"\
1172
        "psrad $" #shift ", %%mm3       \n\t"\
1173
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
1174
        "movd %%mm7, " #dst "           \n\t"\
1175
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
1176
        "movd %%mm0, 16+" #dst "        \n\t"\
1177
        "packssdw %%mm3, %%mm3          \n\t" /* A1-B1  a1-b1 */\
1178
        "movd %%mm3, 96+" #dst "        \n\t"\
1179
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
1180
        "movd %%mm4, 112+" #dst "       \n\t"\
1181
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
1182
        "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1183
        "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1184
        "movq %%mm5, %%mm3              \n\t" /* A2             a2 */\
1185
        "paddd %%mm4, %%mm3             \n\t" /* A2+B2          a2+b2 */\
1186
        "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
1187
        "psrad $" #shift ", %%mm3       \n\t"\
1188
        "psrad $" #shift ", %%mm5       \n\t"\
1189
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
1190
        "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
1191
        "psubd %%mm2, %%mm4             \n\t" /* a3-B3          a3-b3 */\
1192
        "psrad $" #shift ", %%mm6       \n\t"\
1193
        "packssdw %%mm3, %%mm3          \n\t" /* A2+B2  a2+b2 */\
1194
        "movd %%mm3, 32+" #dst "        \n\t"\
1195
        "psrad $" #shift ", %%mm4       \n\t"\
1196
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1197
        "movd %%mm6, 48+" #dst "        \n\t"\
1198
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
1199
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1200
        "movd %%mm4, 64+" #dst "        \n\t"\
1201
        "movd %%mm5, 80+" #dst "        \n\t"
1202

    
1203

    
1204
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1205
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1206
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1207
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1208
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1209
        "jmp 9f                         \n\t"
1210

    
1211

    
1212
        "#" ASMALIGN(4)
1213
        "7:                             \n\t"
1214
#undef IDCT
1215
#define IDCT(src0, src4, src1, src5, dst, shift) \
1216
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1217
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1218
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1219
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1220
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1221
        "psrad $" #shift ", %%mm4       \n\t"\
1222
        "psrad $" #shift ", %%mm0       \n\t"\
1223
        "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
1224
        "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
1225
        "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1226
        "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
1227
        "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1228
        "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 */\
1229
        "psrad $" #shift ", %%mm1       \n\t"\
1230
        "packssdw %%mm1, %%mm4          \n\t" /* A0     a0 */\
1231
        "movq %%mm4, " #dst "           \n\t"\
1232
        "psrad $" #shift ", %%mm2       \n\t"\
1233
        "packssdw %%mm2, %%mm0          \n\t" /* A1     a1 */\
1234
        "movq %%mm0, 16+" #dst "        \n\t"\
1235
        "movq %%mm0, 96+" #dst "        \n\t"\
1236
        "movq %%mm4, 112+" #dst "       \n\t"\
1237
        "movq %%mm0, 32+" #dst "        \n\t"\
1238
        "movq %%mm4, 48+" #dst "        \n\t"\
1239
        "movq %%mm4, 64+" #dst "        \n\t"\
1240
        "movq %%mm0, 80+" #dst "        \n\t"
1241

    
1242
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1243
IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1244
//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1245
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1246
//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1247

    
1248

    
1249
#endif
1250

    
1251
/*
1252
Input
1253
 00 40 04 44 20 60 24 64
1254
 10 30 14 34 50 70 54 74
1255
 01 41 03 43 21 61 23 63
1256
 11 31 13 33 51 71 53 73
1257
 02 42 06 46 22 62 26 66
1258
 12 32 16 36 52 72 56 76
1259
 05 45 07 47 25 65 27 67
1260
 15 35 17 37 55 75 57 77
1261

1262
Temp
1263
 00 04 10 14 20 24 30 34
1264
 40 44 50 54 60 64 70 74
1265
 01 03 11 13 21 23 31 33
1266
 41 43 51 53 61 63 71 73
1267
 02 06 12 16 22 26 32 36
1268
 42 46 52 56 62 66 72 76
1269
 05 07 15 17 25 27 35 37
1270
 45 47 55 57 65 67 75 77
1271
*/
1272

    
1273
"9: \n\t"
1274
                :: "r" (block), "r" (temp), "r" (coeffs)
1275
                : "%eax"
1276
        );
1277
}
1278

    
1279
/**
 * Public entry point for the MMX simple IDCT.
 *
 * Forwards directly to the file-local idct() routine, whose final pass
 * stores its results through the block pointer, so the transform is done
 * in place.
 *
 * @param block coefficient block, overwritten with the transformed values
 *              (presumably 8x8 = 64 int16_t entries; confirm against callers)
 */
void ff_simple_idct_mmx(int16_t *block)
{
    idct(block);
}
1283

    
1284
//FIXME merge add/put into the idct
1285

    
1286
/**
 * Run the MMX simple IDCT on a block and hand the result to
 * put_pixels_clamped_mmx().
 *
 * The block is transformed in place first; the transformed block is then
 * passed, together with dest and line_size, to put_pixels_clamped_mmx().
 *
 * @param dest      destination pixel buffer forwarded to put_pixels_clamped_mmx()
 * @param line_size forwarded unchanged to put_pixels_clamped_mmx()
 *                  (presumably the destination stride in bytes; confirm)
 * @param block     coefficient block; overwritten by the IDCT
 */
void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    /* The transform must finish before the pixels are written out. */
    idct(block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
1291
/**
 * Run the MMX simple IDCT on a block and hand the result to
 * add_pixels_clamped_mmx().
 *
 * The block is transformed in place first; the transformed block is then
 * passed, together with dest and line_size, to add_pixels_clamped_mmx().
 *
 * @param dest      pixel buffer forwarded to add_pixels_clamped_mmx()
 *                  (presumably the residual is added onto its existing
 *                  contents; confirm against that helper)
 * @param line_size forwarded unchanged to add_pixels_clamped_mmx()
 *                  (presumably the destination stride in bytes; confirm)
 * @param block     coefficient block; overwritten by the IDCT
 */
void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    /* The transform must finish before the result is combined with dest. */
    idct(block);
    add_pixels_clamped_mmx(block, dest, line_size);
}