/* Source: ffmpeg / libavcodec / i386 / simple_idct_mmx.c @ 40d0e665 */
/*
 * Simple IDCT MMX
 *
 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
22
#include "dsputil.h"
23
#include "simple_idct.h"
24

    
/*
Unrounded values of cos(i*M_PI/16)*sqrt(2)*(1<<14) for i = 0..7:
23170.475006
22725.260826
21406.727617
19265.545870
16384.000000
12872.826198
8866.956905
4520.335430
*/
/*
 * Fixed-point IDCT coefficients: Ci is cos(i*M_PI/16)*sqrt(2)*(1<<14)
 * converted to an integer.  All of them are rounded to nearest except C4,
 * whose active definition is 16383, i.e. one below the exact value 16384
 * (the exact variant is kept in the disabled #if branch).
 */
#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#if 0
#define C4 16384 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#else
#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
#endif
#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C6 8867  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C7 4520  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5

/* Right shift applied to the 32-bit sums at the end of the row pass. */
#define ROW_SHIFT 11
/* Right shift applied to the 32-bit sums at the end of the column pass. */
#define COL_SHIFT 20 // 6
50

    
51
DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
52
DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
53

    
54
/*
 * Constant table read by the inline assembly.  Every row below is four
 * int16_t values, i.e. one 8-byte quadword; the byte offset of each row is
 * noted on the right and matches the "N(%2)" operands used by the asm
 * macros (presumably %2 is bound to this table -- the operand list is not
 * part of this view).
 */
DECLARE_ALIGNED(8, static const int16_t, coeffs[])= {
        /* row-pass rounders, added with paddd */
        1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,       /*   0 */
//        1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
//        0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
        1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,       /*   8 */
        // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)

        /* even part, multiplied with pmaddwd */
         C4,  C4,  C4,  C4,                             /*  16 */
         C4, -C4,  C4, -C4,                             /*  24 */

         C2,  C6,  C2,  C6,                             /*  32 */
         C6, -C2,  C6, -C2,                             /*  40 */

        /* odd part, multiplied with pmaddwd */
         C1,  C3,  C1,  C3,                             /*  48 */
         C5,  C7,  C5,  C7,                             /*  56 */

         C3, -C7,  C3, -C7,                             /*  64 */
        -C1, -C5, -C1, -C5,                             /*  72 */

         C5, -C1,  C5, -C1,                             /*  80 */
         C7,  C3,  C7,  C3,                             /*  88 */

         C7, -C5,  C7, -C5,                             /*  96 */
         C3, -C1,  C3, -C1                              /* 104 */
};
81

    
82
#if 0
83
static void unused_var_killer(){
84
        int a= wm1010 + d40000;
85
        temp[0]=a;
86
}
87

88
static void inline idctCol (int16_t * col, int16_t *input)
89
{
90
#undef C0
91
#undef C1
92
#undef C2
93
#undef C3
94
#undef C4
95
#undef C5
96
#undef C6
97
#undef C7
98
        int a0, a1, a2, a3, b0, b1, b2, b3;
99
        const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
100
        const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
101
        const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
102
        const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
103
        const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
104
        const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
105
        const int C6 = 8867;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
106
        const int C7 = 4520;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
107
/*
108
        if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
109
                col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
110
                        col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
111
                return;
112
        }*/
113

114
col[8*0] = input[8*0 + 0];
115
col[8*1] = input[8*2 + 0];
116
col[8*2] = input[8*0 + 1];
117
col[8*3] = input[8*2 + 1];
118
col[8*4] = input[8*4 + 0];
119
col[8*5] = input[8*6 + 0];
120
col[8*6] = input[8*4 + 1];
121
col[8*7] = input[8*6 + 1];
122

123
        a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
124
        a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
125
        a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
126
        a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
127

128
        b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
129
        b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
130
        b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
131
        b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
132

133
        col[8*0] = (a0 + b0) >> COL_SHIFT;
134
        col[8*1] = (a1 + b1) >> COL_SHIFT;
135
        col[8*2] = (a2 + b2) >> COL_SHIFT;
136
        col[8*3] = (a3 + b3) >> COL_SHIFT;
137
        col[8*4] = (a3 - b3) >> COL_SHIFT;
138
        col[8*5] = (a2 - b2) >> COL_SHIFT;
139
        col[8*6] = (a1 - b1) >> COL_SHIFT;
140
        col[8*7] = (a0 - b0) >> COL_SHIFT;
141
}
142

143
static void inline idctRow (int16_t * output, int16_t * input)
144
{
145
        int16_t row[8];
146

147
        int a0, a1, a2, a3, b0, b1, b2, b3;
148
        const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
149
        const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
150
        const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
151
        const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
152
        const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
153
        const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
154
        const int C6 = 8867;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
155
        const int C7 = 4520;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
156

157
row[0] = input[0];
158
row[2] = input[1];
159
row[4] = input[4];
160
row[6] = input[5];
161
row[1] = input[8];
162
row[3] = input[9];
163
row[5] = input[12];
164
row[7] = input[13];
165

166
        if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
167
                row[0] = row[1] = row[2] = row[3] = row[4] =
168
                        row[5] = row[6] = row[7] = row[0]<<3;
169
        output[0]  = row[0];
170
        output[2]  = row[1];
171
        output[4]  = row[2];
172
        output[6]  = row[3];
173
        output[8]  = row[4];
174
        output[10] = row[5];
175
        output[12] = row[6];
176
        output[14] = row[7];
177
                return;
178
        }
179

180
        a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
181
        a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
182
        a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
183
        a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
184

185
        b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
186
        b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
187
        b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
188
        b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
189

190
        row[0] = (a0 + b0) >> ROW_SHIFT;
191
        row[1] = (a1 + b1) >> ROW_SHIFT;
192
        row[2] = (a2 + b2) >> ROW_SHIFT;
193
        row[3] = (a3 + b3) >> ROW_SHIFT;
194
        row[4] = (a3 - b3) >> ROW_SHIFT;
195
        row[5] = (a2 - b2) >> ROW_SHIFT;
196
        row[6] = (a1 - b1) >> ROW_SHIFT;
197
        row[7] = (a0 - b0) >> ROW_SHIFT;
198

199
        output[0]  = row[0];
200
        output[2]  = row[1];
201
        output[4]  = row[2];
202
        output[6]  = row[3];
203
        output[8]  = row[4];
204
        output[10] = row[5];
205
        output[12] = row[6];
206
        output[14] = row[7];
207
}
208
#endif
209

    
210
static inline void idct(int16_t *block)
211
{
212
        DECLARE_ALIGNED(8, int64_t, align_tmp[16]);
213
        int16_t * const temp= (int16_t*)align_tmp;
214

    
215
        asm volatile(
216
#if 0 //Alternative, simpler variant
217

218
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
219
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
220
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
221
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
222
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
223
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
224
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
225
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
226
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
227
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
228
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
229
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
230
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
231
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
232
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
233
        #rounder ", %%mm4               \n\t"\
234
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
235
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
236
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
237
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
238
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
239
        #rounder ", %%mm0               \n\t"\
240
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
241
        "paddd %%mm0, %%mm0             \n\t" \
242
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
243
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
244
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
245
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
246
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
247
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
248
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
249
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
250
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
251
        "psrad $" #shift ", %%mm7       \n\t"\
252
        "psrad $" #shift ", %%mm4       \n\t"\
253
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
254
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
255
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
256
        "psrad $" #shift ", %%mm1       \n\t"\
257
        "psrad $" #shift ", %%mm2       \n\t"\
258
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
259
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
260
        "movq %%mm7, " #dst "           \n\t"\
261
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
262
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
263
        "movq %%mm2, 24+" #dst "        \n\t"\
264
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
265
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
266
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
267
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
268
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
269
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
270
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
271
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
272
        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
273
        "psrad $" #shift ", %%mm2       \n\t"\
274
        "psrad $" #shift ", %%mm0       \n\t"\
275
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
276
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
277
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
278
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
279
        "psrad $" #shift ", %%mm6       \n\t"\
280
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
281
        "movq %%mm2, 8+" #dst "         \n\t"\
282
        "psrad $" #shift ", %%mm4       \n\t"\
283
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
284
        "movq %%mm4, 16+" #dst "        \n\t"\
285

286
#define COL_IDCT(src0, src4, src1, src5, dst, shift) \
287
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
288
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
289
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
290
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
291
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
292
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
293
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
294
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
295
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
296
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
297
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
298
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
299
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
300
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
301
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
302
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
303
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
304
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
305
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
306
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
307
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
308
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
309
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
310
        "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
311
        "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
312
        "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
313
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
314
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
315
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
316
        "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
317
        "psrad $" #shift ", %%mm7       \n\t"\
318
        "psrad $" #shift ", %%mm4       \n\t"\
319
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
320
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
321
        "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
322
        "psrad $" #shift ", %%mm0       \n\t"\
323
        "psrad $" #shift ", %%mm2       \n\t"\
324
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
325
        "movd %%mm7, " #dst "           \n\t"\
326
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
327
        "movd %%mm0, 16+" #dst "        \n\t"\
328
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
329
        "movd %%mm2, 96+" #dst "        \n\t"\
330
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
331
        "movd %%mm4, 112+" #dst "       \n\t"\
332
        "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
333
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
334
        "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
335
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
336
        "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
337
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
338
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
339
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
340
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
341
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
342
        "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
343
        "psrad $" #shift ", %%mm2       \n\t"\
344
        "psrad $" #shift ", %%mm5       \n\t"\
345
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
346
        "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
347
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
348
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
349
        "psrad $" #shift ", %%mm6       \n\t"\
350
        "psrad $" #shift ", %%mm4       \n\t"\
351
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
352
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
353
        "movd %%mm2, 32+" #dst "        \n\t"\
354
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
355
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
356
        "movd %%mm6, 48+" #dst "        \n\t"\
357
        "movd %%mm4, 64+" #dst "        \n\t"\
358
        "movd %%mm5, 80+" #dst "        \n\t"\
359

360

361
#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
362
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
363
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
364
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
365
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
366
        "movq "MANGLE(wm1010)", %%mm4   \n\t"\
367
        "pand %%mm0, %%mm4              \n\t"\
368
        "por %%mm1, %%mm4               \n\t"\
369
        "por %%mm2, %%mm4               \n\t"\
370
        "por %%mm3, %%mm4               \n\t"\
371
        "packssdw %%mm4,%%mm4           \n\t"\
372
        "movd %%mm4, %%eax              \n\t"\
373
        "orl %%eax, %%eax               \n\t"\
374
        "jz 1f                          \n\t"\
375
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
376
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
377
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
378
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
379
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
380
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
381
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
382
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
383
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
384
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
385
        #rounder ", %%mm4               \n\t"\
386
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
387
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
388
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
389
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
390
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
391
        #rounder ", %%mm0               \n\t"\
392
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
393
        "paddd %%mm0, %%mm0             \n\t" \
394
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
395
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
396
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
397
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
398
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
399
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
400
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
401
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
402
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
403
        "psrad $" #shift ", %%mm7       \n\t"\
404
        "psrad $" #shift ", %%mm4       \n\t"\
405
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
406
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
407
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
408
        "psrad $" #shift ", %%mm1       \n\t"\
409
        "psrad $" #shift ", %%mm2       \n\t"\
410
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
411
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
412
        "movq %%mm7, " #dst "           \n\t"\
413
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
414
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
415
        "movq %%mm2, 24+" #dst "        \n\t"\
416
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
417
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
418
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
419
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
420
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
421
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
422
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
423
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
424
        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
425
        "psrad $" #shift ", %%mm2       \n\t"\
426
        "psrad $" #shift ", %%mm0       \n\t"\
427
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
428
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
429
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
430
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
431
        "psrad $" #shift ", %%mm6       \n\t"\
432
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
433
        "movq %%mm2, 8+" #dst "         \n\t"\
434
        "psrad $" #shift ", %%mm4       \n\t"\
435
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
436
        "movq %%mm4, 16+" #dst "        \n\t"\
437
        "jmp 2f                         \n\t"\
438
        "1:                             \n\t"\
439
        "pslld $16, %%mm0               \n\t"\
440
        "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
441
        "psrad $13, %%mm0               \n\t"\
442
        "packssdw %%mm0, %%mm0          \n\t"\
443
        "movq %%mm0, " #dst "           \n\t"\
444
        "movq %%mm0, 8+" #dst "         \n\t"\
445
        "movq %%mm0, 16+" #dst "        \n\t"\
446
        "movq %%mm0, 24+" #dst "        \n\t"\
447
        "2:                             \n\t"
448

449

450
//IDCT(      src0,   src4,   src1,   src5,    dst,    rounder, shift)
451
ROW_IDCT(    (%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
452
/*ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
453
ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
454
ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
455

456
DC_COND_ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
457
DC_COND_ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
458
DC_COND_ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
459

460

461
//IDCT(      src0,   src4,   src1,    src5,    dst, shift)
462
COL_IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
463
COL_IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
464
COL_IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
465
COL_IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
466

467
#else
468

    
469
#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
470
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
471
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
472
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
473
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
474
        "movq "MANGLE(wm1010)", %%mm4   \n\t"\
475
        "pand %%mm0, %%mm4              \n\t"\
476
        "por %%mm1, %%mm4               \n\t"\
477
        "por %%mm2, %%mm4               \n\t"\
478
        "por %%mm3, %%mm4               \n\t"\
479
        "packssdw %%mm4,%%mm4           \n\t"\
480
        "movd %%mm4, %%eax              \n\t"\
481
        "orl %%eax, %%eax               \n\t"\
482
        "jz 1f                          \n\t"\
483
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
484
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
485
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
486
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
487
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
488
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
489
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
490
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
491
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
492
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
493
        #rounder ", %%mm4               \n\t"\
494
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
495
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
496
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
497
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
498
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
499
        #rounder ", %%mm0               \n\t"\
500
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
501
        "paddd %%mm0, %%mm0             \n\t" \
502
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
503
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
504
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
505
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
506
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
507
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
508
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
509
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
510
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
511
        "psrad $" #shift ", %%mm7       \n\t"\
512
        "psrad $" #shift ", %%mm4       \n\t"\
513
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
514
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
515
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
516
        "psrad $" #shift ", %%mm1       \n\t"\
517
        "psrad $" #shift ", %%mm2       \n\t"\
518
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
519
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
520
        "movq %%mm7, " #dst "           \n\t"\
521
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
522
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
523
        "movq %%mm2, 24+" #dst "        \n\t"\
524
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
525
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
526
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
527
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
528
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
529
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
530
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
531
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
532
        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
533
        "psrad $" #shift ", %%mm2       \n\t"\
534
        "psrad $" #shift ", %%mm0       \n\t"\
535
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
536
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
537
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
538
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
539
        "psrad $" #shift ", %%mm6       \n\t"\
540
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
541
        "movq %%mm2, 8+" #dst "         \n\t"\
542
        "psrad $" #shift ", %%mm4       \n\t"\
543
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
544
        "movq %%mm4, 16+" #dst "        \n\t"\
545
        "jmp 2f                         \n\t"\
546
        "1:                             \n\t"\
547
        "pslld $16, %%mm0               \n\t"\
548
        "paddd "MANGLE(d40000)", %%mm0  \n\t"\
549
        "psrad $13, %%mm0               \n\t"\
550
        "packssdw %%mm0, %%mm0          \n\t"\
551
        "movq %%mm0, " #dst "           \n\t"\
552
        "movq %%mm0, 8+" #dst "         \n\t"\
553
        "movq %%mm0, 16+" #dst "        \n\t"\
554
        "movq %%mm0, 24+" #dst "        \n\t"\
555
        "2:                             \n\t"
556

    
557
/*
 * Z_COND_IDCT: one 1-D IDCT pass with an "all inputs are zero" shortcut.
 *
 * src0, src4, src1, src5  memory operands; one quadword is loaded from each
 *                         into mm0, mm1, mm2 and mm3 respectively.
 * dst                     memory operand; results are stored as four
 *                         quadwords at dst, 8+dst, 16+dst and 24+dst.
 * rounder                 opcode plus source operand (e.g. paddd (%2)); the
 *                         macro appends the destination register and applies
 *                         it to mm4 and to mm0 before the butterflies.
 * shift                   immediate used for every arithmetic right shift.
 * bt                      local label jumped to when the inputs are zero.
 *
 * The four inputs are ORed together and narrowed with packssdw; signed
 * saturation never turns a non-zero dword into zero, so testing the low
 * 32 bits in %eax is enough to detect that all 256 input bits are clear.
 * In that case nothing is written to dst. %eax is overwritten either way.
 *
 * Coefficients are read from the table addressed through operand %2 at
 * offsets 16 to 104; the per-line comments name each entry.
 */
#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
558
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
559
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
560
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
561
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
562
        "movq %%mm0, %%mm4              \n\t"\
563
        "por %%mm1, %%mm4               \n\t"\
564
        "por %%mm2, %%mm4               \n\t"\
565
        "por %%mm3, %%mm4               \n\t"\
566
        "packssdw %%mm4,%%mm4           \n\t"\
567
        "movd %%mm4, %%eax              \n\t"\
568
        "orl %%eax, %%eax               \n\t"\
569
        "jz " #bt "                     \n\t"\
570
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
571
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
572
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
573
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
574
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
575
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
576
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
577
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
578
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
579
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
580
        #rounder ", %%mm4               \n\t"\
581
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
582
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
583
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
584
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
585
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
586
        #rounder ", %%mm0               \n\t"\
587
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
588
        "paddd %%mm0, %%mm0             \n\t" \
589
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
590
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
591
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
592
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
593
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
594
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
595
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
596
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
597
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
598
        "psrad $" #shift ", %%mm7       \n\t"\
599
        "psrad $" #shift ", %%mm4       \n\t"\
600
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
601
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
602
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
603
        "psrad $" #shift ", %%mm1       \n\t"\
604
        "psrad $" #shift ", %%mm2       \n\t"\
605
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
606
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
607
        "movq %%mm7, " #dst "           \n\t"\
608
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
609
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
610
        "movq %%mm2, 24+" #dst "        \n\t"\
611
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
612
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
613
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
614
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
615
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
616
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
617
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
618
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
619
        "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
620
        "psrad $" #shift ", %%mm2       \n\t"\
621
        "psrad $" #shift ", %%mm0       \n\t"\
622
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
623
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
624
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
625
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
626
        "psrad $" #shift ", %%mm6       \n\t"\
627
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
628
        "movq %%mm2, 8+" #dst "         \n\t"\
629
        "psrad $" #shift ", %%mm4       \n\t"\
630
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
631
        "movq %%mm4, 16+" #dst "        \n\t"\
632

    
633
/*
 * ROW_IDCT: unconditional 1-D IDCT pass.
 *
 * Same instruction sequence as Z_COND_IDCT, minus the zero test and the
 * branch: the four quadwords src0, src4, src1, src5 are loaded into
 * mm0..mm3, multiplied against the coefficient table addressed through
 * operand %2 (offsets 16 to 104), `rounder` is applied to mm4 and mm0,
 * every sum/difference is shifted right by `shift`, and the saturated
 * 16-bit results are stored as four quadwords at dst, 8+dst, 16+dst and
 * 24+dst.
 *
 * rounder is an opcode plus source operand; the macro appends the
 * destination register.
 */
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
634
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
635
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
636
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
637
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
638
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
639
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
640
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
641
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
642
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
643
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
644
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
645
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
646
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
647
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
648
        #rounder ", %%mm4               \n\t"\
649
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
650
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
651
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
652
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
653
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
654
        #rounder ", %%mm0               \n\t"\
655
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
656
        "paddd %%mm0, %%mm0             \n\t" \
657
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
658
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
659
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
660
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
661
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
662
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
663
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
664
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
665
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
666
        "psrad $" #shift ", %%mm7       \n\t"\
667
        "psrad $" #shift ", %%mm4       \n\t"\
668
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
669
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
670
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
671
        "psrad $" #shift ", %%mm1       \n\t"\
672
        "psrad $" #shift ", %%mm2       \n\t"\
673
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
674
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
675
        "movq %%mm7, " #dst "           \n\t"\
676
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
677
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
678
        "movq %%mm2, 24+" #dst "        \n\t"\
679
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
680
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
681
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
682
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
683
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
684
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
685
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
686
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
687
        "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
688
        "psrad $" #shift ", %%mm2       \n\t"\
689
        "psrad $" #shift ", %%mm0       \n\t"\
690
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
691
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
692
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
693
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
694
        "psrad $" #shift ", %%mm6       \n\t"\
695
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
696
        "movq %%mm2, 8+" #dst "         \n\t"\
697
        "psrad $" #shift ", %%mm4       \n\t"\
698
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
699
        "movq %%mm4, 16+" #dst "        \n\t"\
700

    
701
// First pass: every invocation reads four quadwords through operand %0 and
// stores four quadwords through operand %1, shifting by 11 (the value of
// ROW_SHIFT). The first group uses the rounder at 8(%2), the others (%2).
// Each Z_COND_IDCT takes one extra trailing argument, the label to jump to
// when its four input quadwords are all zero: 4, 2 and 1 (all forward).
//IDCT(         src0,   src4,   src1,   src5,    dst,   rounder, shift)
702
DC_COND_IDCT(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
703
Z_COND_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
704
Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
705
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
706

    
707
/*
 * IDCT, full variant: all four inputs src0, src4, src1 and src5 are read.
 *
 * It is expanded on the fall-through path, i.e. when none of the three
 * Z_COND_IDCT invocations preceding this definition took its zero branch.
 *
 * No rounding constant is added in this macro. Every sum/difference is
 * shifted right by `shift`, narrowed with packssdw (signed saturation) and
 * stored with movd, so each store writes 32 bits holding two 16-bit results.
 * Store offsets relative to dst:
 *     0: A0+B0    16: A1+B1    32: A2+B2    48: A3+B3
 *    64: A3-B3    80: A2-B2    96: A1-B1   112: A0-B0
 *
 * Coefficients come from the table addressed through operand %2.
 */
#undef IDCT
708
#define IDCT(src0, src4, src1, src5, dst, shift) \
709
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
710
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
711
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
712
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
713
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
714
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
715
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
716
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
717
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
718
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
719
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
720
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
721
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
722
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
723
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
724
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
725
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
726
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
727
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
728
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
729
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
730
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
731
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
732
        "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
733
        "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
734
        "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
735
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
736
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
737
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
738
        "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
739
        "psrad $" #shift ", %%mm7       \n\t"\
740
        "psrad $" #shift ", %%mm4       \n\t"\
741
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
742
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
743
        "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
744
        "psrad $" #shift ", %%mm0       \n\t"\
745
        "psrad $" #shift ", %%mm2       \n\t"\
746
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
747
        "movd %%mm7, " #dst "           \n\t"\
748
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
749
        "movd %%mm0, 16+" #dst "        \n\t"\
750
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
751
        "movd %%mm2, 96+" #dst "        \n\t"\
752
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
753
        "movd %%mm4, 112+" #dst "       \n\t"\
754
        "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
755
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
756
        "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
757
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
758
        "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
759
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
760
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
761
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
762
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
763
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
764
        "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
765
        "psrad $" #shift ", %%mm2       \n\t"\
766
        "psrad $" #shift ", %%mm5       \n\t"\
767
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
768
        "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
769
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
770
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
771
        "psrad $" #shift ", %%mm6       \n\t"\
772
        "psrad $" #shift ", %%mm4       \n\t"\
773
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
774
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
775
        "movd %%mm2, 32+" #dst "        \n\t"\
776
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
777
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
778
        "movd %%mm6, 48+" #dst "        \n\t"\
779
        "movd %%mm4, 64+" #dst "        \n\t"\
780
        "movd %%mm5, 80+" #dst "        \n\t"
781

    
782

    
783
// Second pass for the fall-through case: reads through operand %1, writes
// through operand %0, shift 20 (the value of COL_SHIFT). The four
// invocations step the sources by 8 bytes and the destination by 4 bytes.
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
784
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
785
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
786
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
787
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
788
        /* Done with this case; label 9 is defined further down the file. */
        "jmp 9f                         \n\t"
789

    
790
        /* NOTE(review): the leading "#" string is pasted in front of the
         * ASMALIGN expansion, which presumably turns the alignment directive
         * into an assembler comment -- confirm against ASMALIGN's definition. */
        "#" ASMALIGN(4)                      \
791
        /* Label 4: target of the Z_COND_IDCT on 32(%0)..56(%0), taken when
         * those inputs were all zero. The two remaining input groups are
         * still tested, branching on to labels 6 and 5 respectively. */
        "4:                             \n\t"
792
Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
793
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
794

    
795
/*
 * IDCT, variant that never reads src1.
 *
 * Expanded after label 4, on the path where the Z_COND_IDCT that would have
 * written 32(%1)..56(%1) was skipped because its inputs were all zero. The
 * src1 parameter is kept so the invocation lines stay identical, but only
 * src0, src4 and src5 are loaded; mm2 is used purely as scratch. The odd
 * terms B0..B3 are therefore each a single pmaddwd of mm3 (src5) with the
 * table entries at 56, 72, 88 and 104 of operand %2.
 *
 * Shift, saturation and the eight 32-bit stores at dst+0 .. dst+112 follow
 * the same layout as the full variant:
 *     0: A0+B0    16: A1+B1    32: A2+B2    48: A3+B3
 *    64: A3-B3    80: A2-B2    96: A1-B1   112: A0-B0
 */
#undef IDCT
796
#define IDCT(src0, src4, src1, src5, dst, shift) \
797
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
798
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
799
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
800
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
801
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
802
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
803
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
804
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
805
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
806
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
807
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
808
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
809
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
810
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
811
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
812
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
813
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
814
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
815
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
816
        "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
817
        "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
818
        "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
819
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
820
        "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
821
        "psrad $" #shift ", %%mm1       \n\t"\
822
        "psrad $" #shift ", %%mm4       \n\t"\
823
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
824
        "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
825
        "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
826
        "psrad $" #shift ", %%mm0       \n\t"\
827
        "psrad $" #shift ", %%mm2       \n\t"\
828
        "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
829
        "movd %%mm1, " #dst "           \n\t"\
830
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
831
        "movd %%mm0, 16+" #dst "        \n\t"\
832
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
833
        "movd %%mm2, 96+" #dst "        \n\t"\
834
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
835
        "movd %%mm4, 112+" #dst "       \n\t"\
836
        "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
837
        "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
838
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
839
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
840
        "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
841
        "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
842
        "psrad $" #shift ", %%mm2       \n\t"\
843
        "psrad $" #shift ", %%mm5       \n\t"\
844
        "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
845
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
846
        "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
847
        "psrad $" #shift ", %%mm6       \n\t"\
848
        "psrad $" #shift ", %%mm1       \n\t"\
849
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
850
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
851
        "movd %%mm2, 32+" #dst "        \n\t"\
852
        "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
853
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
854
        "movd %%mm6, 48+" #dst "        \n\t"\
855
        "movd %%mm1, 64+" #dst "        \n\t"\
856
        "movd %%mm5, 80+" #dst "        \n\t"
857

    
858
// Second pass for the label-4 path that fell through both zero tests: uses
// the IDCT variant defined just above, which does not read its src1
// argument (32(%1)..56(%1)). Reads through %1, writes through %0, shift 20.
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
859
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
860
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
861
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
862
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
863
        /* Done with this case; label 9 is defined further down the file. */
        "jmp 9f                         \n\t"
864

    
865
        /* NOTE(review): the leading "#" string is pasted in front of the
         * ASMALIGN expansion, which presumably turns the alignment directive
         * into an assembler comment -- confirm against ASMALIGN's definition. */
        "#" ASMALIGN(4)                      \
866
        /* Label 6: target of the Z_COND_IDCT on 64(%0)..88(%0) issued after
         * label 4, taken when those inputs were all zero. Only the last
         * input group is left to test; it branches on to label 7. */
        "6:                             \n\t"
867
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
868

    
869
/*
 * IDCT, variant that reads only src0 and src5.
 *
 * Expanded after label 6, on the path where the Z_COND_IDCT invocations for
 * both 32(%0).. and 64(%0).. took their zero branch. src4 and src1 are
 * accepted but never loaded. The even part is just the two pmaddwd products
 * of src0 with the table entries at 16(%2) and 24(%2): mm4 is copied to mm6
 * and mm0 to mm5 where the other variants add/subtract a src4 term. The odd
 * terms B0..B3 are single pmaddwd products of mm3 (src5).
 *
 * Shift, saturation and the eight 32-bit stores at dst+0 .. dst+112 follow
 * the same layout as the full variant:
 *     0: A0+B0    16: A1+B1    32: A2+B2    48: A3+B3
 *    64: A3-B3    80: A2-B2    96: A1-B1   112: A0-B0
 */
#undef IDCT
870
#define IDCT(src0, src4, src1, src5, dst, shift) \
871
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
872
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
873
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
874
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
875
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
876
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
877
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
878
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
879
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
880
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
881
        "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
882
        "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
883
        "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
884
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
885
        "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
886
        "psrad $" #shift ", %%mm1       \n\t"\
887
        "psrad $" #shift ", %%mm4       \n\t"\
888
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
889
        "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
890
        "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
891
        "psrad $" #shift ", %%mm0       \n\t"\
892
        "psrad $" #shift ", %%mm2       \n\t"\
893
        "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
894
        "movd %%mm1, " #dst "           \n\t"\
895
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
896
        "movd %%mm0, 16+" #dst "        \n\t"\
897
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
898
        "movd %%mm2, 96+" #dst "        \n\t"\
899
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
900
        "movd %%mm4, 112+" #dst "       \n\t"\
901
        "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
902
        "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
903
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
904
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
905
        "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
906
        "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
907
        "psrad $" #shift ", %%mm2       \n\t"\
908
        "psrad $" #shift ", %%mm5       \n\t"\
909
        "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
910
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
911
        "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
912
        "psrad $" #shift ", %%mm6       \n\t"\
913
        "psrad $" #shift ", %%mm1       \n\t"\
914
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
915
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
916
        "movd %%mm2, 32+" #dst "        \n\t"\
917
        "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
918
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
919
        "movd %%mm6, 48+" #dst "        \n\t"\
920
        "movd %%mm1, 64+" #dst "        \n\t"\
921
        "movd %%mm5, 80+" #dst "        \n\t"
922

    
923

    
924
// Second pass for the label-6 path that fell through its zero test: uses the
// IDCT variant defined just above, which reads only its src0 and src5
// arguments. Reads through %1, writes through %0, shift 20.
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
925
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
926
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
927
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
928
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
929
        /* Done with this case; label 9 is defined further down the file. */
        "jmp 9f                         \n\t"
930

    
931
        /* NOTE(review): the leading "#" string is pasted in front of the
         * ASMALIGN expansion, which presumably turns the alignment directive
         * into an assembler comment -- confirm against ASMALIGN's definition. */
        "#" ASMALIGN(4)                      \
932
        /* Label 2: target of the first-pass Z_COND_IDCT on 64(%0)..88(%0)
         * that was issued with 2f, taken when those inputs were all zero.
         * Only the last input group is left to test; it branches on to
         * label 3. */
        "2:                             \n\t"
933
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
934

    
935
/*
 * IDCT, variant that never reads src4.
 *
 * Expanded after label 2, on the path where the Z_COND_IDCT that would have
 * written 64(%1)..88(%1) was skipped because its inputs were all zero. src4
 * is accepted but never loaded; src0, src1 and src5 are. The even part is
 * just the two pmaddwd products of src0 with the table entries at 16(%2)
 * and 24(%2) (mm4 copied to mm6, mm0 copied to mm5), while the odd terms
 * B0..B3 are computed from both src1 and src5 as in the full variant.
 *
 * Shift, saturation and the eight 32-bit stores at dst+0 .. dst+112 follow
 * the same layout as the full variant:
 *     0: A0+B0    16: A1+B1    32: A2+B2    48: A3+B3
 *    64: A3-B3    80: A2-B2    96: A1-B1   112: A0-B0
 */
#undef IDCT
936
#define IDCT(src0, src4, src1, src5, dst, shift) \
937
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
938
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
939
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
940
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
941
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
942
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
943
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
944
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
945
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
946
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
947
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
948
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
949
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
950
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
951
        "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
952
        "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
953
        "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
954
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
955
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
956
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
957
        "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
958
        "psrad $" #shift ", %%mm7       \n\t"\
959
        "psrad $" #shift ", %%mm4       \n\t"\
960
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
961
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
962
        "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
963
        "psrad $" #shift ", %%mm0       \n\t"\
964
        "psrad $" #shift ", %%mm2       \n\t"\
965
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
966
        "movd %%mm7, " #dst "           \n\t"\
967
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
968
        "movd %%mm0, 16+" #dst "        \n\t"\
969
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
970
        "movd %%mm2, 96+" #dst "        \n\t"\
971
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
972
        "movd %%mm4, 112+" #dst "       \n\t"\
973
        "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
974
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
975
        "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
976
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
977
        "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
978
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
979
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
980
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
981
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
982
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
983
        "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
984
        "psrad $" #shift ", %%mm2       \n\t"\
985
        "psrad $" #shift ", %%mm5       \n\t"\
986
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
987
        "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
988
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
989
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
990
        "psrad $" #shift ", %%mm6       \n\t"\
991
        "psrad $" #shift ", %%mm4       \n\t"\
992
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
993
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
994
        "movd %%mm2, 32+" #dst "        \n\t"\
995
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
996
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
997
        "movd %%mm6, 48+" #dst "        \n\t"\
998
        "movd %%mm4, 64+" #dst "        \n\t"\
999
        "movd %%mm5, 80+" #dst "        \n\t"
1000

    
1001
// Second pass for the label-2 path that fell through its zero test: uses the
// IDCT variant defined just above, which does not read its src4 argument
// (64(%1)..88(%1)). Reads through %1, writes through %0, shift 20.
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1002
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1003
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1004
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1005
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1006
        /* Done with this case; label 9 is defined further down the file. */
        "jmp 9f                         \n\t"
1007

    
1008
        /* NOTE(review): the leading "#" string is pasted in front of the
         * ASMALIGN expansion, which presumably turns the alignment directive
         * into an assembler comment -- confirm against ASMALIGN's definition. */
        "#" ASMALIGN(4)                      \
1009
        /* Label 3: target of the Z_COND_IDCT on 96(%0)..120(%0) issued after
         * label 2, taken when those inputs were all zero. */
        "3:                             \n\t"
1010
#undef IDCT
1011
#define IDCT(src0, src4, src1, src5, dst, shift) \
1012
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1013
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
1014
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1015
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1016
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1017
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1018
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1019
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
1020
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1021
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1022
        "movq 64(%2), %%mm3             \n\t"\
1023
        "pmaddwd %%mm2, %%mm3           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1024
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
1025
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
1026
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
1027
        "psrad $" #shift ", %%mm7       \n\t"\
1028
        "psrad $" #shift ", %%mm4       \n\t"\
1029
        "movq %%mm0, %%mm1              \n\t" /* A1             a1 */\
1030
        "paddd %%mm3, %%mm0             \n\t" /* A1+B1          a1+b1 */\
1031
        "psubd %%mm3, %%mm1             \n\t" /* A1-B1          a1-b1 */\
1032
        "psrad $" #shift ", %%mm0       \n\t"\
1033
        "psrad $" #shift ", %%mm1       \n\t"\
1034
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
1035
        "movd %%mm7, " #dst "           \n\t"\
1036
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
1037
        "movd %%mm0, 16+" #dst "        \n\t"\
1038
        "packssdw %%mm1, %%mm1          \n\t" /* A1-B1  a1-b1 */\
1039
        "movd %%mm1, 96+" #dst "        \n\t"\
1040
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
1041
        "movd %%mm4, 112+" #dst "       \n\t"\
1042
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
1043
        "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1044
        "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1045
        "movq %%mm5, %%mm1              \n\t" /* A2             a2 */\
1046
        "paddd %%mm4, %%mm1             \n\t" /* A2+B2          a2+b2 */\
1047
        "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
1048
        "psrad $" #shift ", %%mm1       \n\t"\
1049
        "psrad $" #shift ", %%mm5       \n\t"\
1050
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
1051
        "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
1052
        "psubd %%mm2, %%mm4             \n\t" /* a3-B3          a3-b3 */\
1053
        "psrad $" #shift ", %%mm6       \n\t"\
1054
        "psrad $" #shift ", %%mm4       \n\t"\
1055
        "packssdw %%mm1, %%mm1          \n\t" /* A2+B2  a2+b2 */\
1056
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1057
        "movd %%mm1, 32+" #dst "        \n\t"\
1058
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
1059
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1060
        "movd %%mm6, 48+" #dst "        \n\t"\
1061
        "movd %%mm4, 64+" #dst "        \n\t"\
1062
        "movd %%mm5, 80+" #dst "        \n\t"
1063

    
1064

    
1065
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1066
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1067
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1068
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1069
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1070
        "jmp 9f                         \n\t"
1071

    
1072
        "#" ASMALIGN(4)                      \
1073
        "5:                             \n\t"
1074
#undef IDCT
1075
#define IDCT(src0, src4, src1, src5, dst, shift) \
1076
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1077
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
1078
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1079
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1080
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1081
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1082
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
1083
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1084
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
1085
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1086
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1087
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
1088
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
1089
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1090
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
1091
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
1092
        "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
1093
        "movq 8+" #src4 ", %%mm3        \n\t" /* R6     R2      r6      r2 */\
1094
        "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
1095
        "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1096
        "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
1097
        "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1098
        "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 */\
1099
        "pmaddwd %%mm3, %%mm7           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1100
        "pmaddwd 40(%2), %%mm3          \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1101
        "paddd %%mm1, %%mm7             \n\t" /* A0             a0 */\
1102
        "paddd %%mm1, %%mm1             \n\t" /* 2C0            2c0 */\
1103
        "psubd %%mm7, %%mm1             \n\t" /* A3             a3 */\
1104
        "paddd %%mm2, %%mm3             \n\t" /* A1             a1 */\
1105
        "paddd %%mm2, %%mm2             \n\t" /* 2C1            2c1 */\
1106
        "psubd %%mm3, %%mm2             \n\t" /* A2             a2 */\
1107
        "psrad $" #shift ", %%mm4       \n\t"\
1108
        "psrad $" #shift ", %%mm7       \n\t"\
1109
        "psrad $" #shift ", %%mm3       \n\t"\
1110
        "packssdw %%mm7, %%mm4          \n\t" /* A0     a0 */\
1111
        "movq %%mm4, " #dst "           \n\t"\
1112
        "psrad $" #shift ", %%mm0       \n\t"\
1113
        "packssdw %%mm3, %%mm0          \n\t" /* A1     a1 */\
1114
        "movq %%mm0, 16+" #dst "        \n\t"\
1115
        "movq %%mm0, 96+" #dst "        \n\t"\
1116
        "movq %%mm4, 112+" #dst "       \n\t"\
1117
        "psrad $" #shift ", %%mm5       \n\t"\
1118
        "psrad $" #shift ", %%mm6       \n\t"\
1119
        "psrad $" #shift ", %%mm2       \n\t"\
1120
        "packssdw %%mm2, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1121
        "movq %%mm5, 32+" #dst "        \n\t"\
1122
        "psrad $" #shift ", %%mm1       \n\t"\
1123
        "packssdw %%mm1, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1124
        "movq %%mm6, 48+" #dst "        \n\t"\
1125
        "movq %%mm6, 64+" #dst "        \n\t"\
1126
        "movq %%mm5, 80+" #dst "        \n\t"
1127

    
1128

    
1129
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1130
IDCT(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1131
//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1132
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1133
//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1134
        "jmp 9f                         \n\t"
1135

    
1136

    
1137
        "#" ASMALIGN(4)                      \
1138
        "1:                             \n\t"
1139
#undef IDCT
1140
#define IDCT(src0, src4, src1, src5, dst, shift) \
1141
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1142
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
1143
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
1144
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1145
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1146
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1147
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1148
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
1149
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1150
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
1151
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1152
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1153
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
1154
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1155
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
1156
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
1157
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1158
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
1159
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
1160
        "movq 64(%2), %%mm1             \n\t"\
1161
        "pmaddwd %%mm2, %%mm1           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1162
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
1163
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
1164
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
1165
        "psrad $" #shift ", %%mm7       \n\t"\
1166
        "psrad $" #shift ", %%mm4       \n\t"\
1167
        "movq %%mm0, %%mm3              \n\t" /* A1             a1 */\
1168
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
1169
        "psubd %%mm1, %%mm3             \n\t" /* A1-B1          a1-b1 */\
1170
        "psrad $" #shift ", %%mm0       \n\t"\
1171
        "psrad $" #shift ", %%mm3       \n\t"\
1172
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
1173
        "movd %%mm7, " #dst "           \n\t"\
1174
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
1175
        "movd %%mm0, 16+" #dst "        \n\t"\
1176
        "packssdw %%mm3, %%mm3          \n\t" /* A1-B1  a1-b1 */\
1177
        "movd %%mm3, 96+" #dst "        \n\t"\
1178
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
1179
        "movd %%mm4, 112+" #dst "       \n\t"\
1180
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
1181
        "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1182
        "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1183
        "movq %%mm5, %%mm3              \n\t" /* A2             a2 */\
1184
        "paddd %%mm4, %%mm3             \n\t" /* A2+B2          a2+b2 */\
1185
        "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
1186
        "psrad $" #shift ", %%mm3       \n\t"\
1187
        "psrad $" #shift ", %%mm5       \n\t"\
1188
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
1189
        "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
1190
        "psubd %%mm2, %%mm4             \n\t" /* a3-B3          a3-b3 */\
1191
        "psrad $" #shift ", %%mm6       \n\t"\
1192
        "packssdw %%mm3, %%mm3          \n\t" /* A2+B2  a2+b2 */\
1193
        "movd %%mm3, 32+" #dst "        \n\t"\
1194
        "psrad $" #shift ", %%mm4       \n\t"\
1195
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1196
        "movd %%mm6, 48+" #dst "        \n\t"\
1197
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
1198
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1199
        "movd %%mm4, 64+" #dst "        \n\t"\
1200
        "movd %%mm5, 80+" #dst "        \n\t"
1201

    
1202

    
1203
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1204
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1205
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1206
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1207
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1208
        "jmp 9f                         \n\t"
1209

    
1210

    
1211
        "#" ASMALIGN(4)
1212
        "7:                             \n\t"
1213
#undef IDCT
1214
#define IDCT(src0, src4, src1, src5, dst, shift) \
1215
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1216
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1217
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1218
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1219
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1220
        "psrad $" #shift ", %%mm4       \n\t"\
1221
        "psrad $" #shift ", %%mm0       \n\t"\
1222
        "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
1223
        "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
1224
        "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1225
        "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
1226
        "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1227
        "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 */\
1228
        "psrad $" #shift ", %%mm1       \n\t"\
1229
        "packssdw %%mm1, %%mm4          \n\t" /* A0     a0 */\
1230
        "movq %%mm4, " #dst "           \n\t"\
1231
        "psrad $" #shift ", %%mm2       \n\t"\
1232
        "packssdw %%mm2, %%mm0          \n\t" /* A1     a1 */\
1233
        "movq %%mm0, 16+" #dst "        \n\t"\
1234
        "movq %%mm0, 96+" #dst "        \n\t"\
1235
        "movq %%mm4, 112+" #dst "       \n\t"\
1236
        "movq %%mm0, 32+" #dst "        \n\t"\
1237
        "movq %%mm4, 48+" #dst "        \n\t"\
1238
        "movq %%mm4, 64+" #dst "        \n\t"\
1239
        "movq %%mm0, 80+" #dst "        \n\t"
1240

    
1241
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1242
IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1243
//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1244
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1245
//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1246

    
1247

    
1248
#endif
1249

    
1250
/*
1251
Input
1252
 00 40 04 44 20 60 24 64
1253
 10 30 14 34 50 70 54 74
1254
 01 41 03 43 21 61 23 63
1255
 11 31 13 33 51 71 53 73
1256
 02 42 06 46 22 62 26 66
1257
 12 32 16 36 52 72 56 76
1258
 05 45 07 47 25 65 27 67
1259
 15 35 17 37 55 75 57 77
1260

1261
Temp
1262
 00 04 10 14 20 24 30 34
1263
 40 44 50 54 60 64 70 74
1264
 01 03 11 13 21 23 31 33
1265
 41 43 51 53 61 63 71 73
1266
 02 06 12 16 22 26 32 36
1267
 42 46 52 56 62 66 72 76
1268
 05 07 15 17 25 27 35 37
1269
 45 47 55 57 65 67 75 77
1270
*/
1271

    
1272
"9: \n\t"
1273
                :: "r" (block), "r" (temp), "r" (coeffs)
1274
                : "%eax"
1275
        );
1276
}
1277

    
1278
/**
 * Run the MMX simple IDCT on one coefficient block.
 *
 * Thin public entry point around this file's idct(). The last pass of
 * idct() stores its results through the same pointer it was given, so the
 * transform happens in place.
 *
 * @param block coefficients to transform; overwritten with the result.
 *              NOTE(review): presumably one 8x8 block (64 values) in the
 *              permuted order shown in the "Input" comment inside idct();
 *              confirm against the callers.
 */
void ff_simple_idct_mmx(int16_t *block)
{
    idct(block);
}
1282

    
1283
//FIXME merge add/put into the idct
1284

    
1285
/**
 * Inverse-transform a coefficient block and pass the result to
 * put_pixels_clamped_mmx().
 *
 * The block is first transformed in place by idct(); the transformed
 * block is then handed, together with dest and line_size, to
 * put_pixels_clamped_mmx().
 *
 * @param dest      destination pixel buffer, forwarded unchanged
 * @param line_size forwarded unchanged; presumably the byte stride between
 *                  rows of dest (TODO confirm)
 * @param block     coefficients to transform; overwritten by idct()
 */
void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    /* The transform must finish before the pixels are produced from it. */
    idct(block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
1290
/**
 * Inverse-transform a coefficient block and pass the result to
 * add_pixels_clamped_mmx().
 *
 * The block is first transformed in place by idct(); the transformed
 * block is then handed, together with dest and line_size, to
 * add_pixels_clamped_mmx().
 *
 * @param dest      destination pixel buffer, forwarded unchanged
 * @param line_size forwarded unchanged; presumably the byte stride between
 *                  rows of dest (TODO confirm)
 * @param block     coefficients to transform; overwritten by idct()
 */
void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    /* The transform must finish before its output is combined with dest. */
    idct(block);
    add_pixels_clamped_mmx(block, dest, line_size);
}