Statistics
| Branch: | Revision:

ffmpeg / libavcodec / i386 / simple_idct_mmx.c @ b550bfaa

History | View | Annotate | Download (71.1 KB)

1
/*
2
 * Simple IDCT MMX
3
 *
4
 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * This file is part of FFmpeg.
7
 *
8
 * FFmpeg is free software; you can redistribute it and/or
9
 * modify it under the terms of the GNU Lesser General Public
10
 * License as published by the Free Software Foundation; either
11
 * version 2.1 of the License, or (at your option) any later version.
12
 *
13
 * FFmpeg is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
 * Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public
19
 * License along with FFmpeg; if not, write to the Free Software
20
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
 */
22
#include "dsputil.h"
23
#include "simple_idct.h"
24

    
/*
 * Fixed-point cosine coefficients used by the IDCT.
 *
 * Cn approximates cos(n*M_PI/16) * sqrt(2) * (1<<14). The unrounded values
 * are:
 *
 *      n = 0   23170.475006
 *      n = 1   22725.260826
 *      n = 2   21406.727617
 *      n = 3   19265.545870
 *      n = 4   16384.000000
 *      n = 5   12872.826198
 *      n = 6    8866.956905
 *      n = 7    4520.335430
 *
 * Every constant is rounded to nearest (value + 0.5, truncated) except C4,
 * for which this file selects 16383 (value - 0.5) instead of the exact
 * 16384.
 */
#define C0 23170
#define C1 22725
#define C2 21407
#define C3 19266
#define C4 16383 /* exact value would be 16384; 16383 is what the code uses */
#define C5 12873
#define C6 8867
#define C7 4520

/*
 * Right-shift amounts of the two 1-D passes.
 *
 * ROW_SHIFT: shift applied after the row pass; 1<<(ROW_SHIFT-1) is the
 *            matching rounding term stored in coeffs[].
 * COL_SHIFT: shift applied after the column pass.
 *
 * NOTE(review): the trailing "6" is the original author's annotation;
 * presumably COL_SHIFT minus the 14 fraction bits of C0..C7 -- confirm.
 */
#define ROW_SHIFT 11
#define COL_SHIFT 20 // 6

51
static const uint64_t attribute_used __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL;
52
static const uint64_t attribute_used __attribute__((aligned(8))) d40000= 0x0000000000040000ULL;
53

    
54
static const int16_t __attribute__((aligned(8))) coeffs[]= {
55
        1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
56
//        1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
57
//        0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
58
        1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
59
        // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
60
//        0, 0, 0, 0,
61
//        0, 0, 0, 0,
62

    
63
 C4,  C4,  C4,  C4,
64
 C4, -C4,  C4, -C4,
65

    
66
 C2,  C6,  C2,  C6,
67
 C6, -C2,  C6, -C2,
68

    
69
 C1,  C3,  C1,  C3,
70
 C5,  C7,  C5,  C7,
71

    
72
 C3, -C7,  C3, -C7,
73
-C1, -C5, -C1, -C5,
74

    
75
 C5, -C1,  C5, -C1,
76
 C7,  C3,  C7,  C3,
77

    
78
 C7, -C5,  C7, -C5,
79
 C3, -C1,  C3, -C1
80
};
81

    
82
#if 0
83
static void unused_var_killer(){
84
        int a= wm1010 + d40000;
85
        temp[0]=a;
86
}
87

88
static void inline idctCol (int16_t * col, int16_t *input)
89
{
90
#undef C0
91
#undef C1
92
#undef C2
93
#undef C3
94
#undef C4
95
#undef C5
96
#undef C6
97
#undef C7
98
        int a0, a1, a2, a3, b0, b1, b2, b3;
99
        const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
100
        const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
101
        const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
102
        const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
103
        const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
104
        const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
105
        const int C6 = 8867;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
106
        const int C7 = 4520;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
107
/*
108
        if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
109
                col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
110
                        col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
111
                return;
112
        }*/
113

114
col[8*0] = input[8*0 + 0];
115
col[8*1] = input[8*2 + 0];
116
col[8*2] = input[8*0 + 1];
117
col[8*3] = input[8*2 + 1];
118
col[8*4] = input[8*4 + 0];
119
col[8*5] = input[8*6 + 0];
120
col[8*6] = input[8*4 + 1];
121
col[8*7] = input[8*6 + 1];
122

123
        a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
124
        a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
125
        a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
126
        a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
127

128
        b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
129
        b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
130
        b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
131
        b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
132

133
        col[8*0] = (a0 + b0) >> COL_SHIFT;
134
        col[8*1] = (a1 + b1) >> COL_SHIFT;
135
        col[8*2] = (a2 + b2) >> COL_SHIFT;
136
        col[8*3] = (a3 + b3) >> COL_SHIFT;
137
        col[8*4] = (a3 - b3) >> COL_SHIFT;
138
        col[8*5] = (a2 - b2) >> COL_SHIFT;
139
        col[8*6] = (a1 - b1) >> COL_SHIFT;
140
        col[8*7] = (a0 - b0) >> COL_SHIFT;
141
}
142

143
static void inline idctRow (int16_t * output, int16_t * input)
144
{
145
        int16_t row[8];
146

147
        int a0, a1, a2, a3, b0, b1, b2, b3;
148
        const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
149
        const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
150
        const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
151
        const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
152
        const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
153
        const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
154
        const int C6 = 8867;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
155
        const int C7 = 4520;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
156

157
row[0] = input[0];
158
row[2] = input[1];
159
row[4] = input[4];
160
row[6] = input[5];
161
row[1] = input[8];
162
row[3] = input[9];
163
row[5] = input[12];
164
row[7] = input[13];
165

166
        if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
167
                row[0] = row[1] = row[2] = row[3] = row[4] =
168
                        row[5] = row[6] = row[7] = row[0]<<3;
169
        output[0]  = row[0];
170
        output[2]  = row[1];
171
        output[4]  = row[2];
172
        output[6]  = row[3];
173
        output[8]  = row[4];
174
        output[10] = row[5];
175
        output[12] = row[6];
176
        output[14] = row[7];
177
                return;
178
        }
179

180
        a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
181
        a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
182
        a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
183
        a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
184

185
        b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
186
        b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
187
        b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
188
        b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
189

190
        row[0] = (a0 + b0) >> ROW_SHIFT;
191
        row[1] = (a1 + b1) >> ROW_SHIFT;
192
        row[2] = (a2 + b2) >> ROW_SHIFT;
193
        row[3] = (a3 + b3) >> ROW_SHIFT;
194
        row[4] = (a3 - b3) >> ROW_SHIFT;
195
        row[5] = (a2 - b2) >> ROW_SHIFT;
196
        row[6] = (a1 - b1) >> ROW_SHIFT;
197
        row[7] = (a0 - b0) >> ROW_SHIFT;
198

199
        output[0]  = row[0];
200
        output[2]  = row[1];
201
        output[4]  = row[2];
202
        output[6]  = row[3];
203
        output[8]  = row[4];
204
        output[10] = row[5];
205
        output[12] = row[6];
206
        output[14] = row[7];
207
}
208
#endif
209

    
210
static inline void idct(int16_t *block)
211
{
212
        int64_t __attribute__((aligned(8))) align_tmp[16];
213
        int16_t * const temp= (int16_t*)align_tmp;
214

    
215
        asm volatile(
216
#if 0 //Alternative, simpler variant
217

218
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
219
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
220
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
221
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
222
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
223
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
224
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
225
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
226
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
227
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
228
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
229
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
230
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
231
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
232
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
233
        #rounder ", %%mm4               \n\t"\
234
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
235
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
236
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
237
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
238
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
239
        #rounder ", %%mm0               \n\t"\
240
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
241
        "paddd %%mm0, %%mm0             \n\t" \
242
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
243
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
244
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
245
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
246
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
247
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
248
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
249
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
250
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
251
        "psrad $" #shift ", %%mm7       \n\t"\
252
        "psrad $" #shift ", %%mm4       \n\t"\
253
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
254
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
255
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
256
        "psrad $" #shift ", %%mm1       \n\t"\
257
        "psrad $" #shift ", %%mm2       \n\t"\
258
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
259
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
260
        "movq %%mm7, " #dst "           \n\t"\
261
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
262
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
263
        "movq %%mm2, 24+" #dst "        \n\t"\
264
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
265
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
266
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
267
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
268
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
269
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
270
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
271
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
272
        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
273
        "psrad $" #shift ", %%mm2       \n\t"\
274
        "psrad $" #shift ", %%mm0       \n\t"\
275
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
276
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
277
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
278
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
279
        "psrad $" #shift ", %%mm6       \n\t"\
280
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
281
        "movq %%mm2, 8+" #dst "         \n\t"\
282
        "psrad $" #shift ", %%mm4       \n\t"\
283
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
284
        "movq %%mm4, 16+" #dst "        \n\t"\
285

286
#define COL_IDCT(src0, src4, src1, src5, dst, shift) \
287
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
288
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
289
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
290
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
291
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
292
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
293
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
294
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
295
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
296
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
297
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
298
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
299
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
300
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
301
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
302
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
303
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
304
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
305
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
306
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
307
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
308
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
309
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
310
        "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
311
        "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
312
        "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
313
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
314
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
315
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
316
        "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
317
        "psrad $" #shift ", %%mm7       \n\t"\
318
        "psrad $" #shift ", %%mm4       \n\t"\
319
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
320
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
321
        "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
322
        "psrad $" #shift ", %%mm0       \n\t"\
323
        "psrad $" #shift ", %%mm2       \n\t"\
324
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
325
        "movd %%mm7, " #dst "           \n\t"\
326
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
327
        "movd %%mm0, 16+" #dst "        \n\t"\
328
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
329
        "movd %%mm2, 96+" #dst "        \n\t"\
330
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
331
        "movd %%mm4, 112+" #dst "       \n\t"\
332
        "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
333
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
334
        "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
335
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
336
        "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
337
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
338
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
339
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
340
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
341
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
342
        "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
343
        "psrad $" #shift ", %%mm2       \n\t"\
344
        "psrad $" #shift ", %%mm5       \n\t"\
345
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
346
        "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
347
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
348
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
349
        "psrad $" #shift ", %%mm6       \n\t"\
350
        "psrad $" #shift ", %%mm4       \n\t"\
351
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
352
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
353
        "movd %%mm2, 32+" #dst "        \n\t"\
354
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
355
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
356
        "movd %%mm6, 48+" #dst "        \n\t"\
357
        "movd %%mm4, 64+" #dst "        \n\t"\
358
        "movd %%mm5, 80+" #dst "        \n\t"\
359

360

361
#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
362
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
363
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
364
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
365
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
366
        "movq "MANGLE(wm1010)", %%mm4   \n\t"\
367
        "pand %%mm0, %%mm4              \n\t"\
368
        "por %%mm1, %%mm4               \n\t"\
369
        "por %%mm2, %%mm4               \n\t"\
370
        "por %%mm3, %%mm4               \n\t"\
371
        "packssdw %%mm4,%%mm4           \n\t"\
372
        "movd %%mm4, %%eax              \n\t"\
373
        "orl %%eax, %%eax               \n\t"\
374
        "jz 1f                          \n\t"\
375
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
376
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
377
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
378
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
379
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
380
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
381
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
382
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
383
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
384
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
385
        #rounder ", %%mm4               \n\t"\
386
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
387
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
388
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
389
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
390
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
391
        #rounder ", %%mm0               \n\t"\
392
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
393
        "paddd %%mm0, %%mm0             \n\t" \
394
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
395
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
396
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
397
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
398
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
399
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
400
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
401
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
402
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
403
        "psrad $" #shift ", %%mm7       \n\t"\
404
        "psrad $" #shift ", %%mm4       \n\t"\
405
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
406
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
407
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
408
        "psrad $" #shift ", %%mm1       \n\t"\
409
        "psrad $" #shift ", %%mm2       \n\t"\
410
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
411
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
412
        "movq %%mm7, " #dst "           \n\t"\
413
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
414
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
415
        "movq %%mm2, 24+" #dst "        \n\t"\
416
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
417
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
418
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
419
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
420
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
421
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
422
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
423
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
424
        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
425
        "psrad $" #shift ", %%mm2       \n\t"\
426
        "psrad $" #shift ", %%mm0       \n\t"\
427
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
428
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
429
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
430
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
431
        "psrad $" #shift ", %%mm6       \n\t"\
432
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
433
        "movq %%mm2, 8+" #dst "         \n\t"\
434
        "psrad $" #shift ", %%mm4       \n\t"\
435
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
436
        "movq %%mm4, 16+" #dst "        \n\t"\
437
        "jmp 2f                         \n\t"\
438
        "1:                             \n\t"\
439
        "pslld $16, %%mm0               \n\t"\
440
        "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
441
        "psrad $13, %%mm0               \n\t"\
442
        "packssdw %%mm0, %%mm0          \n\t"\
443
        "movq %%mm0, " #dst "           \n\t"\
444
        "movq %%mm0, 8+" #dst "         \n\t"\
445
        "movq %%mm0, 16+" #dst "        \n\t"\
446
        "movq %%mm0, 24+" #dst "        \n\t"\
447
        "2:                             \n\t"
448

449

450
//IDCT(      src0,   src4,   src1,   src5,    dst,    rounder, shift)
451
ROW_IDCT(    (%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
452
/*ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
453
ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
454
ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
455

456
DC_COND_ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
457
DC_COND_ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
458
DC_COND_ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
459

460

461
//IDCT(      src0,   src4,   src1,    src5,    dst, shift)
462
COL_IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
463
COL_IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
464
COL_IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
465
COL_IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
466

467
#else
468

    
469
#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
470
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
471
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
472
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
473
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
474
        "movq "MANGLE(wm1010)", %%mm4   \n\t"\
475
        "pand %%mm0, %%mm4              \n\t"\
476
        "por %%mm1, %%mm4               \n\t"\
477
        "por %%mm2, %%mm4               \n\t"\
478
        "por %%mm3, %%mm4               \n\t"\
479
        "packssdw %%mm4,%%mm4           \n\t"\
480
        "movd %%mm4, %%eax              \n\t"\
481
        "orl %%eax, %%eax               \n\t"\
482
        "jz 1f                          \n\t"\
483
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
484
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
485
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
486
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
487
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
488
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
489
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
490
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
491
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
492
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
493
        #rounder ", %%mm4               \n\t"\
494
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
495
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
496
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
497
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
498
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
499
        #rounder ", %%mm0               \n\t"\
500
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
501
        "paddd %%mm0, %%mm0             \n\t" \
502
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
503
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
504
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
505
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
506
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
507
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
508
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
509
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
510
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
511
        "psrad $" #shift ", %%mm7       \n\t"\
512
        "psrad $" #shift ", %%mm4       \n\t"\
513
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
514
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
515
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
516
        "psrad $" #shift ", %%mm1       \n\t"\
517
        "psrad $" #shift ", %%mm2       \n\t"\
518
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
519
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
520
        "movq %%mm7, " #dst "           \n\t"\
521
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
522
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
523
        "movq %%mm2, 24+" #dst "        \n\t"\
524
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
525
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
526
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
527
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
528
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
529
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
530
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
531
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
532
        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
533
        "psrad $" #shift ", %%mm2       \n\t"\
534
        "psrad $" #shift ", %%mm0       \n\t"\
535
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
536
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
537
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
538
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
539
        "psrad $" #shift ", %%mm6       \n\t"\
540
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
541
        "movq %%mm2, 8+" #dst "         \n\t"\
542
        "psrad $" #shift ", %%mm4       \n\t"\
543
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
544
        "movq %%mm4, 16+" #dst "        \n\t"\
545
        "jmp 2f                         \n\t"\
546
        "1:                             \n\t"\
547
        "pslld $16, %%mm0               \n\t"\
548
        "paddd "MANGLE(d40000)", %%mm0  \n\t"\
549
        "psrad $13, %%mm0               \n\t"\
550
        "packssdw %%mm0, %%mm0          \n\t"\
551
        "movq %%mm0, " #dst "           \n\t"\
552
        "movq %%mm0, 8+" #dst "         \n\t"\
553
        "movq %%mm0, 16+" #dst "        \n\t"\
554
        "movq %%mm0, 24+" #dst "        \n\t"\
555
        "2:                             \n\t"
556

    
557
/*
 * Z_COND_IDCT: one 1-D IDCT pass over four quadwords, skipped entirely when
 * all of them are zero.
 *
 *   src0, src4, src1, src5  memory operands loaded into mm0, mm1, mm2, mm3
 *                           (word layout as annotated on the load lines)
 *   dst                     memory operand; four quadwords are stored at
 *                           dst, 8+dst, 16+dst and 24+dst
 *   rounder                 instruction text (opcode plus source operand)
 *                           applied once to mm4 and once to mm0 before the
 *                           butterflies
 *   shift                   arithmetic right shift applied to each 32-bit sum
 *                           before it is packed to 16 bits with saturation
 *   bt                      label jumped to when the OR of the four inputs is
 *                           zero; in that case nothing is stored to dst
 *
 * Coefficients are read from the table addressed by %2 (offsets 16..104).
 * The zero test goes through %eax, so that register is overwritten.
 */
#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
558
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
559
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
560
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
561
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
562
        "movq %%mm0, %%mm4              \n\t"\
563
        "por %%mm1, %%mm4               \n\t"\
564
        "por %%mm2, %%mm4               \n\t"\
565
        "por %%mm3, %%mm4               \n\t"\
566
        "packssdw %%mm4,%%mm4           \n\t"\
567
        "movd %%mm4, %%eax              \n\t"\
568
        "orl %%eax, %%eax               \n\t"\
569
        "jz " #bt "                     \n\t"\
570
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
571
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
572
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
573
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
574
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
575
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
576
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
577
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
578
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
579
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
580
        #rounder ", %%mm4               \n\t"\
581
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
582
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
583
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
584
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
585
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
586
        #rounder ", %%mm0               \n\t"\
587
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
588
        "paddd %%mm0, %%mm0             \n\t" \
589
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
590
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
591
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
592
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
593
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
594
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
595
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
596
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
597
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
598
        "psrad $" #shift ", %%mm7       \n\t"\
599
        "psrad $" #shift ", %%mm4       \n\t"\
600
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
601
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
602
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
603
        "psrad $" #shift ", %%mm1       \n\t"\
604
        "psrad $" #shift ", %%mm2       \n\t"\
605
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
606
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
607
        "movq %%mm7, " #dst "           \n\t"\
608
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
609
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
610
        "movq %%mm2, 24+" #dst "        \n\t"\
611
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
612
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
613
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
614
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
615
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
616
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
617
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
618
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
619
        "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
620
        "psrad $" #shift ", %%mm2       \n\t"\
621
        "psrad $" #shift ", %%mm0       \n\t"\
622
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
623
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
624
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
625
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
626
        "psrad $" #shift ", %%mm6       \n\t"\
627
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
628
        "movq %%mm2, 8+" #dst "         \n\t"\
629
        "psrad $" #shift ", %%mm4       \n\t"\
630
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
631
        "movq %%mm4, 16+" #dst "        \n\t"\
632

    
633
/*
 * ROW_IDCT: unconditional 1-D IDCT pass over four quadwords.
 *
 *   src0, src4, src1, src5  memory operands loaded into mm0, mm1, mm2, mm3
 *                           (word layout as annotated on the load lines)
 *   dst                     memory operand; four quadwords are stored at
 *                           dst, 8+dst, 16+dst and 24+dst
 *   rounder                 instruction text (opcode plus source operand)
 *                           applied once to mm4 and once to mm0 before the
 *                           butterflies
 *   shift                   arithmetic right shift applied to each 32-bit sum
 *                           before it is packed to 16 bits with saturation
 *
 * There is no zero test and no branch here; only MMX registers mm0..mm7 are
 * written. Coefficients are read from the table addressed by %2
 * (offsets 16..104).
 */
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
634
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
635
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
636
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
637
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
638
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
639
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
640
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
641
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
642
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
643
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
644
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
645
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
646
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
647
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
648
        #rounder ", %%mm4               \n\t"\
649
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
650
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
651
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
652
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
653
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
654
        #rounder ", %%mm0               \n\t"\
655
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
656
        "paddd %%mm0, %%mm0             \n\t" \
657
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
658
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
659
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
660
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
661
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
662
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
663
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
664
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
665
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
666
        "psrad $" #shift ", %%mm7       \n\t"\
667
        "psrad $" #shift ", %%mm4       \n\t"\
668
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
669
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
670
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
671
        "psrad $" #shift ", %%mm1       \n\t"\
672
        "psrad $" #shift ", %%mm2       \n\t"\
673
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
674
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
675
        "movq %%mm7, " #dst "           \n\t"\
676
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
677
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
678
        "movq %%mm2, 24+" #dst "        \n\t"\
679
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
680
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
681
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
682
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
683
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
684
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
685
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
686
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
687
        "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
688
        "psrad $" #shift ", %%mm2       \n\t"\
689
        "psrad $" #shift ", %%mm0       \n\t"\
690
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
691
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
692
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
693
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
694
        "psrad $" #shift ", %%mm6       \n\t"\
695
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
696
        "movq %%mm2, 8+" #dst "         \n\t"\
697
        "psrad $" #shift ", %%mm4       \n\t"\
698
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
699
        "movq %%mm4, 16+" #dst "        \n\t"\
700

    
701
/*
 * First pass: four invocations, each reading 32 bytes from %0 and writing
 * 32 bytes at the same offset in %1, all with shift 11.
 * The first group uses "paddd 8(%2)" as rounder, the other three use
 * "paddd (%2)".
 * Each Z_COND_IDCT jumps to its last argument (4f, 2f, 1f) without storing
 * anything when its 32 input bytes are all zero.
 */
//IDCT(         src0,   src4,   src1,   src5,    dst,   rounder, shift)
702
DC_COND_IDCT(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
703
Z_COND_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
704
Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
705
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
706

    
707
#undef IDCT
708
/*
 * IDCT (variant reading all four inputs): 1-D IDCT pass with scattered
 * 32-bit stores.
 *
 *   src0, src4, src1, src5  memory operands loaded into mm0, mm1, mm2, mm3
 *                           (word layout as annotated on the load lines)
 *   dst                     memory operand; eight movd stores go to
 *                           dst + 16*k for k = 0..7
 *   shift                   arithmetic right shift applied to each 32-bit sum
 *
 * No rounding term is added inside this macro. Each sum is shifted, packed to
 * 16 bits with signed saturation, and the low 32 bits of the packed register
 * are stored. Store order by offset from dst:
 *   0: A0+B0   16: A1+B1   32: A2+B2   48: A3+B3
 *  64: A3-B3   80: A2-B2   96: A1-B1  112: A0-B0
 * Coefficients are read from the table addressed by %2 (offsets 16..104).
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
709
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
710
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
711
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
712
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
713
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
714
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
715
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
716
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
717
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
718
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
719
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
720
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
721
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
722
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
723
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
724
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
725
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
726
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
727
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
728
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
729
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
730
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
731
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
732
        "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
733
        "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
734
        "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
735
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
736
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
737
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
738
        "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
739
        "psrad $" #shift ", %%mm7       \n\t"\
740
        "psrad $" #shift ", %%mm4       \n\t"\
741
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
742
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
743
        "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
744
        "psrad $" #shift ", %%mm0       \n\t"\
745
        "psrad $" #shift ", %%mm2       \n\t"\
746
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
747
        "movd %%mm7, " #dst "           \n\t"\
748
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
749
        "movd %%mm0, 16+" #dst "        \n\t"\
750
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
751
        "movd %%mm2, 96+" #dst "        \n\t"\
752
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
753
        "movd %%mm4, 112+" #dst "       \n\t"\
754
        "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
755
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
756
        "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
757
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
758
        "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
759
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
760
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
761
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
762
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
763
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
764
        "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
765
        "psrad $" #shift ", %%mm2       \n\t"\
766
        "psrad $" #shift ", %%mm5       \n\t"\
767
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
768
        "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
769
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
770
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
771
        "psrad $" #shift ", %%mm6       \n\t"\
772
        "psrad $" #shift ", %%mm4       \n\t"\
773
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
774
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
775
        "movd %%mm2, 32+" #dst "        \n\t"\
776
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
777
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
778
        "movd %%mm6, 48+" #dst "        \n\t"\
779
        "movd %%mm4, 64+" #dst "        \n\t"\
780
        "movd %%mm5, 80+" #dst "        \n\t"
781

    
782

    
783
/*
 * Second pass for the fall-through case, where none of the three
 * Z_COND_IDCT invocations of the first pass took its branch.
 * Each IDCT reads four quadwords from %1 (offsets +0, +64, +32, +96 relative
 * to its first operand) and stores its results into %0, 4 bytes further along
 * per invocation, with shift 20. Control then jumps forward to local label 9.
 */
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
784
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
785
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
786
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
787
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
788
        "jmp 9f                         \n\t"
789

    
790
        /* Label 4 is the jz target of the Z_COND_IDCT invoked with bt = 4f: it is
         * reached when the inputs at 32(%0)..56(%0) were all zero, so nothing
         * was stored to 32(%1)..56(%1). The two remaining groups are then
         * processed here, branching to 6f / 5f when they are zero as well.
         * NOTE(review): the leading "#" presumably turns the ASMALIGN(4) text
         * into an assembler comment -- confirm against ASMALIGN's definition. */
        "#" ASMALIGN(4)                      \
791
        "4:                             \n\t"
792
Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
793
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
794

    
795
#undef IDCT
796
/*
 * IDCT (variant that never reads src1): 1-D IDCT pass with scattered 32-bit
 * stores.
 *
 *   src0, src4, src5  memory operands loaded into mm0, mm1, mm3
 *   src1              accepted so the parameter list matches the other IDCT
 *                     variants, but not referenced in the expansion
 *   dst               memory operand; eight movd stores go to dst + 16*k,
 *                     k = 0..7
 *   shift             arithmetic right shift applied to each 32-bit sum
 *
 * With src1 absent, the odd-part terms come from src5 only:
 *   B0 = C7R7+C5R5, B1 = -C5R7-C1R5, B2 = C3R7+C7R5, B3 = -C1R7+C3R5.
 * No rounding term is added inside this macro. Store order by offset from dst:
 *   0: A0+B0   16: A1+B1   32: A2+B2   48: A3+B3
 *  64: A3-B3   80: A2-B2   96: A1-B1  112: A0-B0
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
797
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
798
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
799
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
800
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
801
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
802
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
803
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
804
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
805
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
806
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
807
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
808
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
809
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
810
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
811
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
812
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
813
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
814
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
815
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
816
        "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
817
        "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
818
        "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
819
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
820
        "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
821
        "psrad $" #shift ", %%mm1       \n\t"\
822
        "psrad $" #shift ", %%mm4       \n\t"\
823
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
824
        "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
825
        "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
826
        "psrad $" #shift ", %%mm0       \n\t"\
827
        "psrad $" #shift ", %%mm2       \n\t"\
828
        "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
829
        "movd %%mm1, " #dst "           \n\t"\
830
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
831
        "movd %%mm0, 16+" #dst "        \n\t"\
832
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
833
        "movd %%mm2, 96+" #dst "        \n\t"\
834
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
835
        "movd %%mm4, 112+" #dst "       \n\t"\
836
        "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
837
        "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
838
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
839
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
840
        "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
841
        "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
842
        "psrad $" #shift ", %%mm2       \n\t"\
843
        "psrad $" #shift ", %%mm5       \n\t"\
844
        "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
845
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
846
        "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
847
        "psrad $" #shift ", %%mm6       \n\t"\
848
        "psrad $" #shift ", %%mm1       \n\t"\
849
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
850
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
851
        "movd %%mm2, 32+" #dst "        \n\t"\
852
        "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
853
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
854
        "movd %%mm6, 48+" #dst "        \n\t"\
855
        "movd %%mm1, 64+" #dst "        \n\t"\
856
        "movd %%mm5, 80+" #dst "        \n\t"
857

    
858
/*
 * Second pass on the path through label 4 when neither of the two
 * Z_COND_IDCT invocations after that label took its branch.
 * The IDCT variant in effect here does not read its src1 argument
 * (32(%1)..56(%1)), which was not stored on this path.
 * Results go to %0 with shift 20, then control jumps forward to label 9.
 */
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
859
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
860
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
861
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
862
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
863
        "jmp 9f                         \n\t"
864

    
865
        /* Label 6 is the jz target of the Z_COND_IDCT invoked with bt = 6f after
         * label 4: it is reached when the inputs at 32(%0)..56(%0) and at
         * 64(%0)..88(%0) were all zero. Only the last group is processed here,
         * branching to 7f when it is zero too.
         * NOTE(review): the leading "#" presumably turns the ASMALIGN(4) text
         * into an assembler comment -- confirm against ASMALIGN's definition. */
        "#" ASMALIGN(4)                      \
866
        "6:                             \n\t"
867
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
868

    
869
#undef IDCT
870
/*
 * IDCT (variant that reads only src0 and src5): 1-D IDCT pass with scattered
 * 32-bit stores.
 *
 *   src0, src5  memory operands loaded into mm0 and mm3
 *   src4, src1  accepted so the parameter list matches the other IDCT
 *               variants, but not referenced in the expansion
 *   dst         memory operand; eight movd stores go to dst + 16*k, k = 0..7
 *   shift       arithmetic right shift applied to each 32-bit sum
 *
 * With src4 absent the even part collapses: the values used as A0 and A3 are
 * both C4R4+C4R0, and those used as A1 and A2 are both -C4R4+C4R0.
 * With src1 absent the odd-part terms come from src5 only:
 *   B0 = C7R7+C5R5, B1 = -C5R7-C1R5, B2 = C3R7+C7R5, B3 = -C1R7+C3R5.
 * No rounding term is added inside this macro. Store order by offset from dst:
 *   0: A0+B0   16: A1+B1   32: A2+B2   48: A3+B3
 *  64: A3-B3   80: A2-B2   96: A1-B1  112: A0-B0
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
871
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
872
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
873
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
874
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
875
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
876
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
877
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
878
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
879
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
880
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
881
        "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
882
        "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
883
        "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
884
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
885
        "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
886
        "psrad $" #shift ", %%mm1       \n\t"\
887
        "psrad $" #shift ", %%mm4       \n\t"\
888
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
889
        "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
890
        "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
891
        "psrad $" #shift ", %%mm0       \n\t"\
892
        "psrad $" #shift ", %%mm2       \n\t"\
893
        "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
894
        "movd %%mm1, " #dst "           \n\t"\
895
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
896
        "movd %%mm0, 16+" #dst "        \n\t"\
897
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
898
        "movd %%mm2, 96+" #dst "        \n\t"\
899
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
900
        "movd %%mm4, 112+" #dst "       \n\t"\
901
        "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
902
        "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
903
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
904
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
905
        "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
906
        "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
907
        "psrad $" #shift ", %%mm2       \n\t"\
908
        "psrad $" #shift ", %%mm5       \n\t"\
909
        "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
910
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
911
        "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
912
        "psrad $" #shift ", %%mm6       \n\t"\
913
        "psrad $" #shift ", %%mm1       \n\t"\
914
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
915
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
916
        "movd %%mm2, 32+" #dst "        \n\t"\
917
        "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
918
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
919
        "movd %%mm6, 48+" #dst "        \n\t"\
920
        "movd %%mm1, 64+" #dst "        \n\t"\
921
        "movd %%mm5, 80+" #dst "        \n\t"
922

    
923

    
924
/*
 * Second pass on the path through label 6 when the Z_COND_IDCT after that
 * label did not take its branch.
 * The IDCT variant in effect here reads only its src0 and src5 arguments;
 * the src4 (64(%1)..) and src1 (32(%1)..) locations were not stored on this
 * path. Results go to %0 with shift 20, then control jumps forward to label 9.
 */
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
925
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
926
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
927
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
928
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
929
        "jmp 9f                         \n\t"
930

    
931
        /* Label 2 is the jz target of the first-pass Z_COND_IDCT invoked with
         * bt = 2f: it is reached when the inputs at 64(%0)..88(%0) were all
         * zero, so nothing was stored to 64(%1)..88(%1). The last group is
         * processed here, branching to 3f when it is zero too.
         * NOTE(review): the leading "#" presumably turns the ASMALIGN(4) text
         * into an assembler comment -- confirm against ASMALIGN's definition. */
        "#" ASMALIGN(4)                      \
932
        "2:                             \n\t"
933
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
934

    
935
#undef IDCT
936
/*
 * IDCT (variant that never reads src4): 1-D IDCT pass with scattered 32-bit
 * stores.
 *
 *   src0, src1, src5  memory operands loaded into mm0, mm2, mm3
 *                     (src1 is loaded a second time into mm0 for B2/B3)
 *   src4              accepted so the parameter list matches the other IDCT
 *                     variants, but not referenced in the expansion
 *   dst               memory operand; eight movd stores go to dst + 16*k,
 *                     k = 0..7
 *   shift             arithmetic right shift applied to each 32-bit sum
 *
 * With src4 absent the even part collapses: the values used as A0 and A3 are
 * both C4R4+C4R0, and those used as A1 and A2 are both -C4R4+C4R0.
 * No rounding term is added inside this macro. Store order by offset from dst:
 *   0: A0+B0   16: A1+B1   32: A2+B2   48: A3+B3
 *  64: A3-B3   80: A2-B2   96: A1-B1  112: A0-B0
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
937
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
938
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
939
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
940
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
941
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
942
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
943
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
944
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
945
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
946
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
947
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
948
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
949
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
950
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
951
        "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
952
        "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
953
        "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
954
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
955
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
956
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
957
        "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
958
        "psrad $" #shift ", %%mm7       \n\t"\
959
        "psrad $" #shift ", %%mm4       \n\t"\
960
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
961
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
962
        "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
963
        "psrad $" #shift ", %%mm0       \n\t"\
964
        "psrad $" #shift ", %%mm2       \n\t"\
965
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
966
        "movd %%mm7, " #dst "           \n\t"\
967
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
968
        "movd %%mm0, 16+" #dst "        \n\t"\
969
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
970
        "movd %%mm2, 96+" #dst "        \n\t"\
971
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
972
        "movd %%mm4, 112+" #dst "       \n\t"\
973
        "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
974
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
975
        "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
976
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
977
        "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
978
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
979
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
980
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
981
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
982
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
983
        "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
984
        "psrad $" #shift ", %%mm2       \n\t"\
985
        "psrad $" #shift ", %%mm5       \n\t"\
986
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
987
        "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
988
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
989
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
990
        "psrad $" #shift ", %%mm6       \n\t"\
991
        "psrad $" #shift ", %%mm4       \n\t"\
992
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
993
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
994
        "movd %%mm2, 32+" #dst "        \n\t"\
995
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
996
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
997
        "movd %%mm6, 48+" #dst "        \n\t"\
998
        "movd %%mm4, 64+" #dst "        \n\t"\
999
        "movd %%mm5, 80+" #dst "        \n\t"
1000

    
1001
/*
 * Second pass on the path through label 2 when the Z_COND_IDCT after that
 * label did not take its branch.
 * The IDCT variant in effect here does not read its src4 argument
 * (64(%1)..88(%1)), which was not stored on this path.
 * Results go to %0 with shift 20, then control jumps forward to label 9.
 */
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1002
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1003
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1004
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1005
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1006
        "jmp 9f                         \n\t"
1007

    
1008
        /* Label 3 is the jz target of the Z_COND_IDCT invoked with bt = 3f after
         * label 2: it is reached when the inputs at 64(%0)..88(%0) and at
         * 96(%0)..120(%0) were all zero.
         * NOTE(review): the leading "#" presumably turns the ASMALIGN(4) text
         * into an assembler comment -- confirm against ASMALIGN's definition. */
        "#" ASMALIGN(4)                      \
1009
        "3:                             \n\t"
1010
#undef IDCT
1011
#define IDCT(src0, src4, src1, src5, dst, shift) \
1012
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1013
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
1014
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1015
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1016
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1017
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1018
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1019
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
1020
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1021
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1022
        "movq 64(%2), %%mm3             \n\t"\
1023
        "pmaddwd %%mm2, %%mm3           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1024
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
1025
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
1026
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
1027
        "psrad $" #shift ", %%mm7       \n\t"\
1028
        "psrad $" #shift ", %%mm4       \n\t"\
1029
        "movq %%mm0, %%mm1              \n\t" /* A1             a1 */\
1030
        "paddd %%mm3, %%mm0             \n\t" /* A1+B1          a1+b1 */\
1031
        "psubd %%mm3, %%mm1             \n\t" /* A1-B1          a1-b1 */\
1032
        "psrad $" #shift ", %%mm0       \n\t"\
1033
        "psrad $" #shift ", %%mm1       \n\t"\
1034
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
1035
        "movd %%mm7, " #dst "           \n\t"\
1036
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
1037
        "movd %%mm0, 16+" #dst "        \n\t"\
1038
        "packssdw %%mm1, %%mm1          \n\t" /* A1-B1  a1-b1 */\
1039
        "movd %%mm1, 96+" #dst "        \n\t"\
1040
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
1041
        "movd %%mm4, 112+" #dst "       \n\t"\
1042
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
1043
        "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1044
        "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1045
        "movq %%mm5, %%mm1              \n\t" /* A2             a2 */\
1046
        "paddd %%mm4, %%mm1             \n\t" /* A2+B2          a2+b2 */\
1047
        "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
1048
        "psrad $" #shift ", %%mm1       \n\t"\
1049
        "psrad $" #shift ", %%mm5       \n\t"\
1050
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
1051
        "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
1052
        "psubd %%mm2, %%mm4             \n\t" /* a3-B3          a3-b3 */\
1053
        "psrad $" #shift ", %%mm6       \n\t"\
1054
        "psrad $" #shift ", %%mm4       \n\t"\
1055
        "packssdw %%mm1, %%mm1          \n\t" /* A2+B2  a2+b2 */\
1056
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1057
        "movd %%mm1, 32+" #dst "        \n\t"\
1058
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
1059
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1060
        "movd %%mm6, 48+" #dst "        \n\t"\
1061
        "movd %%mm4, 64+" #dst "        \n\t"\
1062
        "movd %%mm5, 80+" #dst "        \n\t"
1063

    
1064

    
1065
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1066
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1067
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1068
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1069
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1070
        "jmp 9f                         \n\t"
1071

    
1072
        "#" ASMALIGN(4)                      \
1073
        "5:                             \n\t"
1074
#undef IDCT
1075
#define IDCT(src0, src4, src1, src5, dst, shift) \
1076
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1077
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
1078
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1079
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1080
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1081
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1082
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
1083
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1084
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
1085
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1086
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1087
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
1088
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
1089
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1090
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
1091
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
1092
        "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
1093
        "movq 8+" #src4 ", %%mm3        \n\t" /* R6     R2      r6      r2 */\
1094
        "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
1095
        "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1096
        "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
1097
        "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1098
        "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 */\
1099
        "pmaddwd %%mm3, %%mm7           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1100
        "pmaddwd 40(%2), %%mm3          \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1101
        "paddd %%mm1, %%mm7             \n\t" /* A0             a0 */\
1102
        "paddd %%mm1, %%mm1             \n\t" /* 2C0            2c0 */\
1103
        "psubd %%mm7, %%mm1             \n\t" /* A3             a3 */\
1104
        "paddd %%mm2, %%mm3             \n\t" /* A1             a1 */\
1105
        "paddd %%mm2, %%mm2             \n\t" /* 2C1            2c1 */\
1106
        "psubd %%mm3, %%mm2             \n\t" /* A2             a2 */\
1107
        "psrad $" #shift ", %%mm4       \n\t"\
1108
        "psrad $" #shift ", %%mm7       \n\t"\
1109
        "psrad $" #shift ", %%mm3       \n\t"\
1110
        "packssdw %%mm7, %%mm4          \n\t" /* A0     a0 */\
1111
        "movq %%mm4, " #dst "           \n\t"\
1112
        "psrad $" #shift ", %%mm0       \n\t"\
1113
        "packssdw %%mm3, %%mm0          \n\t" /* A1     a1 */\
1114
        "movq %%mm0, 16+" #dst "        \n\t"\
1115
        "movq %%mm0, 96+" #dst "        \n\t"\
1116
        "movq %%mm4, 112+" #dst "       \n\t"\
1117
        "psrad $" #shift ", %%mm5       \n\t"\
1118
        "psrad $" #shift ", %%mm6       \n\t"\
1119
        "psrad $" #shift ", %%mm2       \n\t"\
1120
        "packssdw %%mm2, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1121
        "movq %%mm5, 32+" #dst "        \n\t"\
1122
        "psrad $" #shift ", %%mm1       \n\t"\
1123
        "packssdw %%mm1, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1124
        "movq %%mm6, 48+" #dst "        \n\t"\
1125
        "movq %%mm6, 64+" #dst "        \n\t"\
1126
        "movq %%mm5, 80+" #dst "        \n\t"
1127

    
1128

    
1129
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1130
IDCT(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1131
//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1132
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1133
//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1134
        "jmp 9f                         \n\t"
1135

    
1136

    
1137
        "#" ASMALIGN(4)                      \
1138
        "1:                             \n\t"
1139
#undef IDCT
1140
#define IDCT(src0, src4, src1, src5, dst, shift) \
1141
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1142
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
1143
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
1144
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1145
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1146
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1147
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1148
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
1149
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1150
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
1151
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1152
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1153
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
1154
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1155
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
1156
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
1157
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1158
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
1159
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
1160
        "movq 64(%2), %%mm1             \n\t"\
1161
        "pmaddwd %%mm2, %%mm1           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1162
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
1163
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
1164
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
1165
        "psrad $" #shift ", %%mm7       \n\t"\
1166
        "psrad $" #shift ", %%mm4       \n\t"\
1167
        "movq %%mm0, %%mm3              \n\t" /* A1             a1 */\
1168
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
1169
        "psubd %%mm1, %%mm3             \n\t" /* A1-B1          a1-b1 */\
1170
        "psrad $" #shift ", %%mm0       \n\t"\
1171
        "psrad $" #shift ", %%mm3       \n\t"\
1172
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
1173
        "movd %%mm7, " #dst "           \n\t"\
1174
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
1175
        "movd %%mm0, 16+" #dst "        \n\t"\
1176
        "packssdw %%mm3, %%mm3          \n\t" /* A1-B1  a1-b1 */\
1177
        "movd %%mm3, 96+" #dst "        \n\t"\
1178
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
1179
        "movd %%mm4, 112+" #dst "       \n\t"\
1180
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
1181
        "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1182
        "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1183
        "movq %%mm5, %%mm3              \n\t" /* A2             a2 */\
1184
        "paddd %%mm4, %%mm3             \n\t" /* A2+B2          a2+b2 */\
1185
        "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
1186
        "psrad $" #shift ", %%mm3       \n\t"\
1187
        "psrad $" #shift ", %%mm5       \n\t"\
1188
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
1189
        "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
1190
        "psubd %%mm2, %%mm4             \n\t" /* a3-B3          a3-b3 */\
1191
        "psrad $" #shift ", %%mm6       \n\t"\
1192
        "packssdw %%mm3, %%mm3          \n\t" /* A2+B2  a2+b2 */\
1193
        "movd %%mm3, 32+" #dst "        \n\t"\
1194
        "psrad $" #shift ", %%mm4       \n\t"\
1195
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1196
        "movd %%mm6, 48+" #dst "        \n\t"\
1197
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
1198
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1199
        "movd %%mm4, 64+" #dst "        \n\t"\
1200
        "movd %%mm5, 80+" #dst "        \n\t"
1201

    
1202

    
1203
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1204
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1205
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1206
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1207
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1208
        "jmp 9f                         \n\t"
1209

    
1210

    
1211
        "#" ASMALIGN(4)
1212
        "7:                             \n\t"
1213
#undef IDCT
1214
#define IDCT(src0, src4, src1, src5, dst, shift) \
1215
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1216
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1217
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1218
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1219
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1220
        "psrad $" #shift ", %%mm4       \n\t"\
1221
        "psrad $" #shift ", %%mm0       \n\t"\
1222
        "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
1223
        "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
1224
        "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1225
        "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
1226
        "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1227
        "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 */\
1228
        "psrad $" #shift ", %%mm1       \n\t"\
1229
        "packssdw %%mm1, %%mm4          \n\t" /* A0     a0 */\
1230
        "movq %%mm4, " #dst "           \n\t"\
1231
        "psrad $" #shift ", %%mm2       \n\t"\
1232
        "packssdw %%mm2, %%mm0          \n\t" /* A1     a1 */\
1233
        "movq %%mm0, 16+" #dst "        \n\t"\
1234
        "movq %%mm0, 96+" #dst "        \n\t"\
1235
        "movq %%mm4, 112+" #dst "       \n\t"\
1236
        "movq %%mm0, 32+" #dst "        \n\t"\
1237
        "movq %%mm4, 48+" #dst "        \n\t"\
1238
        "movq %%mm4, 64+" #dst "        \n\t"\
1239
        "movq %%mm0, 80+" #dst "        \n\t"
1240

    
1241
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1242
IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1243
//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1244
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1245
//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1246

    
1247

    
1248
#endif
1249

    
1250
/*
1251
Input
1252
 00 40 04 44 20 60 24 64
1253
 10 30 14 34 50 70 54 74
1254
 01 41 03 43 21 61 23 63
1255
 11 31 13 33 51 71 53 73
1256
 02 42 06 46 22 62 26 66
1257
 12 32 16 36 52 72 56 76
1258
 05 45 07 47 25 65 27 67
1259
 15 35 17 37 55 75 57 77
1260

1261
Temp
1262
 00 04 10 14 20 24 30 34
1263
 40 44 50 54 60 64 70 74
1264
 01 03 11 13 21 23 31 33
1265
 41 43 51 53 61 63 71 73
1266
 02 06 12 16 22 26 32 36
1267
 42 46 52 56 62 66 72 76
1268
 05 07 15 17 25 27 35 37
1269
 45 47 55 57 65 67 75 77
1270
*/
1271

    
1272
"9: \n\t"
1273
                :: "r" (block), "r" (temp), "r" (coeffs)
1274
                : "%eax"
1275
        );
1276
}
1277

    
1278
/**
 * Inverse-transform one coefficient block with the MMX simple IDCT.
 *
 * Thin public wrapper around idct(): the pointer is forwarded unchanged and
 * nothing is returned, so the transformed values are left in block itself.
 *
 * @param block coefficients to transform; overwritten with the result.
 *              NOTE(review): presumably one 8x8 block (64 values) -- confirm
 *              against idct().
 */
void ff_simple_idct_mmx(int16_t *block)
{
    idct(block);
}
1282

    
1283
//FIXME merge add/put into the idct
1284

    
1285
/**
 * Inverse-transform a coefficient block, then hand it to
 * put_pixels_clamped_mmx() together with the destination.
 *
 * The two steps run strictly in this order: idct(block) first, then
 * put_pixels_clamped_mmx(block, dest, line_size) on the transformed block.
 *
 * @param dest      destination pixel buffer passed to put_pixels_clamped_mmx()
 * @param line_size passed through to put_pixels_clamped_mmx(); presumably the
 *                  byte stride between destination rows -- confirm there
 * @param block     coefficients; overwritten by idct() before being stored
 *
 * NOTE(review): judging by its name, put_pixels_clamped_mmx() overwrites dest
 * with the clamped samples; its definition is not in this file section.
 */
void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    idct(block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
1290
/**
 * Inverse-transform a coefficient block, then hand it to
 * add_pixels_clamped_mmx() together with the destination.
 *
 * The two steps run strictly in this order: idct(block) first, then
 * add_pixels_clamped_mmx(block, dest, line_size) on the transformed block.
 *
 * @param dest      destination pixel buffer passed to add_pixels_clamped_mmx()
 * @param line_size passed through to add_pixels_clamped_mmx(); presumably the
 *                  byte stride between destination rows -- confirm there
 * @param block     coefficients; overwritten by idct() before being added
 *
 * NOTE(review): judging by its name, add_pixels_clamped_mmx() adds the block
 * to the existing contents of dest and clamps; its definition is not in this
 * file section.
 */
void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    idct(block);
    add_pixels_clamped_mmx(block, dest, line_size);
}