Statistics
| Branch: | Revision:

ffmpeg / libavcodec / i386 / simple_idct_mmx.c @ d2bb7db1

History | View | Annotate | Download (52.5 KB)

1
/*
2
 * Simple IDCT MMX
3
 *
4
 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * This library is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2 of the License, or (at your option) any later version.
10
 *
11
 * This library is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with this library; if not, write to the Free Software
18
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19
 */
20
#include "../dsputil.h"
21
#include "../simple_idct.h"
22

    
23
/*
 * Fixed-point cosine constants for the IDCT.
 *
 * Ck approximates cos(k*M_PI/16)*sqrt(2)*(1<<14).  The unrounded values are:
 *
 *   k = 0   23170.475006
 *   k = 1   22725.260826
 *   k = 2   21406.727617
 *   k = 3   19265.545870
 *   k = 4   16384.000000
 *   k = 5   12872.826198
 *   k = 6    8866.956905
 *   k = 7    4520.335430
 *
 * All constants are rounded to the nearest integer except C4, which is set
 * to 16383, one below the exact value (the 16384 alternative is not used).
 */
#define C0 23170
#define C1 22725
#define C2 21407
#define C3 19266
#define C4 16383 /* exact value is 16384; see note above */
#define C5 12873
#define C6 8867
#define C7 4520

/* Bit counts used for the rounding terms and final right shifts of the
 * row pass (ROW_SHIFT) and the column pass (COL_SHIFT). */
#define ROW_SHIFT 11
#define COL_SHIFT 20 // 6
48

    
49
static const uint64_t attribute_used __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL;
50
static const uint64_t attribute_used __attribute__((aligned(8))) d40000= 0x0000000000040000ULL;
51

    
52
/*
 * Constant table for the IDCT: 14 rows of four int16_t values, 8 bytes per
 * row.  The byte offset of every row from the start of the table is given
 * on the right.  The MMX code in this file loads constants as N(%2);
 * presumably %2 is bound to this table (the operand list is not visible
 * from here -- TODO confirm).
 *
 * Most rows are one (a, b) pair stored twice, which is what the local
 * COEFF_PAIR_X2 helper expands to.
 */
#define COEFF_PAIR_X2(a, b) (a), (b), (a), (b)
static const int16_t __attribute__((aligned(8))) coeffs[]= {
        /* rounding terms built from ROW_SHIFT */
        COEFF_PAIR_X2(1<<(ROW_SHIFT-1), 0),             /*   0 */
        /* the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :) */
        1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,       /*   8 */

        COEFF_PAIR_X2( C4,  C4),                        /*  16 */
        COEFF_PAIR_X2( C4, -C4),                        /*  24 */

        COEFF_PAIR_X2( C2,  C6),                        /*  32 */
        COEFF_PAIR_X2( C6, -C2),                        /*  40 */

        COEFF_PAIR_X2( C1,  C3),                        /*  48 */
        COEFF_PAIR_X2( C5,  C7),                        /*  56 */

        COEFF_PAIR_X2( C3, -C7),                        /*  64 */
        COEFF_PAIR_X2(-C1, -C5),                        /*  72 */

        COEFF_PAIR_X2( C5, -C1),                        /*  80 */
        COEFF_PAIR_X2( C7,  C3),                        /*  88 */

        COEFF_PAIR_X2( C7, -C5),                        /*  96 */
        COEFF_PAIR_X2( C3, -C1)                         /* 104 */
};
#undef COEFF_PAIR_X2
79

    
80
#if 0
81
static void unused_var_killer(){
82
        int a= wm1010 + d40000;
83
        temp[0]=a;
84
}
85

86
static void inline idctCol (int16_t * col, int16_t *input)
87
{
88
#undef C0
89
#undef C1
90
#undef C2
91
#undef C3
92
#undef C4
93
#undef C5
94
#undef C6
95
#undef C7
96
        int a0, a1, a2, a3, b0, b1, b2, b3;
97
        const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
98
        const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
99
        const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
100
        const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
101
        const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
102
        const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
103
        const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
104
        const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
105
/*
106
        if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
107
                col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
108
                        col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
109
                return;
110
        }*/
111

112
col[8*0] = input[8*0 + 0];
113
col[8*1] = input[8*2 + 0];
114
col[8*2] = input[8*0 + 1];
115
col[8*3] = input[8*2 + 1];
116
col[8*4] = input[8*4 + 0];
117
col[8*5] = input[8*6 + 0];
118
col[8*6] = input[8*4 + 1];
119
col[8*7] = input[8*6 + 1];
120

121
        a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
122
        a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
123
        a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
124
        a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
125

126
        b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
127
        b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
128
        b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
129
        b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
130

131
        col[8*0] = (a0 + b0) >> COL_SHIFT;
132
        col[8*1] = (a1 + b1) >> COL_SHIFT;
133
        col[8*2] = (a2 + b2) >> COL_SHIFT;
134
        col[8*3] = (a3 + b3) >> COL_SHIFT;
135
        col[8*4] = (a3 - b3) >> COL_SHIFT;
136
        col[8*5] = (a2 - b2) >> COL_SHIFT;
137
        col[8*6] = (a1 - b1) >> COL_SHIFT;
138
        col[8*7] = (a0 - b0) >> COL_SHIFT;
139
}
140

141
static void inline idctRow (int16_t * output, int16_t * input)
142
{
143
        int16_t row[8];
144

145
        int a0, a1, a2, a3, b0, b1, b2, b3;
146
        const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
147
        const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
148
        const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
149
        const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
150
        const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
151
        const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
152
        const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
153
        const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
154

155
row[0] = input[0];
156
row[2] = input[1];
157
row[4] = input[4];
158
row[6] = input[5];
159
row[1] = input[8];
160
row[3] = input[9];
161
row[5] = input[12];
162
row[7] = input[13];
163

164
        if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
165
                row[0] = row[1] = row[2] = row[3] = row[4] =
166
                        row[5] = row[6] = row[7] = row[0]<<3;
167
        output[0] = row[0];
168
        output[2] = row[1];
169
        output[4] = row[2];
170
        output[6] = row[3];
171
        output[8] = row[4];
172
        output[10] = row[5];
173
        output[12] = row[6];
174
        output[14] = row[7];
175
                return;
176
        }
177

178
        a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
179
        a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
180
        a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
181
        a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
182

183
        b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
184
        b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
185
        b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
186
        b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
187

188
        row[0] = (a0 + b0) >> ROW_SHIFT;
189
        row[1] = (a1 + b1) >> ROW_SHIFT;
190
        row[2] = (a2 + b2) >> ROW_SHIFT;
191
        row[3] = (a3 + b3) >> ROW_SHIFT;
192
        row[4] = (a3 - b3) >> ROW_SHIFT;
193
        row[5] = (a2 - b2) >> ROW_SHIFT;
194
        row[6] = (a1 - b1) >> ROW_SHIFT;
195
        row[7] = (a0 - b0) >> ROW_SHIFT;
196

197
        output[0] = row[0];
198
        output[2] = row[1];
199
        output[4] = row[2];
200
        output[6] = row[3];
201
        output[8] = row[4];
202
        output[10] = row[5];
203
        output[12] = row[6];
204
        output[14] = row[7];
205
}
206
#endif
207

    
208
static inline void idct(int16_t *block)
209
{
210
        int64_t __attribute__((aligned(8))) align_tmp[16];
211
        int16_t * const temp= (int16_t*)align_tmp;
212

    
213
        asm volatile(
214
#if 0 //Alternative, simpler variant
215

216
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
217
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
218
        "movq " #src4 ", %%mm1                        \n\t" /* R6        R2        r6        r2 */\
219
        "movq " #src1 ", %%mm2                        \n\t" /* R3        R1        r3        r1 */\
220
        "movq " #src5 ", %%mm3                        \n\t" /* R7        R5        r7        r5 */\
221
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
222
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
223
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
224
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
225
        "movq 32(%2), %%mm5                        \n\t" /* C6        C2        C6        C2 */\
226
        "pmaddwd %%mm1, %%mm5                        \n\t" /* C6R6+C2R2        C6r6+C2r2 */\
227
        "movq 40(%2), %%mm6                        \n\t" /* -C2        C6        -C2        C6 */\
228
        "pmaddwd %%mm6, %%mm1                        \n\t" /* -C2R6+C6R2        -C2r6+C6r2 */\
229
        "movq 48(%2), %%mm7                        \n\t" /* C3        C1        C3        C1 */\
230
        "pmaddwd %%mm2, %%mm7                        \n\t" /* C3R3+C1R1        C3r3+C1r1 */\
231
        #rounder ", %%mm4                        \n\t"\
232
        "movq %%mm4, %%mm6                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
233
        "paddd %%mm5, %%mm4                        \n\t" /* A0                a0 */\
234
        "psubd %%mm5, %%mm6                        \n\t" /* A3                a3 */\
235
        "movq 56(%2), %%mm5                        \n\t" /* C7        C5        C7        C5 */\
236
        "pmaddwd %%mm3, %%mm5                        \n\t" /* C7R7+C5R5        C7r7+C5r5 */\
237
        #rounder ", %%mm0                        \n\t"\
238
        "paddd %%mm0, %%mm1                        \n\t" /* A1                a1 */\
239
        "paddd %%mm0, %%mm0                        \n\t" \
240
        "psubd %%mm1, %%mm0                        \n\t" /* A2                a2 */\
241
        "pmaddwd 64(%2), %%mm2                        \n\t" /* -C7R3+C3R1        -C7r3+C3r1 */\
242
        "paddd %%mm5, %%mm7                        \n\t" /* B0                b0 */\
243
        "movq 72(%2), %%mm5                        \n\t" /* -C5        -C1        -C5        -C1 */\
244
        "pmaddwd %%mm3, %%mm5                        \n\t" /* -C5R7-C1R5        -C5r7-C1r5 */\
245
        "paddd %%mm4, %%mm7                        \n\t" /* A0+B0                a0+b0 */\
246
        "paddd %%mm4, %%mm4                        \n\t" /* 2A0                2a0 */\
247
        "psubd %%mm7, %%mm4                        \n\t" /* A0-B0                a0-b0 */\
248
        "paddd %%mm2, %%mm5                        \n\t" /* B1                b1 */\
249
        "psrad $" #shift ", %%mm7                \n\t"\
250
        "psrad $" #shift ", %%mm4                \n\t"\
251
        "movq %%mm1, %%mm2                        \n\t" /* A1                a1 */\
252
        "paddd %%mm5, %%mm1                        \n\t" /* A1+B1                a1+b1 */\
253
        "psubd %%mm5, %%mm2                        \n\t" /* A1-B1                a1-b1 */\
254
        "psrad $" #shift ", %%mm1                \n\t"\
255
        "psrad $" #shift ", %%mm2                \n\t"\
256
        "packssdw %%mm1, %%mm7                        \n\t" /* A1+B1        a1+b1        A0+B0        a0+b0 */\
257
        "packssdw %%mm4, %%mm2                        \n\t" /* A0-B0        a0-b0        A1-B1        a1-b1 */\
258
        "movq %%mm7, " #dst "                        \n\t"\
259
        "movq " #src1 ", %%mm1                        \n\t" /* R3        R1        r3        r1 */\
260
        "movq 80(%2), %%mm4                        \n\t" /* -C1        C5        -C1         C5 */\
261
        "movq %%mm2, 24+" #dst "                \n\t"\
262
        "pmaddwd %%mm1, %%mm4                        \n\t" /* -C1R3+C5R1        -C1r3+C5r1 */\
263
        "movq 88(%2), %%mm7                        \n\t" /* C3        C7        C3         C7 */\
264
        "pmaddwd 96(%2), %%mm1                        \n\t" /* -C5R3+C7R1        -C5r3+C7r1 */\
265
        "pmaddwd %%mm3, %%mm7                        \n\t" /* C3R7+C7R5        C3r7+C7r5 */\
266
        "movq %%mm0, %%mm2                        \n\t" /* A2                a2 */\
267
        "pmaddwd 104(%2), %%mm3                        \n\t" /* -C1R7+C3R5        -C1r7+C3r5 */\
268
        "paddd %%mm7, %%mm4                        \n\t" /* B2                b2 */\
269
        "paddd %%mm4, %%mm2                        \n\t" /* A2+B2                a2+b2 */\
270
        "psubd %%mm4, %%mm0                        \n\t" /* a2-B2                a2-b2 */\
271
        "psrad $" #shift ", %%mm2                \n\t"\
272
        "psrad $" #shift ", %%mm0                \n\t"\
273
        "movq %%mm6, %%mm4                        \n\t" /* A3                a3 */\
274
        "paddd %%mm1, %%mm3                        \n\t" /* B3                b3 */\
275
        "paddd %%mm3, %%mm6                        \n\t" /* A3+B3                a3+b3 */\
276
        "psubd %%mm3, %%mm4                        \n\t" /* a3-B3                a3-b3 */\
277
        "psrad $" #shift ", %%mm6                \n\t"\
278
        "packssdw %%mm6, %%mm2                        \n\t" /* A3+B3        a3+b3        A2+B2        a2+b2 */\
279
        "movq %%mm2, 8+" #dst "                        \n\t"\
280
        "psrad $" #shift ", %%mm4                \n\t"\
281
        "packssdw %%mm0, %%mm4                        \n\t" /* A2-B2        a2-b2        A3-B3        a3-b3 */\
282
        "movq %%mm4, 16+" #dst "                \n\t"\
283

284
#define COL_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
285
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
286
        "movq " #src4 ", %%mm1                        \n\t" /* R6        R2        r6        r2 */\
287
        "movq " #src1 ", %%mm2                        \n\t" /* R3        R1        r3        r1 */\
288
        "movq " #src5 ", %%mm3                        \n\t" /* R7        R5        r7        r5 */\
289
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
290
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
291
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
292
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
293
        "movq 32(%2), %%mm5                        \n\t" /* C6        C2        C6        C2 */\
294
        "pmaddwd %%mm1, %%mm5                        \n\t" /* C6R6+C2R2        C6r6+C2r2 */\
295
        "movq 40(%2), %%mm6                        \n\t" /* -C2        C6        -C2        C6 */\
296
        "pmaddwd %%mm6, %%mm1                        \n\t" /* -C2R6+C6R2        -C2r6+C6r2 */\
297
        #rounder ", %%mm4                        \n\t"\
298
        "movq %%mm4, %%mm6                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
299
        "movq 48(%2), %%mm7                        \n\t" /* C3        C1        C3        C1 */\
300
        #rounder ", %%mm0                        \n\t"\
301
        "pmaddwd %%mm2, %%mm7                        \n\t" /* C3R3+C1R1        C3r3+C1r1 */\
302
        "paddd %%mm5, %%mm4                        \n\t" /* A0                a0 */\
303
        "psubd %%mm5, %%mm6                        \n\t" /* A3                a3 */\
304
        "movq %%mm0, %%mm5                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
305
        "paddd %%mm1, %%mm0                        \n\t" /* A1                a1 */\
306
        "psubd %%mm1, %%mm5                        \n\t" /* A2                a2 */\
307
        "movq 56(%2), %%mm1                        \n\t" /* C7        C5        C7        C5 */\
308
        "pmaddwd %%mm3, %%mm1                        \n\t" /* C7R7+C5R5        C7r7+C5r5 */\
309
        "pmaddwd 64(%2), %%mm2                        \n\t" /* -C7R3+C3R1        -C7r3+C3r1 */\
310
        "paddd %%mm1, %%mm7                        \n\t" /* B0                b0 */\
311
        "movq 72(%2), %%mm1                        \n\t" /* -C5        -C1        -C5        -C1 */\
312
        "pmaddwd %%mm3, %%mm1                        \n\t" /* -C5R7-C1R5        -C5r7-C1r5 */\
313
        "paddd %%mm4, %%mm7                        \n\t" /* A0+B0                a0+b0 */\
314
        "paddd %%mm4, %%mm4                        \n\t" /* 2A0                2a0 */\
315
        "psubd %%mm7, %%mm4                        \n\t" /* A0-B0                a0-b0 */\
316
        "paddd %%mm2, %%mm1                        \n\t" /* B1                b1 */\
317
        "psrad $" #shift ", %%mm7                \n\t"\
318
        "psrad $" #shift ", %%mm4                \n\t"\
319
        "movq %%mm0, %%mm2                        \n\t" /* A1                a1 */\
320
        "paddd %%mm1, %%mm0                        \n\t" /* A1+B1                a1+b1 */\
321
        "psubd %%mm1, %%mm2                        \n\t" /* A1-B1                a1-b1 */\
322
        "psrad $" #shift ", %%mm0                \n\t"\
323
        "psrad $" #shift ", %%mm2                \n\t"\
324
        "packssdw %%mm7, %%mm7                        \n\t" /* A0+B0        a0+b0 */\
325
        "movd %%mm7, " #dst "                        \n\t"\
326
        "packssdw %%mm0, %%mm0                        \n\t" /* A1+B1        a1+b1 */\
327
        "movd %%mm0, 16+" #dst "                \n\t"\
328
        "packssdw %%mm2, %%mm2                        \n\t" /* A1-B1        a1-b1 */\
329
        "movd %%mm2, 96+" #dst "                \n\t"\
330
        "packssdw %%mm4, %%mm4                        \n\t" /* A0-B0        a0-b0 */\
331
        "movd %%mm4, 112+" #dst "                \n\t"\
332
        "movq " #src1 ", %%mm0                        \n\t" /* R3        R1        r3        r1 */\
333
        "movq 80(%2), %%mm4                        \n\t" /* -C1        C5        -C1         C5 */\
334
        "pmaddwd %%mm0, %%mm4                        \n\t" /* -C1R3+C5R1        -C1r3+C5r1 */\
335
        "movq 88(%2), %%mm7                        \n\t" /* C3        C7        C3         C7 */\
336
        "pmaddwd 96(%2), %%mm0                        \n\t" /* -C5R3+C7R1        -C5r3+C7r1 */\
337
        "pmaddwd %%mm3, %%mm7                        \n\t" /* C3R7+C7R5        C3r7+C7r5 */\
338
        "movq %%mm5, %%mm2                        \n\t" /* A2                a2 */\
339
        "pmaddwd 104(%2), %%mm3                        \n\t" /* -C1R7+C3R5        -C1r7+C3r5 */\
340
        "paddd %%mm7, %%mm4                        \n\t" /* B2                b2 */\
341
        "paddd %%mm4, %%mm2                        \n\t" /* A2+B2                a2+b2 */\
342
        "psubd %%mm4, %%mm5                        \n\t" /* a2-B2                a2-b2 */\
343
        "psrad $" #shift ", %%mm2                \n\t"\
344
        "psrad $" #shift ", %%mm5                \n\t"\
345
        "movq %%mm6, %%mm4                        \n\t" /* A3                a3 */\
346
        "paddd %%mm0, %%mm3                        \n\t" /* B3                b3 */\
347
        "paddd %%mm3, %%mm6                        \n\t" /* A3+B3                a3+b3 */\
348
        "psubd %%mm3, %%mm4                        \n\t" /* a3-B3                a3-b3 */\
349
        "psrad $" #shift ", %%mm6                \n\t"\
350
        "psrad $" #shift ", %%mm4                \n\t"\
351
        "packssdw %%mm2, %%mm2                        \n\t" /* A2+B2        a2+b2 */\
352
        "packssdw %%mm6, %%mm6                        \n\t" /* A3+B3        a3+b3 */\
353
        "movd %%mm2, 32+" #dst "                \n\t"\
354
        "packssdw %%mm4, %%mm4                        \n\t" /* A3-B3        a3-b3 */\
355
        "packssdw %%mm5, %%mm5                        \n\t" /* A2-B2        a2-b2 */\
356
        "movd %%mm6, 48+" #dst "                \n\t"\
357
        "movd %%mm4, 64+" #dst "                \n\t"\
358
        "movd %%mm5, 80+" #dst "                \n\t"\
359

360
        
361
#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
362
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
363
        "movq " #src4 ", %%mm1                        \n\t" /* R6        R2        r6        r2 */\
364
        "movq " #src1 ", %%mm2                        \n\t" /* R3        R1        r3        r1 */\
365
        "movq " #src5 ", %%mm3                        \n\t" /* R7        R5        r7        r5 */\
366
        "movq "MANGLE(wm1010)", %%mm4                \n\t"\
367
        "pand %%mm0, %%mm4                        \n\t"\
368
        "por %%mm1, %%mm4                        \n\t"\
369
        "por %%mm2, %%mm4                        \n\t"\
370
        "por %%mm3, %%mm4                        \n\t"\
371
        "packssdw %%mm4,%%mm4                        \n\t"\
372
        "movd %%mm4, %%eax                        \n\t"\
373
        "orl %%eax, %%eax                        \n\t"\
374
        "jz 1f                                        \n\t"\
375
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
376
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
377
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
378
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
379
        "movq 32(%2), %%mm5                        \n\t" /* C6        C2        C6        C2 */\
380
        "pmaddwd %%mm1, %%mm5                        \n\t" /* C6R6+C2R2        C6r6+C2r2 */\
381
        "movq 40(%2), %%mm6                        \n\t" /* -C2        C6        -C2        C6 */\
382
        "pmaddwd %%mm6, %%mm1                        \n\t" /* -C2R6+C6R2        -C2r6+C6r2 */\
383
        "movq 48(%2), %%mm7                        \n\t" /* C3        C1        C3        C1 */\
384
        "pmaddwd %%mm2, %%mm7                        \n\t" /* C3R3+C1R1        C3r3+C1r1 */\
385
        #rounder ", %%mm4                        \n\t"\
386
        "movq %%mm4, %%mm6                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
387
        "paddd %%mm5, %%mm4                        \n\t" /* A0                a0 */\
388
        "psubd %%mm5, %%mm6                        \n\t" /* A3                a3 */\
389
        "movq 56(%2), %%mm5                        \n\t" /* C7        C5        C7        C5 */\
390
        "pmaddwd %%mm3, %%mm5                        \n\t" /* C7R7+C5R5        C7r7+C5r5 */\
391
        #rounder ", %%mm0                        \n\t"\
392
        "paddd %%mm0, %%mm1                        \n\t" /* A1                a1 */\
393
        "paddd %%mm0, %%mm0                        \n\t" \
394
        "psubd %%mm1, %%mm0                        \n\t" /* A2                a2 */\
395
        "pmaddwd 64(%2), %%mm2                        \n\t" /* -C7R3+C3R1        -C7r3+C3r1 */\
396
        "paddd %%mm5, %%mm7                        \n\t" /* B0                b0 */\
397
        "movq 72(%2), %%mm5                        \n\t" /* -C5        -C1        -C5        -C1 */\
398
        "pmaddwd %%mm3, %%mm5                        \n\t" /* -C5R7-C1R5        -C5r7-C1r5 */\
399
        "paddd %%mm4, %%mm7                        \n\t" /* A0+B0                a0+b0 */\
400
        "paddd %%mm4, %%mm4                        \n\t" /* 2A0                2a0 */\
401
        "psubd %%mm7, %%mm4                        \n\t" /* A0-B0                a0-b0 */\
402
        "paddd %%mm2, %%mm5                        \n\t" /* B1                b1 */\
403
        "psrad $" #shift ", %%mm7                \n\t"\
404
        "psrad $" #shift ", %%mm4                \n\t"\
405
        "movq %%mm1, %%mm2                        \n\t" /* A1                a1 */\
406
        "paddd %%mm5, %%mm1                        \n\t" /* A1+B1                a1+b1 */\
407
        "psubd %%mm5, %%mm2                        \n\t" /* A1-B1                a1-b1 */\
408
        "psrad $" #shift ", %%mm1                \n\t"\
409
        "psrad $" #shift ", %%mm2                \n\t"\
410
        "packssdw %%mm1, %%mm7                        \n\t" /* A1+B1        a1+b1        A0+B0        a0+b0 */\
411
        "packssdw %%mm4, %%mm2                        \n\t" /* A0-B0        a0-b0        A1-B1        a1-b1 */\
412
        "movq %%mm7, " #dst "                        \n\t"\
413
        "movq " #src1 ", %%mm1                        \n\t" /* R3        R1        r3        r1 */\
414
        "movq 80(%2), %%mm4                        \n\t" /* -C1        C5        -C1         C5 */\
415
        "movq %%mm2, 24+" #dst "                \n\t"\
416
        "pmaddwd %%mm1, %%mm4                        \n\t" /* -C1R3+C5R1        -C1r3+C5r1 */\
417
        "movq 88(%2), %%mm7                        \n\t" /* C3        C7        C3         C7 */\
418
        "pmaddwd 96(%2), %%mm1                        \n\t" /* -C5R3+C7R1        -C5r3+C7r1 */\
419
        "pmaddwd %%mm3, %%mm7                        \n\t" /* C3R7+C7R5        C3r7+C7r5 */\
420
        "movq %%mm0, %%mm2                        \n\t" /* A2                a2 */\
421
        "pmaddwd 104(%2), %%mm3                        \n\t" /* -C1R7+C3R5        -C1r7+C3r5 */\
422
        "paddd %%mm7, %%mm4                        \n\t" /* B2                b2 */\
423
        "paddd %%mm4, %%mm2                        \n\t" /* A2+B2                a2+b2 */\
424
        "psubd %%mm4, %%mm0                        \n\t" /* a2-B2                a2-b2 */\
425
        "psrad $" #shift ", %%mm2                \n\t"\
426
        "psrad $" #shift ", %%mm0                \n\t"\
427
        "movq %%mm6, %%mm4                        \n\t" /* A3                a3 */\
428
        "paddd %%mm1, %%mm3                        \n\t" /* B3                b3 */\
429
        "paddd %%mm3, %%mm6                        \n\t" /* A3+B3                a3+b3 */\
430
        "psubd %%mm3, %%mm4                        \n\t" /* a3-B3                a3-b3 */\
431
        "psrad $" #shift ", %%mm6                \n\t"\
432
        "packssdw %%mm6, %%mm2                        \n\t" /* A3+B3        a3+b3        A2+B2        a2+b2 */\
433
        "movq %%mm2, 8+" #dst "                        \n\t"\
434
        "psrad $" #shift ", %%mm4                \n\t"\
435
        "packssdw %%mm0, %%mm4                        \n\t" /* A2-B2        a2-b2        A3-B3        a3-b3 */\
436
        "movq %%mm4, 16+" #dst "                \n\t"\
437
        "jmp 2f                                        \n\t"\
438
        "1:                                        \n\t"\
439
        "pslld $16, %%mm0                        \n\t"\
440
        "#paddd "MANGLE(d40000)", %%mm0                \n\t"\
441
        "psrad $13, %%mm0                        \n\t"\
442
        "packssdw %%mm0, %%mm0                        \n\t"\
443
        "movq %%mm0, " #dst "                        \n\t"\
444
        "movq %%mm0, 8+" #dst "                        \n\t"\
445
        "movq %%mm0, 16+" #dst "                \n\t"\
446
        "movq %%mm0, 24+" #dst "                \n\t"\
447
        "2:                                        \n\t"
448

449

450
//IDCT(      src0,   src4,   src1,   src5,    dst,    rounder, shift)
451
ROW_IDCT(    (%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
452
/*ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
453
ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
454
ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
455

456
DC_COND_ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
457
DC_COND_ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
458
DC_COND_ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
459

460

461
//IDCT(      src0,   src4,   src1,    src5,    dst, rounder, shift)
462
COL_IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
463
COL_IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
464
COL_IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
465
COL_IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
466

467
#else
468

    
469
/*
 * DC_COND_IDCT: row step with a shortcut, expanded as inline-asm text.
 *
 *   src0, src4, src1, src5  memory operands holding the four input quadwords
 *                           (loaded into mm0, mm1, mm2, mm3 in that order)
 *   dst                     memory operand; results go to dst, 8+dst, 16+dst
 *                           and 24+dst
 *   rounder                 instruction text (mnemonic plus source operand)
 *                           applied to mm4 and to mm0 before the even sums
 *   shift                   immediate for the arithmetic right shifts
 *
 * The macro first ORs together src4, src1, src5 and the part of src0 selected
 * by wm1010 (R4 and r4 in the notation of the comments below). When that OR is
 * zero it branches to local label 1, which derives the output from src0 alone
 * (shift left by 16, add d40000, arithmetic shift right by 13, pack) and
 * stores the same quadword to all four destinations. Otherwise the full
 * even/odd computation runs against the coefficient table addressed through
 * %2 (offsets 16 to 104), and the packed, saturated results are stored.
 *
 * Writes mm0-mm7 and eax. Defines local labels 1 and 2.
 */
#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
470
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
471
        "movq " #src4 ", %%mm1                        \n\t" /* R6        R2        r6        r2 */\
472
        "movq " #src1 ", %%mm2                        \n\t" /* R3        R1        r3        r1 */\
473
        "movq " #src5 ", %%mm3                        \n\t" /* R7        R5        r7        r5 */\
474
        "movq "MANGLE(wm1010)", %%mm4                \n\t" /* mask: upper 16 bits of each dword */\
475
        "pand %%mm0, %%mm4                        \n\t" /* keep only R4 and r4 of src0 */\
476
        "por %%mm1, %%mm4                        \n\t" /* OR in src4 */\
477
        "por %%mm2, %%mm4                        \n\t" /* OR in src1 */\
478
        "por %%mm3, %%mm4                        \n\t" /* OR in src5 */\
479
        "packssdw %%mm4,%%mm4                        \n\t" /* both dwords, saturated, now in the low 32 bits */\
480
        "movd %%mm4, %%eax                        \n\t"\
481
        "orl %%eax, %%eax                        \n\t" /* set ZF from eax */\
482
        "jz 1f                                        \n\t" /* everything except R0/r0 is zero: take the shortcut */\
483
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
484
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
485
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
486
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
487
        "movq 32(%2), %%mm5                        \n\t" /* C6        C2        C6        C2 */\
488
        "pmaddwd %%mm1, %%mm5                        \n\t" /* C6R6+C2R2        C6r6+C2r2 */\
489
        "movq 40(%2), %%mm6                        \n\t" /* -C2        C6        -C2        C6 */\
490
        "pmaddwd %%mm6, %%mm1                        \n\t" /* -C2R6+C6R2        -C2r6+C6r2 */\
491
        "movq 48(%2), %%mm7                        \n\t" /* C3        C1        C3        C1 */\
492
        "pmaddwd %%mm2, %%mm7                        \n\t" /* C3R3+C1R1        C3r3+C1r1 */\
493
        #rounder ", %%mm4                        \n\t"\
494
        "movq %%mm4, %%mm6                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
495
        "paddd %%mm5, %%mm4                        \n\t" /* A0                a0 */\
496
        "psubd %%mm5, %%mm6                        \n\t" /* A3                a3 */\
497
        "movq 56(%2), %%mm5                        \n\t" /* C7        C5        C7        C5 */\
498
        "pmaddwd %%mm3, %%mm5                        \n\t" /* C7R7+C5R5        C7r7+C5r5 */\
499
        #rounder ", %%mm0                        \n\t"\
500
        "paddd %%mm0, %%mm1                        \n\t" /* A1                a1 */\
501
        "paddd %%mm0, %%mm0                        \n\t" \
502
        "psubd %%mm1, %%mm0                        \n\t" /* A2                a2 */\
503
        "pmaddwd 64(%2), %%mm2                        \n\t" /* -C7R3+C3R1        -C7r3+C3r1 */\
504
        "paddd %%mm5, %%mm7                        \n\t" /* B0                b0 */\
505
        "movq 72(%2), %%mm5                        \n\t" /* -C5        -C1        -C5        -C1 */\
506
        "pmaddwd %%mm3, %%mm5                        \n\t" /* -C5R7-C1R5        -C5r7-C1r5 */\
507
        "paddd %%mm4, %%mm7                        \n\t" /* A0+B0                a0+b0 */\
508
        "paddd %%mm4, %%mm4                        \n\t" /* 2A0                2a0 */\
509
        "psubd %%mm7, %%mm4                        \n\t" /* A0-B0                a0-b0 */\
510
        "paddd %%mm2, %%mm5                        \n\t" /* B1                b1 */\
511
        "psrad $" #shift ", %%mm7                \n\t"\
512
        "psrad $" #shift ", %%mm4                \n\t"\
513
        "movq %%mm1, %%mm2                        \n\t" /* A1                a1 */\
514
        "paddd %%mm5, %%mm1                        \n\t" /* A1+B1                a1+b1 */\
515
        "psubd %%mm5, %%mm2                        \n\t" /* A1-B1                a1-b1 */\
516
        "psrad $" #shift ", %%mm1                \n\t"\
517
        "psrad $" #shift ", %%mm2                \n\t"\
518
        "packssdw %%mm1, %%mm7                        \n\t" /* A1+B1        a1+b1        A0+B0        a0+b0 */\
519
        "packssdw %%mm4, %%mm2                        \n\t" /* A0-B0        a0-b0        A1-B1        a1-b1 */\
520
        "movq %%mm7, " #dst "                        \n\t"\
521
        "movq " #src1 ", %%mm1                        \n\t" /* R3        R1        r3        r1 */\
522
        "movq 80(%2), %%mm4                        \n\t" /* -C1        C5        -C1         C5 */\
523
        "movq %%mm2, 24+" #dst "                \n\t"\
524
        "pmaddwd %%mm1, %%mm4                        \n\t" /* -C1R3+C5R1        -C1r3+C5r1 */\
525
        "movq 88(%2), %%mm7                        \n\t" /* C3        C7        C3         C7 */\
526
        "pmaddwd 96(%2), %%mm1                        \n\t" /* -C5R3+C7R1        -C5r3+C7r1 */\
527
        "pmaddwd %%mm3, %%mm7                        \n\t" /* C3R7+C7R5        C3r7+C7r5 */\
528
        "movq %%mm0, %%mm2                        \n\t" /* A2                a2 */\
529
        "pmaddwd 104(%2), %%mm3                        \n\t" /* -C1R7+C3R5        -C1r7+C3r5 */\
530
        "paddd %%mm7, %%mm4                        \n\t" /* B2                b2 */\
531
        "paddd %%mm4, %%mm2                        \n\t" /* A2+B2                a2+b2 */\
532
        "psubd %%mm4, %%mm0                        \n\t" /* A2-B2                a2-b2 */\
533
        "psrad $" #shift ", %%mm2                \n\t"\
534
        "psrad $" #shift ", %%mm0                \n\t"\
535
        "movq %%mm6, %%mm4                        \n\t" /* A3                a3 */\
536
        "paddd %%mm1, %%mm3                        \n\t" /* B3                b3 */\
537
        "paddd %%mm3, %%mm6                        \n\t" /* A3+B3                a3+b3 */\
538
        "psubd %%mm3, %%mm4                        \n\t" /* A3-B3                a3-b3 */\
539
        "psrad $" #shift ", %%mm6                \n\t"\
540
        "packssdw %%mm6, %%mm2                        \n\t" /* A3+B3        a3+b3        A2+B2        a2+b2 */\
541
        "movq %%mm2, 8+" #dst "                        \n\t"\
542
        "psrad $" #shift ", %%mm4                \n\t"\
543
        "packssdw %%mm0, %%mm4                        \n\t" /* A2-B2        a2-b2        A3-B3        a3-b3 */\
544
        "movq %%mm4, 16+" #dst "                \n\t"\
545
        "jmp 2f                                        \n\t" /* skip the shortcut code */\
546
        "1:                                        \n\t" /* shortcut path: output built from src0 (mm0) alone */\
547
        "pslld $16, %%mm0                        \n\t"\
548
        "paddd "MANGLE(d40000)", %%mm0                \n\t"\
549
        "psrad $13, %%mm0                        \n\t"\
550
        "packssdw %%mm0, %%mm0                        \n\t"\
551
        "movq %%mm0, " #dst "                        \n\t" /* same quadword to all four destinations */\
552
        "movq %%mm0, 8+" #dst "                        \n\t"\
553
        "movq %%mm0, 16+" #dst "                \n\t"\
554
        "movq %%mm0, 24+" #dst "                \n\t"\
555
        "2:                                        \n\t"
556

    
557
/*
 * Z_COND_IDCT: row step that is skipped entirely when its input is zero,
 * expanded as inline-asm text.
 *
 *   src0, src4, src1, src5  memory operands holding the four input quadwords
 *   dst                     memory operand; results go to dst, 8+dst, 16+dst
 *                           and 24+dst
 *   rounder                 instruction text applied to mm4 and to mm0
 *   shift                   immediate for the arithmetic right shifts
 *   bt                      branch target (e.g. a local label reference such
 *                           as 4f) taken when all four inputs are zero
 *
 * The four inputs are ORed together without masking. If the OR is zero the
 * macro jumps to bt and stores nothing; otherwise it runs the same even/odd
 * computation and stores as the non-shortcut path of DC_COND_IDCT.
 *
 * Writes mm0-mm7 and eax. Defines no labels of its own. The last body line
 * still ends with a line continuation.
 */
#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
558
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
559
        "movq " #src4 ", %%mm1                        \n\t" /* R6        R2        r6        r2 */\
560
        "movq " #src1 ", %%mm2                        \n\t" /* R3        R1        r3        r1 */\
561
        "movq " #src5 ", %%mm3                        \n\t" /* R7        R5        r7        r5 */\
562
        "movq %%mm0, %%mm4                        \n\t" /* zero test starts from all of src0 (no mask) */\
563
        "por %%mm1, %%mm4                        \n\t"\
564
        "por %%mm2, %%mm4                        \n\t"\
565
        "por %%mm3, %%mm4                        \n\t"\
566
        "packssdw %%mm4,%%mm4                        \n\t" /* both dwords, saturated, now in the low 32 bits */\
567
        "movd %%mm4, %%eax                        \n\t"\
568
        "orl %%eax, %%eax                        \n\t" /* set ZF from eax */\
569
        "jz " #bt "                                \n\t" /* all inputs zero: leave via bt, nothing stored */\
570
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
571
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
572
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
573
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
574
        "movq 32(%2), %%mm5                        \n\t" /* C6        C2        C6        C2 */\
575
        "pmaddwd %%mm1, %%mm5                        \n\t" /* C6R6+C2R2        C6r6+C2r2 */\
576
        "movq 40(%2), %%mm6                        \n\t" /* -C2        C6        -C2        C6 */\
577
        "pmaddwd %%mm6, %%mm1                        \n\t" /* -C2R6+C6R2        -C2r6+C6r2 */\
578
        "movq 48(%2), %%mm7                        \n\t" /* C3        C1        C3        C1 */\
579
        "pmaddwd %%mm2, %%mm7                        \n\t" /* C3R3+C1R1        C3r3+C1r1 */\
580
        #rounder ", %%mm4                        \n\t"\
581
        "movq %%mm4, %%mm6                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
582
        "paddd %%mm5, %%mm4                        \n\t" /* A0                a0 */\
583
        "psubd %%mm5, %%mm6                        \n\t" /* A3                a3 */\
584
        "movq 56(%2), %%mm5                        \n\t" /* C7        C5        C7        C5 */\
585
        "pmaddwd %%mm3, %%mm5                        \n\t" /* C7R7+C5R5        C7r7+C5r5 */\
586
        #rounder ", %%mm0                        \n\t"\
587
        "paddd %%mm0, %%mm1                        \n\t" /* A1                a1 */\
588
        "paddd %%mm0, %%mm0                        \n\t" \
589
        "psubd %%mm1, %%mm0                        \n\t" /* A2                a2 */\
590
        "pmaddwd 64(%2), %%mm2                        \n\t" /* -C7R3+C3R1        -C7r3+C3r1 */\
591
        "paddd %%mm5, %%mm7                        \n\t" /* B0                b0 */\
592
        "movq 72(%2), %%mm5                        \n\t" /* -C5        -C1        -C5        -C1 */\
593
        "pmaddwd %%mm3, %%mm5                        \n\t" /* -C5R7-C1R5        -C5r7-C1r5 */\
594
        "paddd %%mm4, %%mm7                        \n\t" /* A0+B0                a0+b0 */\
595
        "paddd %%mm4, %%mm4                        \n\t" /* 2A0                2a0 */\
596
        "psubd %%mm7, %%mm4                        \n\t" /* A0-B0                a0-b0 */\
597
        "paddd %%mm2, %%mm5                        \n\t" /* B1                b1 */\
598
        "psrad $" #shift ", %%mm7                \n\t"\
599
        "psrad $" #shift ", %%mm4                \n\t"\
600
        "movq %%mm1, %%mm2                        \n\t" /* A1                a1 */\
601
        "paddd %%mm5, %%mm1                        \n\t" /* A1+B1                a1+b1 */\
602
        "psubd %%mm5, %%mm2                        \n\t" /* A1-B1                a1-b1 */\
603
        "psrad $" #shift ", %%mm1                \n\t"\
604
        "psrad $" #shift ", %%mm2                \n\t"\
605
        "packssdw %%mm1, %%mm7                        \n\t" /* A1+B1        a1+b1        A0+B0        a0+b0 */\
606
        "packssdw %%mm4, %%mm2                        \n\t" /* A0-B0        a0-b0        A1-B1        a1-b1 */\
607
        "movq %%mm7, " #dst "                        \n\t"\
608
        "movq " #src1 ", %%mm1                        \n\t" /* R3        R1        r3        r1 */\
609
        "movq 80(%2), %%mm4                        \n\t" /* -C1        C5        -C1         C5 */\
610
        "movq %%mm2, 24+" #dst "                \n\t"\
611
        "pmaddwd %%mm1, %%mm4                        \n\t" /* -C1R3+C5R1        -C1r3+C5r1 */\
612
        "movq 88(%2), %%mm7                        \n\t" /* C3        C7        C3         C7 */\
613
        "pmaddwd 96(%2), %%mm1                        \n\t" /* -C5R3+C7R1        -C5r3+C7r1 */\
614
        "pmaddwd %%mm3, %%mm7                        \n\t" /* C3R7+C7R5        C3r7+C7r5 */\
615
        "movq %%mm0, %%mm2                        \n\t" /* A2                a2 */\
616
        "pmaddwd 104(%2), %%mm3                        \n\t" /* -C1R7+C3R5        -C1r7+C3r5 */\
617
        "paddd %%mm7, %%mm4                        \n\t" /* B2                b2 */\
618
        "paddd %%mm4, %%mm2                        \n\t" /* A2+B2                a2+b2 */\
619
        "psubd %%mm4, %%mm0                        \n\t" /* A2-B2                a2-b2 */\
620
        "psrad $" #shift ", %%mm2                \n\t"\
621
        "psrad $" #shift ", %%mm0                \n\t"\
622
        "movq %%mm6, %%mm4                        \n\t" /* A3                a3 */\
623
        "paddd %%mm1, %%mm3                        \n\t" /* B3                b3 */\
624
        "paddd %%mm3, %%mm6                        \n\t" /* A3+B3                a3+b3 */\
625
        "psubd %%mm3, %%mm4                        \n\t" /* A3-B3                a3-b3 */\
626
        "psrad $" #shift ", %%mm6                \n\t"\
627
        "packssdw %%mm6, %%mm2                        \n\t" /* A3+B3        a3+b3        A2+B2        a2+b2 */\
628
        "movq %%mm2, 8+" #dst "                        \n\t"\
629
        "psrad $" #shift ", %%mm4                \n\t"\
630
        "packssdw %%mm0, %%mm4                        \n\t" /* A2-B2        a2-b2        A3-B3        a3-b3 */\
631
        "movq %%mm4, 16+" #dst "                \n\t"\
632

    
633
/*
 * ROW_IDCT: unconditional row step, expanded as inline-asm text.
 *
 *   src0, src4, src1, src5  memory operands holding the four input quadwords
 *   dst                     memory operand; results go to dst, 8+dst, 16+dst
 *                           and 24+dst
 *   rounder                 instruction text applied to mm4 and to mm0
 *   shift                   immediate for the arithmetic right shifts
 *
 * Same instruction sequence as the non-shortcut path of DC_COND_IDCT, but with
 * no zero test, no branch and no labels. Writes mm0-mm7. The last body line
 * still ends with a line continuation.
 */
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
634
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
635
        "movq " #src4 ", %%mm1                        \n\t" /* R6        R2        r6        r2 */\
636
        "movq " #src1 ", %%mm2                        \n\t" /* R3        R1        r3        r1 */\
637
        "movq " #src5 ", %%mm3                        \n\t" /* R7        R5        r7        r5 */\
638
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
639
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
640
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
641
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
642
        "movq 32(%2), %%mm5                        \n\t" /* C6        C2        C6        C2 */\
643
        "pmaddwd %%mm1, %%mm5                        \n\t" /* C6R6+C2R2        C6r6+C2r2 */\
644
        "movq 40(%2), %%mm6                        \n\t" /* -C2        C6        -C2        C6 */\
645
        "pmaddwd %%mm6, %%mm1                        \n\t" /* -C2R6+C6R2        -C2r6+C6r2 */\
646
        "movq 48(%2), %%mm7                        \n\t" /* C3        C1        C3        C1 */\
647
        "pmaddwd %%mm2, %%mm7                        \n\t" /* C3R3+C1R1        C3r3+C1r1 */\
648
        #rounder ", %%mm4                        \n\t"\
649
        "movq %%mm4, %%mm6                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
650
        "paddd %%mm5, %%mm4                        \n\t" /* A0                a0 */\
651
        "psubd %%mm5, %%mm6                        \n\t" /* A3                a3 */\
652
        "movq 56(%2), %%mm5                        \n\t" /* C7        C5        C7        C5 */\
653
        "pmaddwd %%mm3, %%mm5                        \n\t" /* C7R7+C5R5        C7r7+C5r5 */\
654
        #rounder ", %%mm0                        \n\t"\
655
        "paddd %%mm0, %%mm1                        \n\t" /* A1                a1 */\
656
        "paddd %%mm0, %%mm0                        \n\t" \
657
        "psubd %%mm1, %%mm0                        \n\t" /* A2                a2 */\
658
        "pmaddwd 64(%2), %%mm2                        \n\t" /* -C7R3+C3R1        -C7r3+C3r1 */\
659
        "paddd %%mm5, %%mm7                        \n\t" /* B0                b0 */\
660
        "movq 72(%2), %%mm5                        \n\t" /* -C5        -C1        -C5        -C1 */\
661
        "pmaddwd %%mm3, %%mm5                        \n\t" /* -C5R7-C1R5        -C5r7-C1r5 */\
662
        "paddd %%mm4, %%mm7                        \n\t" /* A0+B0                a0+b0 */\
663
        "paddd %%mm4, %%mm4                        \n\t" /* 2A0                2a0 */\
664
        "psubd %%mm7, %%mm4                        \n\t" /* A0-B0                a0-b0 */\
665
        "paddd %%mm2, %%mm5                        \n\t" /* B1                b1 */\
666
        "psrad $" #shift ", %%mm7                \n\t"\
667
        "psrad $" #shift ", %%mm4                \n\t"\
668
        "movq %%mm1, %%mm2                        \n\t" /* A1                a1 */\
669
        "paddd %%mm5, %%mm1                        \n\t" /* A1+B1                a1+b1 */\
670
        "psubd %%mm5, %%mm2                        \n\t" /* A1-B1                a1-b1 */\
671
        "psrad $" #shift ", %%mm1                \n\t"\
672
        "psrad $" #shift ", %%mm2                \n\t"\
673
        "packssdw %%mm1, %%mm7                        \n\t" /* A1+B1        a1+b1        A0+B0        a0+b0 */\
674
        "packssdw %%mm4, %%mm2                        \n\t" /* A0-B0        a0-b0        A1-B1        a1-b1 */\
675
        "movq %%mm7, " #dst "                        \n\t"\
676
        "movq " #src1 ", %%mm1                        \n\t" /* R3        R1        r3        r1 */\
677
        "movq 80(%2), %%mm4                        \n\t" /* -C1        C5        -C1         C5 */\
678
        "movq %%mm2, 24+" #dst "                \n\t"\
679
        "pmaddwd %%mm1, %%mm4                        \n\t" /* -C1R3+C5R1        -C1r3+C5r1 */\
680
        "movq 88(%2), %%mm7                        \n\t" /* C3        C7        C3         C7 */\
681
        "pmaddwd 96(%2), %%mm1                        \n\t" /* -C5R3+C7R1        -C5r3+C7r1 */\
682
        "pmaddwd %%mm3, %%mm7                        \n\t" /* C3R7+C7R5        C3r7+C7r5 */\
683
        "movq %%mm0, %%mm2                        \n\t" /* A2                a2 */\
684
        "pmaddwd 104(%2), %%mm3                        \n\t" /* -C1R7+C3R5        -C1r7+C3r5 */\
685
        "paddd %%mm7, %%mm4                        \n\t" /* B2                b2 */\
686
        "paddd %%mm4, %%mm2                        \n\t" /* A2+B2                a2+b2 */\
687
        "psubd %%mm4, %%mm0                        \n\t" /* A2-B2                a2-b2 */\
688
        "psrad $" #shift ", %%mm2                \n\t"\
689
        "psrad $" #shift ", %%mm0                \n\t"\
690
        "movq %%mm6, %%mm4                        \n\t" /* A3                a3 */\
691
        "paddd %%mm1, %%mm3                        \n\t" /* B3                b3 */\
692
        "paddd %%mm3, %%mm6                        \n\t" /* A3+B3                a3+b3 */\
693
        "psubd %%mm3, %%mm4                        \n\t" /* A3-B3                a3-b3 */\
694
        "psrad $" #shift ", %%mm6                \n\t"\
695
        "packssdw %%mm6, %%mm2                        \n\t" /* A3+B3        a3+b3        A2+B2        a2+b2 */\
696
        "movq %%mm2, 8+" #dst "                        \n\t"\
697
        "psrad $" #shift ", %%mm4                \n\t"\
698
        "packssdw %%mm0, %%mm4                        \n\t" /* A2-B2        a2-b2        A3-B3        a3-b3 */\
699
        "movq %%mm4, 16+" #dst "                \n\t"\
700

    
701
/*
 * Row pass: each call reads four quadwords through %0 and writes four
 * quadwords at the same row offset through %1, with shift 11.
 * Row 0 uses DC_COND_IDCT with rounder `paddd 8(%2)`. Rows at offsets 32, 64
 * and 96 use Z_COND_IDCT with rounder `paddd (%2)`; an all-zero row branches
 * to 4f, 2f or 1f respectively instead of being computed.
 * These are forward references, so 2f and 1f bind to the next 2: / 1: that
 * follows in the asm text, not to the local labels inside the DC_COND_IDCT
 * expansion above.
 */
//IDCT(         src0,   src4,   src1,   src5,    dst,   rounder, shift)
702
DC_COND_IDCT(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
703
Z_COND_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
704
Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
705
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
706

    
707
/*
 * IDCT (general variant): column step that reads all four inputs, expanded as
 * inline-asm text. Replaces any earlier definition of IDCT.
 *
 *   src0, src4, src1, src5  memory operands holding the four input quadwords
 *   dst                     memory operand used as the base of eight stores
 *   rounder                 instruction text applied to mm4 and to mm0
 *   shift                   immediate for the arithmetic right shifts
 *
 * After the even sums (A0..A3) and odd sums (B0..B3) are formed, each result
 * register is packed with itself and its low 32 bits (two 16-bit values) are
 * stored with movd:
 *     dst     A0+B0      16+dst  A1+B1      32+dst  A2+B2      48+dst  A3+B3
 *     64+dst  A3-B3      80+dst  A2-B2      96+dst  A1-B1     112+dst  A0-B0
 *
 * Writes mm0-mm7; uses the coefficient table addressed through %2.
 */
#undef IDCT
708
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
709
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
710
        "movq " #src4 ", %%mm1                        \n\t" /* R6        R2        r6        r2 */\
711
        "movq " #src1 ", %%mm2                        \n\t" /* R3        R1        r3        r1 */\
712
        "movq " #src5 ", %%mm3                        \n\t" /* R7        R5        r7        r5 */\
713
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
714
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
715
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
716
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
717
        "movq 32(%2), %%mm5                        \n\t" /* C6        C2        C6        C2 */\
718
        "pmaddwd %%mm1, %%mm5                        \n\t" /* C6R6+C2R2        C6r6+C2r2 */\
719
        "movq 40(%2), %%mm6                        \n\t" /* -C2        C6        -C2        C6 */\
720
        "pmaddwd %%mm6, %%mm1                        \n\t" /* -C2R6+C6R2        -C2r6+C6r2 */\
721
        #rounder ", %%mm4                        \n\t"\
722
        "movq %%mm4, %%mm6                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
723
        "movq 48(%2), %%mm7                        \n\t" /* C3        C1        C3        C1 */\
724
        #rounder ", %%mm0                        \n\t"\
725
        "pmaddwd %%mm2, %%mm7                        \n\t" /* C3R3+C1R1        C3r3+C1r1 */\
726
        "paddd %%mm5, %%mm4                        \n\t" /* A0                a0 */\
727
        "psubd %%mm5, %%mm6                        \n\t" /* A3                a3 */\
728
        "movq %%mm0, %%mm5                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
729
        "paddd %%mm1, %%mm0                        \n\t" /* A1                a1 */\
730
        "psubd %%mm1, %%mm5                        \n\t" /* A2                a2 */\
731
        "movq 56(%2), %%mm1                        \n\t" /* C7        C5        C7        C5 */\
732
        "pmaddwd %%mm3, %%mm1                        \n\t" /* C7R7+C5R5        C7r7+C5r5 */\
733
        "pmaddwd 64(%2), %%mm2                        \n\t" /* -C7R3+C3R1        -C7r3+C3r1 */\
734
        "paddd %%mm1, %%mm7                        \n\t" /* B0                b0 */\
735
        "movq 72(%2), %%mm1                        \n\t" /* -C5        -C1        -C5        -C1 */\
736
        "pmaddwd %%mm3, %%mm1                        \n\t" /* -C5R7-C1R5        -C5r7-C1r5 */\
737
        "paddd %%mm4, %%mm7                        \n\t" /* A0+B0                a0+b0 */\
738
        "paddd %%mm4, %%mm4                        \n\t" /* 2A0                2a0 */\
739
        "psubd %%mm7, %%mm4                        \n\t" /* A0-B0                a0-b0 */\
740
        "paddd %%mm2, %%mm1                        \n\t" /* B1                b1 */\
741
        "psrad $" #shift ", %%mm7                \n\t"\
742
        "psrad $" #shift ", %%mm4                \n\t"\
743
        "movq %%mm0, %%mm2                        \n\t" /* A1                a1 */\
744
        "paddd %%mm1, %%mm0                        \n\t" /* A1+B1                a1+b1 */\
745
        "psubd %%mm1, %%mm2                        \n\t" /* A1-B1                a1-b1 */\
746
        "psrad $" #shift ", %%mm0                \n\t"\
747
        "psrad $" #shift ", %%mm2                \n\t"\
748
        "packssdw %%mm7, %%mm7                        \n\t" /* A0+B0        a0+b0 */\
749
        "movd %%mm7, " #dst "                        \n\t"\
750
        "packssdw %%mm0, %%mm0                        \n\t" /* A1+B1        a1+b1 */\
751
        "movd %%mm0, 16+" #dst "                \n\t"\
752
        "packssdw %%mm2, %%mm2                        \n\t" /* A1-B1        a1-b1 */\
753
        "movd %%mm2, 96+" #dst "                \n\t"\
754
        "packssdw %%mm4, %%mm4                        \n\t" /* A0-B0        a0-b0 */\
755
        "movd %%mm4, 112+" #dst "                \n\t"\
756
        "movq " #src1 ", %%mm0                        \n\t" /* R3        R1        r3        r1 */\
757
        "movq 80(%2), %%mm4                        \n\t" /* -C1        C5        -C1         C5 */\
758
        "pmaddwd %%mm0, %%mm4                        \n\t" /* -C1R3+C5R1        -C1r3+C5r1 */\
759
        "movq 88(%2), %%mm7                        \n\t" /* C3        C7        C3         C7 */\
760
        "pmaddwd 96(%2), %%mm0                        \n\t" /* -C5R3+C7R1        -C5r3+C7r1 */\
761
        "pmaddwd %%mm3, %%mm7                        \n\t" /* C3R7+C7R5        C3r7+C7r5 */\
762
        "movq %%mm5, %%mm2                        \n\t" /* A2                a2 */\
763
        "pmaddwd 104(%2), %%mm3                        \n\t" /* -C1R7+C3R5        -C1r7+C3r5 */\
764
        "paddd %%mm7, %%mm4                        \n\t" /* B2                b2 */\
765
        "paddd %%mm4, %%mm2                        \n\t" /* A2+B2                a2+b2 */\
766
        "psubd %%mm4, %%mm5                        \n\t" /* A2-B2                a2-b2 */\
767
        "psrad $" #shift ", %%mm2                \n\t"\
768
        "psrad $" #shift ", %%mm5                \n\t"\
769
        "movq %%mm6, %%mm4                        \n\t" /* A3                a3 */\
770
        "paddd %%mm0, %%mm3                        \n\t" /* B3                b3 */\
771
        "paddd %%mm3, %%mm6                        \n\t" /* A3+B3                a3+b3 */\
772
        "psubd %%mm3, %%mm4                        \n\t" /* A3-B3                a3-b3 */\
773
        "psrad $" #shift ", %%mm6                \n\t"\
774
        "psrad $" #shift ", %%mm4                \n\t"\
775
        "packssdw %%mm2, %%mm2                        \n\t" /* A2+B2        a2+b2 */\
776
        "packssdw %%mm6, %%mm6                        \n\t" /* A3+B3        a3+b3 */\
777
        "movd %%mm2, 32+" #dst "                \n\t"\
778
        "packssdw %%mm4, %%mm4                        \n\t" /* A3-B3        a3-b3 */\
779
        "packssdw %%mm5, %%mm5                        \n\t" /* A2-B2        a2-b2 */\
780
        "movd %%mm6, 48+" #dst "                \n\t"\
781
        "movd %%mm4, 64+" #dst "                \n\t"\
782
        "movd %%mm5, 80+" #dst "                \n\t"
783

    
784

    
785
/*
 * Column pass for the path on which none of the preceding Z_COND_IDCT zero
 * tests branched. Each call reads four quadwords through %1 and stores
 * through %0 at byte offsets 0, 4, 8 and 12, with shift 20 and rounder
 * argument `/nop`.
 * NOTE(review): presumably the assembler treats the "/nop, ..." line as a
 * comment, so no rounding constant is added here -- confirm.
 */
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
786
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
787
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
788
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
789
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
790
        "jmp 9f                                        \n\t"
791

    
792
/*
 * Label 4 is entered only by branch (the jmp above prevents fall-through):
 * it is the bt target of the Z_COND_IDCT call for the row at 32(%0), taken
 * when that row is all zero. The two remaining rows are handled here; an
 * all-zero row at 64(%0) branches to 6f, one at 96(%0) to 5f.
 * The .balign text begins with '#'.
 * NOTE(review): presumably that makes it an assembler comment, i.e. the
 * alignment directive is disabled -- confirm.
 */
        "#.balign 16                                \n\t"\
793
        "4:                                        \n\t"
794
Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
795
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
796

    
797
/*
 * IDCT (variant without src1): column step, expanded as inline-asm text.
 * Replaces the previous definition of IDCT.
 *
 * The parameter list is unchanged, but the body never references src1: only
 * src0, src4 and src5 are loaded, so the odd sums B0..B3 consist of the src5
 * (R5/R7) products alone. The even part, the shifts and the eight movd stores
 * (dst, 16+dst, ... 112+dst) match the general variant.
 * It is expanded on the path through label 4, which is taken when the row
 * later passed as src1 (32(%1)..56(%1)) had all-zero input and was never
 * written.
 *
 * Writes mm0-mm7 (mm2 is used only as a temporary here).
 */
#undef IDCT
798
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
799
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
800
        "movq " #src4 ", %%mm1                        \n\t" /* R6        R2        r6        r2 */\
801
        "movq " #src5 ", %%mm3                        \n\t" /* R7        R5        r7        r5 */\
802
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
803
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
804
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
805
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
806
        "movq 32(%2), %%mm5                        \n\t" /* C6        C2        C6        C2 */\
807
        "pmaddwd %%mm1, %%mm5                        \n\t" /* C6R6+C2R2        C6r6+C2r2 */\
808
        "movq 40(%2), %%mm6                        \n\t" /* -C2        C6        -C2        C6 */\
809
        "pmaddwd %%mm6, %%mm1                        \n\t" /* -C2R6+C6R2        -C2r6+C6r2 */\
810
        #rounder ", %%mm4                        \n\t"\
811
        "movq %%mm4, %%mm6                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
812
        #rounder ", %%mm0                        \n\t"\
813
        "paddd %%mm5, %%mm4                        \n\t" /* A0                a0 */\
814
        "psubd %%mm5, %%mm6                        \n\t" /* A3                a3 */\
815
        "movq %%mm0, %%mm5                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
816
        "paddd %%mm1, %%mm0                        \n\t" /* A1                a1 */\
817
        "psubd %%mm1, %%mm5                        \n\t" /* A2                a2 */\
818
        "movq 56(%2), %%mm1                        \n\t" /* C7        C5        C7        C5 */\
819
        "pmaddwd %%mm3, %%mm1                        \n\t" /* C7R7+C5R5        C7r7+C5r5 */\
820
        "movq 72(%2), %%mm7                        \n\t" /* -C5        -C1        -C5        -C1 */\
821
        "pmaddwd %%mm3, %%mm7                        \n\t" /* -C5R7-C1R5        -C5r7-C1r5 */\
822
        "paddd %%mm4, %%mm1                        \n\t" /* A0+B0                a0+b0 */\
823
        "paddd %%mm4, %%mm4                        \n\t" /* 2A0                2a0 */\
824
        "psubd %%mm1, %%mm4                        \n\t" /* A0-B0                a0-b0 */\
825
        "psrad $" #shift ", %%mm1                \n\t"\
826
        "psrad $" #shift ", %%mm4                \n\t"\
827
        "movq %%mm0, %%mm2                        \n\t" /* A1                a1 */\
828
        "paddd %%mm7, %%mm0                        \n\t" /* A1+B1                a1+b1 */\
829
        "psubd %%mm7, %%mm2                        \n\t" /* A1-B1                a1-b1 */\
830
        "psrad $" #shift ", %%mm0                \n\t"\
831
        "psrad $" #shift ", %%mm2                \n\t"\
832
        "packssdw %%mm1, %%mm1                        \n\t" /* A0+B0        a0+b0 */\
833
        "movd %%mm1, " #dst "                        \n\t"\
834
        "packssdw %%mm0, %%mm0                        \n\t" /* A1+B1        a1+b1 */\
835
        "movd %%mm0, 16+" #dst "                \n\t"\
836
        "packssdw %%mm2, %%mm2                        \n\t" /* A1-B1        a1-b1 */\
837
        "movd %%mm2, 96+" #dst "                \n\t"\
838
        "packssdw %%mm4, %%mm4                        \n\t" /* A0-B0        a0-b0 */\
839
        "movd %%mm4, 112+" #dst "                \n\t"\
840
        "movq 88(%2), %%mm1                        \n\t" /* C3        C7        C3         C7 */\
841
        "pmaddwd %%mm3, %%mm1                        \n\t" /* C3R7+C7R5        C3r7+C7r5 */\
842
        "movq %%mm5, %%mm2                        \n\t" /* A2                a2 */\
843
        "pmaddwd 104(%2), %%mm3                        \n\t" /* -C1R7+C3R5        -C1r7+C3r5 */\
844
        "paddd %%mm1, %%mm2                        \n\t" /* A2+B2                a2+b2 */\
845
        "psubd %%mm1, %%mm5                        \n\t" /* A2-B2                a2-b2 */\
846
        "psrad $" #shift ", %%mm2                \n\t"\
847
        "psrad $" #shift ", %%mm5                \n\t"\
848
        "movq %%mm6, %%mm1                        \n\t" /* A3                a3 */\
849
        "paddd %%mm3, %%mm6                        \n\t" /* A3+B3                a3+b3 */\
850
        "psubd %%mm3, %%mm1                        \n\t" /* A3-B3                a3-b3 */\
851
        "psrad $" #shift ", %%mm6                \n\t"\
852
        "psrad $" #shift ", %%mm1                \n\t"\
853
        "packssdw %%mm2, %%mm2                        \n\t" /* A2+B2        a2+b2 */\
854
        "packssdw %%mm6, %%mm6                        \n\t" /* A3+B3        a3+b3 */\
855
        "movd %%mm2, 32+" #dst "                \n\t"\
856
        "packssdw %%mm1, %%mm1                        \n\t" /* A3-B3        a3-b3 */\
857
        "packssdw %%mm5, %%mm5                        \n\t" /* A2-B2        a2-b2 */\
858
        "movd %%mm6, 48+" #dst "                \n\t"\
859
        "movd %%mm1, 64+" #dst "                \n\t"\
860
        "movd %%mm5, 80+" #dst "                \n\t"        
861

    
862
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
863
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
864
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
865
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
866
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
867
        "jmp 9f                                        \n\t"
868

    
869
        "#.balign 16                                \n\t"\
870
        "6:                                        \n\t"
871
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
872

    
873
#undef IDCT
874
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
875
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
876
        "movq " #src5 ", %%mm3                        \n\t" /* R7        R5        r7        r5 */\
877
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
878
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
879
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
880
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
881
        #rounder ", %%mm4                        \n\t"\
882
        "movq %%mm4, %%mm6                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
883
        #rounder ", %%mm0                        \n\t"\
884
        "movq %%mm0, %%mm5                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
885
        "movq 56(%2), %%mm1                        \n\t" /* C7        C5        C7        C5 */\
886
        "pmaddwd %%mm3, %%mm1                        \n\t" /* C7R7+C5R5        C7r7+C5r5 */\
887
        "movq 72(%2), %%mm7                        \n\t" /* -C5        -C1        -C5        -C1 */\
888
        "pmaddwd %%mm3, %%mm7                        \n\t" /* -C5R7-C1R5        -C5r7-C1r5 */\
889
        "paddd %%mm4, %%mm1                        \n\t" /* A0+B0                a0+b0 */\
890
        "paddd %%mm4, %%mm4                        \n\t" /* 2A0                2a0 */\
891
        "psubd %%mm1, %%mm4                        \n\t" /* A0-B0                a0-b0 */\
892
        "psrad $" #shift ", %%mm1                \n\t"\
893
        "psrad $" #shift ", %%mm4                \n\t"\
894
        "movq %%mm0, %%mm2                        \n\t" /* A1                a1 */\
895
        "paddd %%mm7, %%mm0                        \n\t" /* A1+B1                a1+b1 */\
896
        "psubd %%mm7, %%mm2                        \n\t" /* A1-B1                a1-b1 */\
897
        "psrad $" #shift ", %%mm0                \n\t"\
898
        "psrad $" #shift ", %%mm2                \n\t"\
899
        "packssdw %%mm1, %%mm1                        \n\t" /* A0+B0        a0+b0 */\
900
        "movd %%mm1, " #dst "                        \n\t"\
901
        "packssdw %%mm0, %%mm0                        \n\t" /* A1+B1        a1+b1 */\
902
        "movd %%mm0, 16+" #dst "                \n\t"\
903
        "packssdw %%mm2, %%mm2                        \n\t" /* A1-B1        a1-b1 */\
904
        "movd %%mm2, 96+" #dst "                \n\t"\
905
        "packssdw %%mm4, %%mm4                        \n\t" /* A0-B0        a0-b0 */\
906
        "movd %%mm4, 112+" #dst "                \n\t"\
907
        "movq 88(%2), %%mm1                        \n\t" /* C3        C7        C3         C7 */\
908
        "pmaddwd %%mm3, %%mm1                        \n\t" /* C3R7+C7R5        C3r7+C7r5 */\
909
        "movq %%mm5, %%mm2                        \n\t" /* A2                a2 */\
910
        "pmaddwd 104(%2), %%mm3                        \n\t" /* -C1R7+C3R5        -C1r7+C3r5 */\
911
        "paddd %%mm1, %%mm2                        \n\t" /* A2+B2                a2+b2 */\
912
        "psubd %%mm1, %%mm5                        \n\t" /* a2-B2                a2-b2 */\
913
        "psrad $" #shift ", %%mm2                \n\t"\
914
        "psrad $" #shift ", %%mm5                \n\t"\
915
        "movq %%mm6, %%mm1                        \n\t" /* A3                a3 */\
916
        "paddd %%mm3, %%mm6                        \n\t" /* A3+B3                a3+b3 */\
917
        "psubd %%mm3, %%mm1                        \n\t" /* a3-B3                a3-b3 */\
918
        "psrad $" #shift ", %%mm6                \n\t"\
919
        "psrad $" #shift ", %%mm1                \n\t"\
920
        "packssdw %%mm2, %%mm2                        \n\t" /* A2+B2        a2+b2 */\
921
        "packssdw %%mm6, %%mm6                        \n\t" /* A3+B3        a3+b3 */\
922
        "movd %%mm2, 32+" #dst "                \n\t"\
923
        "packssdw %%mm1, %%mm1                        \n\t" /* A3-B3        a3-b3 */\
924
        "packssdw %%mm5, %%mm5                        \n\t" /* A2-B2        a2-b2 */\
925
        "movd %%mm6, 48+" #dst "                \n\t"\
926
        "movd %%mm1, 64+" #dst "                \n\t"\
927
        "movd %%mm5, 80+" #dst "                \n\t"        
928

    
929

    
930
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
931
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
932
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
933
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
934
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
935
        "jmp 9f                                        \n\t"
936

    
937
        "#.balign 16                                \n\t"\
938
        "2:                                        \n\t"
939
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
940

    
941
#undef IDCT
942
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
943
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
944
        "movq " #src1 ", %%mm2                        \n\t" /* R3        R1        r3        r1 */\
945
        "movq " #src5 ", %%mm3                        \n\t" /* R7        R5        r7        r5 */\
946
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
947
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
948
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
949
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
950
        #rounder ", %%mm4                        \n\t"\
951
        "movq %%mm4, %%mm6                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
952
        "movq 48(%2), %%mm7                        \n\t" /* C3        C1        C3        C1 */\
953
        #rounder ", %%mm0                        \n\t"\
954
        "pmaddwd %%mm2, %%mm7                        \n\t" /* C3R3+C1R1        C3r3+C1r1 */\
955
        "movq %%mm0, %%mm5                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
956
        "movq 56(%2), %%mm1                        \n\t" /* C7        C5        C7        C5 */\
957
        "pmaddwd %%mm3, %%mm1                        \n\t" /* C7R7+C5R5        C7r7+C5r5 */\
958
        "pmaddwd 64(%2), %%mm2                        \n\t" /* -C7R3+C3R1        -C7r3+C3r1 */\
959
        "paddd %%mm1, %%mm7                        \n\t" /* B0                b0 */\
960
        "movq 72(%2), %%mm1                        \n\t" /* -C5        -C1        -C5        -C1 */\
961
        "pmaddwd %%mm3, %%mm1                        \n\t" /* -C5R7-C1R5        -C5r7-C1r5 */\
962
        "paddd %%mm4, %%mm7                        \n\t" /* A0+B0                a0+b0 */\
963
        "paddd %%mm4, %%mm4                        \n\t" /* 2A0                2a0 */\
964
        "psubd %%mm7, %%mm4                        \n\t" /* A0-B0                a0-b0 */\
965
        "paddd %%mm2, %%mm1                        \n\t" /* B1                b1 */\
966
        "psrad $" #shift ", %%mm7                \n\t"\
967
        "psrad $" #shift ", %%mm4                \n\t"\
968
        "movq %%mm0, %%mm2                        \n\t" /* A1                a1 */\
969
        "paddd %%mm1, %%mm0                        \n\t" /* A1+B1                a1+b1 */\
970
        "psubd %%mm1, %%mm2                        \n\t" /* A1-B1                a1-b1 */\
971
        "psrad $" #shift ", %%mm0                \n\t"\
972
        "psrad $" #shift ", %%mm2                \n\t"\
973
        "packssdw %%mm7, %%mm7                        \n\t" /* A0+B0        a0+b0 */\
974
        "movd %%mm7, " #dst "                        \n\t"\
975
        "packssdw %%mm0, %%mm0                        \n\t" /* A1+B1        a1+b1 */\
976
        "movd %%mm0, 16+" #dst "                \n\t"\
977
        "packssdw %%mm2, %%mm2                        \n\t" /* A1-B1        a1-b1 */\
978
        "movd %%mm2, 96+" #dst "                \n\t"\
979
        "packssdw %%mm4, %%mm4                        \n\t" /* A0-B0        a0-b0 */\
980
        "movd %%mm4, 112+" #dst "                \n\t"\
981
        "movq " #src1 ", %%mm0                        \n\t" /* R3        R1        r3        r1 */\
982
        "movq 80(%2), %%mm4                        \n\t" /* -C1        C5        -C1         C5 */\
983
        "pmaddwd %%mm0, %%mm4                        \n\t" /* -C1R3+C5R1        -C1r3+C5r1 */\
984
        "movq 88(%2), %%mm7                        \n\t" /* C3        C7        C3         C7 */\
985
        "pmaddwd 96(%2), %%mm0                        \n\t" /* -C5R3+C7R1        -C5r3+C7r1 */\
986
        "pmaddwd %%mm3, %%mm7                        \n\t" /* C3R7+C7R5        C3r7+C7r5 */\
987
        "movq %%mm5, %%mm2                        \n\t" /* A2                a2 */\
988
        "pmaddwd 104(%2), %%mm3                        \n\t" /* -C1R7+C3R5        -C1r7+C3r5 */\
989
        "paddd %%mm7, %%mm4                        \n\t" /* B2                b2 */\
990
        "paddd %%mm4, %%mm2                        \n\t" /* A2+B2                a2+b2 */\
991
        "psubd %%mm4, %%mm5                        \n\t" /* a2-B2                a2-b2 */\
992
        "psrad $" #shift ", %%mm2                \n\t"\
993
        "psrad $" #shift ", %%mm5                \n\t"\
994
        "movq %%mm6, %%mm4                        \n\t" /* A3                a3 */\
995
        "paddd %%mm0, %%mm3                        \n\t" /* B3                b3 */\
996
        "paddd %%mm3, %%mm6                        \n\t" /* A3+B3                a3+b3 */\
997
        "psubd %%mm3, %%mm4                        \n\t" /* a3-B3                a3-b3 */\
998
        "psrad $" #shift ", %%mm6                \n\t"\
999
        "psrad $" #shift ", %%mm4                \n\t"\
1000
        "packssdw %%mm2, %%mm2                        \n\t" /* A2+B2        a2+b2 */\
1001
        "packssdw %%mm6, %%mm6                        \n\t" /* A3+B3        a3+b3 */\
1002
        "movd %%mm2, 32+" #dst "                \n\t"\
1003
        "packssdw %%mm4, %%mm4                        \n\t" /* A3-B3        a3-b3 */\
1004
        "packssdw %%mm5, %%mm5                        \n\t" /* A2-B2        a2-b2 */\
1005
        "movd %%mm6, 48+" #dst "                \n\t"\
1006
        "movd %%mm4, 64+" #dst "                \n\t"\
1007
        "movd %%mm5, 80+" #dst "                \n\t"
1008

    
1009
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1010
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1011
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1012
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1013
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1014
        "jmp 9f                                        \n\t"
1015

    
1016
        "#.balign 16                                \n\t"\
1017
        "3:                                        \n\t"
1018
#undef IDCT
1019
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1020
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
1021
        "movq " #src1 ", %%mm2                        \n\t" /* R3        R1        r3        r1 */\
1022
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
1023
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
1024
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
1025
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
1026
        #rounder ", %%mm4                        \n\t"\
1027
        "movq %%mm4, %%mm6                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
1028
        "movq 48(%2), %%mm7                        \n\t" /* C3        C1        C3        C1 */\
1029
        #rounder ", %%mm0                        \n\t"\
1030
        "pmaddwd %%mm2, %%mm7                        \n\t" /* C3R3+C1R1        C3r3+C1r1 */\
1031
        "movq %%mm0, %%mm5                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
1032
        "movq 64(%2), %%mm3                        \n\t"\
1033
        "pmaddwd %%mm2, %%mm3                        \n\t" /* -C7R3+C3R1        -C7r3+C3r1 */\
1034
        "paddd %%mm4, %%mm7                        \n\t" /* A0+B0                a0+b0 */\
1035
        "paddd %%mm4, %%mm4                        \n\t" /* 2A0                2a0 */\
1036
        "psubd %%mm7, %%mm4                        \n\t" /* A0-B0                a0-b0 */\
1037
        "psrad $" #shift ", %%mm7                \n\t"\
1038
        "psrad $" #shift ", %%mm4                \n\t"\
1039
        "movq %%mm0, %%mm1                        \n\t" /* A1                a1 */\
1040
        "paddd %%mm3, %%mm0                        \n\t" /* A1+B1                a1+b1 */\
1041
        "psubd %%mm3, %%mm1                        \n\t" /* A1-B1                a1-b1 */\
1042
        "psrad $" #shift ", %%mm0                \n\t"\
1043
        "psrad $" #shift ", %%mm1                \n\t"\
1044
        "packssdw %%mm7, %%mm7                        \n\t" /* A0+B0        a0+b0 */\
1045
        "movd %%mm7, " #dst "                        \n\t"\
1046
        "packssdw %%mm0, %%mm0                        \n\t" /* A1+B1        a1+b1 */\
1047
        "movd %%mm0, 16+" #dst "                \n\t"\
1048
        "packssdw %%mm1, %%mm1                        \n\t" /* A1-B1        a1-b1 */\
1049
        "movd %%mm1, 96+" #dst "                \n\t"\
1050
        "packssdw %%mm4, %%mm4                        \n\t" /* A0-B0        a0-b0 */\
1051
        "movd %%mm4, 112+" #dst "                \n\t"\
1052
        "movq 80(%2), %%mm4                        \n\t" /* -C1        C5        -C1         C5 */\
1053
        "pmaddwd %%mm2, %%mm4                        \n\t" /* -C1R3+C5R1        -C1r3+C5r1 */\
1054
        "pmaddwd 96(%2), %%mm2                        \n\t" /* -C5R3+C7R1        -C5r3+C7r1 */\
1055
        "movq %%mm5, %%mm1                        \n\t" /* A2                a2 */\
1056
        "paddd %%mm4, %%mm1                        \n\t" /* A2+B2                a2+b2 */\
1057
        "psubd %%mm4, %%mm5                        \n\t" /* a2-B2                a2-b2 */\
1058
        "psrad $" #shift ", %%mm1                \n\t"\
1059
        "psrad $" #shift ", %%mm5                \n\t"\
1060
        "movq %%mm6, %%mm4                        \n\t" /* A3                a3 */\
1061
        "paddd %%mm2, %%mm6                        \n\t" /* A3+B3                a3+b3 */\
1062
        "psubd %%mm2, %%mm4                        \n\t" /* a3-B3                a3-b3 */\
1063
        "psrad $" #shift ", %%mm6                \n\t"\
1064
        "psrad $" #shift ", %%mm4                \n\t"\
1065
        "packssdw %%mm1, %%mm1                        \n\t" /* A2+B2        a2+b2 */\
1066
        "packssdw %%mm6, %%mm6                        \n\t" /* A3+B3        a3+b3 */\
1067
        "movd %%mm1, 32+" #dst "                \n\t"\
1068
        "packssdw %%mm4, %%mm4                        \n\t" /* A3-B3        a3-b3 */\
1069
        "packssdw %%mm5, %%mm5                        \n\t" /* A2-B2        a2-b2 */\
1070
        "movd %%mm6, 48+" #dst "                \n\t"\
1071
        "movd %%mm4, 64+" #dst "                \n\t"\
1072
        "movd %%mm5, 80+" #dst "                \n\t"
1073

    
1074

    
1075
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1076
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1077
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1078
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1079
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1080
        "jmp 9f                                        \n\t"
1081

    
1082
        "#.balign 16                                \n\t"\
1083
        "5:                                        \n\t"
1084
#undef IDCT
1085
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1086
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
1087
        "movq " #src4 ", %%mm1                        \n\t" /* R6        R2        r6        r2 */\
1088
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
1089
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
1090
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
1091
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
1092
        "movq 32(%2), %%mm5                        \n\t" /* C6        C2        C6        C2 */\
1093
        "pmaddwd %%mm1, %%mm5                        \n\t" /* C6R6+C2R2        C6r6+C2r2 */\
1094
        "movq 40(%2), %%mm6                        \n\t" /* -C2        C6        -C2        C6 */\
1095
        "pmaddwd %%mm6, %%mm1                        \n\t" /* -C2R6+C6R2        -C2r6+C6r2 */\
1096
        #rounder ", %%mm4                        \n\t"\
1097
        "movq %%mm4, %%mm6                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
1098
        "paddd %%mm5, %%mm4                        \n\t" /* A0                a0 */\
1099
        #rounder ", %%mm0                        \n\t"\
1100
        "psubd %%mm5, %%mm6                        \n\t" /* A3                a3 */\
1101
        "movq %%mm0, %%mm5                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
1102
        "paddd %%mm1, %%mm0                        \n\t" /* A1                a1 */\
1103
        "psubd %%mm1, %%mm5                        \n\t" /* A2                a2 */\
1104
        "movq 8+" #src0 ", %%mm2                \n\t" /* R4        R0        r4        r0 */\
1105
        "movq 8+" #src4 ", %%mm3                \n\t" /* R6        R2        r6        r2 */\
1106
        "movq 16(%2), %%mm1                        \n\t" /* C4        C4        C4        C4 */\
1107
        "pmaddwd %%mm2, %%mm1                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
1108
        "movq 24(%2), %%mm7                        \n\t" /* -C4        C4        -C4        C4 */\
1109
        "pmaddwd %%mm7, %%mm2                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
1110
        "movq 32(%2), %%mm7                        \n\t" /* C6        C2        C6        C2 */\
1111
        "pmaddwd %%mm3, %%mm7                        \n\t" /* C6R6+C2R2        C6r6+C2r2 */\
1112
        "pmaddwd 40(%2), %%mm3                        \n\t" /* -C2R6+C6R2        -C2r6+C6r2 */\
1113
        #rounder ", %%mm1                        \n\t"\
1114
        "paddd %%mm1, %%mm7                        \n\t" /* A0                a0 */\
1115
        "paddd %%mm1, %%mm1                        \n\t" /* 2C0                2c0 */\
1116
        #rounder ", %%mm2                        \n\t"\
1117
        "psubd %%mm7, %%mm1                        \n\t" /* A3                a3 */\
1118
        "paddd %%mm2, %%mm3                        \n\t" /* A1                a1 */\
1119
        "paddd %%mm2, %%mm2                        \n\t" /* 2C1                2c1 */\
1120
        "psubd %%mm3, %%mm2                        \n\t" /* A2                a2 */\
1121
        "psrad $" #shift ", %%mm4                \n\t"\
1122
        "psrad $" #shift ", %%mm7                \n\t"\
1123
        "psrad $" #shift ", %%mm3                \n\t"\
1124
        "packssdw %%mm7, %%mm4                        \n\t" /* A0        a0 */\
1125
        "movq %%mm4, " #dst "                        \n\t"\
1126
        "psrad $" #shift ", %%mm0                \n\t"\
1127
        "packssdw %%mm3, %%mm0                        \n\t" /* A1        a1 */\
1128
        "movq %%mm0, 16+" #dst "                \n\t"\
1129
        "movq %%mm0, 96+" #dst "                \n\t"\
1130
        "movq %%mm4, 112+" #dst "                \n\t"\
1131
        "psrad $" #shift ", %%mm5                \n\t"\
1132
        "psrad $" #shift ", %%mm6                \n\t"\
1133
        "psrad $" #shift ", %%mm2                \n\t"\
1134
        "packssdw %%mm2, %%mm5                        \n\t" /* A2-B2        a2-b2 */\
1135
        "movq %%mm5, 32+" #dst "                \n\t"\
1136
        "psrad $" #shift ", %%mm1                \n\t"\
1137
        "packssdw %%mm1, %%mm6                        \n\t" /* A3+B3        a3+b3 */\
1138
        "movq %%mm6, 48+" #dst "                \n\t"\
1139
        "movq %%mm6, 64+" #dst "                \n\t"\
1140
        "movq %%mm5, 80+" #dst "                \n\t"        
1141
        
1142

    
1143
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1144
IDCT(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1145
//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1146
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1147
//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1148
        "jmp 9f                                        \n\t"
1149

    
1150

    
1151
        "#.balign 16                                \n\t"\
1152
        "1:                                        \n\t"
1153
#undef IDCT
1154
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1155
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
1156
        "movq " #src4 ", %%mm1                        \n\t" /* R6        R2        r6        r2 */\
1157
        "movq " #src1 ", %%mm2                        \n\t" /* R3        R1        r3        r1 */\
1158
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
1159
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
1160
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
1161
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
1162
        "movq 32(%2), %%mm5                        \n\t" /* C6        C2        C6        C2 */\
1163
        "pmaddwd %%mm1, %%mm5                        \n\t" /* C6R6+C2R2        C6r6+C2r2 */\
1164
        "movq 40(%2), %%mm6                        \n\t" /* -C2        C6        -C2        C6 */\
1165
        "pmaddwd %%mm6, %%mm1                        \n\t" /* -C2R6+C6R2        -C2r6+C6r2 */\
1166
        #rounder ", %%mm4                        \n\t"\
1167
        "movq %%mm4, %%mm6                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
1168
        "movq 48(%2), %%mm7                        \n\t" /* C3        C1        C3        C1 */\
1169
        #rounder ", %%mm0                        \n\t"\
1170
        "pmaddwd %%mm2, %%mm7                        \n\t" /* C3R3+C1R1        C3r3+C1r1 */\
1171
        "paddd %%mm5, %%mm4                        \n\t" /* A0                a0 */\
1172
        "psubd %%mm5, %%mm6                        \n\t" /* A3                a3 */\
1173
        "movq %%mm0, %%mm5                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
1174
        "paddd %%mm1, %%mm0                        \n\t" /* A1                a1 */\
1175
        "psubd %%mm1, %%mm5                        \n\t" /* A2                a2 */\
1176
        "movq 64(%2), %%mm1                        \n\t"\
1177
        "pmaddwd %%mm2, %%mm1                        \n\t" /* -C7R3+C3R1        -C7r3+C3r1 */\
1178
        "paddd %%mm4, %%mm7                        \n\t" /* A0+B0                a0+b0 */\
1179
        "paddd %%mm4, %%mm4                        \n\t" /* 2A0                2a0 */\
1180
        "psubd %%mm7, %%mm4                        \n\t" /* A0-B0                a0-b0 */\
1181
        "psrad $" #shift ", %%mm7                \n\t"\
1182
        "psrad $" #shift ", %%mm4                \n\t"\
1183
        "movq %%mm0, %%mm3                        \n\t" /* A1                a1 */\
1184
        "paddd %%mm1, %%mm0                        \n\t" /* A1+B1                a1+b1 */\
1185
        "psubd %%mm1, %%mm3                        \n\t" /* A1-B1                a1-b1 */\
1186
        "psrad $" #shift ", %%mm0                \n\t"\
1187
        "psrad $" #shift ", %%mm3                \n\t"\
1188
        "packssdw %%mm7, %%mm7                        \n\t" /* A0+B0        a0+b0 */\
1189
        "movd %%mm7, " #dst "                        \n\t"\
1190
        "packssdw %%mm0, %%mm0                        \n\t" /* A1+B1        a1+b1 */\
1191
        "movd %%mm0, 16+" #dst "                \n\t"\
1192
        "packssdw %%mm3, %%mm3                        \n\t" /* A1-B1        a1-b1 */\
1193
        "movd %%mm3, 96+" #dst "                \n\t"\
1194
        "packssdw %%mm4, %%mm4                        \n\t" /* A0-B0        a0-b0 */\
1195
        "movd %%mm4, 112+" #dst "                \n\t"\
1196
        "movq 80(%2), %%mm4                        \n\t" /* -C1        C5        -C1         C5 */\
1197
        "pmaddwd %%mm2, %%mm4                        \n\t" /* -C1R3+C5R1        -C1r3+C5r1 */\
1198
        "pmaddwd 96(%2), %%mm2                        \n\t" /* -C5R3+C7R1        -C5r3+C7r1 */\
1199
        "movq %%mm5, %%mm3                        \n\t" /* A2                a2 */\
1200
        "paddd %%mm4, %%mm3                        \n\t" /* A2+B2                a2+b2 */\
1201
        "psubd %%mm4, %%mm5                        \n\t" /* a2-B2                a2-b2 */\
1202
        "psrad $" #shift ", %%mm3                \n\t"\
1203
        "psrad $" #shift ", %%mm5                \n\t"\
1204
        "movq %%mm6, %%mm4                        \n\t" /* A3                a3 */\
1205
        "paddd %%mm2, %%mm6                        \n\t" /* A3+B3                a3+b3 */\
1206
        "psubd %%mm2, %%mm4                        \n\t" /* a3-B3                a3-b3 */\
1207
        "psrad $" #shift ", %%mm6                \n\t"\
1208
        "packssdw %%mm3, %%mm3                        \n\t" /* A2+B2        a2+b2 */\
1209
        "movd %%mm3, 32+" #dst "                \n\t"\
1210
        "psrad $" #shift ", %%mm4                \n\t"\
1211
        "packssdw %%mm6, %%mm6                        \n\t" /* A3+B3        a3+b3 */\
1212
        "movd %%mm6, 48+" #dst "                \n\t"\
1213
        "packssdw %%mm4, %%mm4                        \n\t" /* A3-B3        a3-b3 */\
1214
        "packssdw %%mm5, %%mm5                        \n\t" /* A2-B2        a2-b2 */\
1215
        "movd %%mm4, 64+" #dst "                \n\t"\
1216
        "movd %%mm5, 80+" #dst "                \n\t"
1217
        
1218

    
1219
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1220
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1221
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1222
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1223
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1224
        "jmp 9f                                        \n\t"
1225

    
1226

    
1227
        "#.balign 16                                \n\t"
1228
        "7:                                        \n\t"
1229
#undef IDCT
1230
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1231
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
1232
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
1233
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
1234
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
1235
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
1236
        #rounder ", %%mm4                        \n\t"\
1237
        #rounder ", %%mm0                        \n\t"\
1238
        "psrad $" #shift ", %%mm4                \n\t"\
1239
        "psrad $" #shift ", %%mm0                \n\t"\
1240
        "movq 8+" #src0 ", %%mm2                \n\t" /* R4        R0        r4        r0 */\
1241
        "movq 16(%2), %%mm1                        \n\t" /* C4        C4        C4        C4 */\
1242
        "pmaddwd %%mm2, %%mm1                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
1243
        "movq 24(%2), %%mm7                        \n\t" /* -C4        C4        -C4        C4 */\
1244
        "pmaddwd %%mm7, %%mm2                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
1245
        "movq 32(%2), %%mm7                        \n\t" /* C6        C2        C6        C2 */\
1246
        #rounder ", %%mm1                        \n\t"\
1247
        #rounder ", %%mm2                        \n\t"\
1248
        "psrad $" #shift ", %%mm1                \n\t"\
1249
        "packssdw %%mm1, %%mm4                        \n\t" /* A0        a0 */\
1250
        "movq %%mm4, " #dst "                        \n\t"\
1251
        "psrad $" #shift ", %%mm2                \n\t"\
1252
        "packssdw %%mm2, %%mm0                        \n\t" /* A1        a1 */\
1253
        "movq %%mm0, 16+" #dst "                \n\t"\
1254
        "movq %%mm0, 96+" #dst "                \n\t"\
1255
        "movq %%mm4, 112+" #dst "                \n\t"\
1256
        "movq %%mm0, 32+" #dst "                \n\t"\
1257
        "movq %%mm4, 48+" #dst "                \n\t"\
1258
        "movq %%mm4, 64+" #dst "                \n\t"\
1259
        "movq %%mm0, 80+" #dst "                \n\t"        
1260

    
1261
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1262
IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1263
//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1264
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1265
//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1266

    
1267

    
1268
#endif
1269

    
1270
/*
1271
Input
1272
 00 40 04 44 20 60 24 64
1273
 10 30 14 34 50 70 54 74
1274
 01 41 03 43 21 61 23 63
1275
 11 31 13 33 51 71 53 73
1276
 02 42 06 46 22 62 26 66
1277
 12 32 16 36 52 72 56 76
1278
 05 45 07 47 25 65 27 67
1279
 15 35 17 37 55 75 57 77
1280
  
1281
Temp
1282
 00 04 10 14 20 24 30 34
1283
 40 44 50 54 60 64 70 74
1284
 01 03 11 13 21 23 31 33
1285
 41 43 51 53 61 63 71 73
1286
 02 06 12 16 22 26 32 36
1287
 42 46 52 56 62 66 72 76
1288
 05 07 15 17 25 27 35 37
1289
 45 47 55 57 65 67 75 77
1290
*/
1291

    
1292
"9: \n\t"
1293
                :: "r" (block), "r" (temp), "r" (coeffs)
1294
                : "%eax"
1295
        );
1296
}
1297

    
1298
/**
 * Public entry point for the MMX simple IDCT.
 *
 * Thin wrapper: forwards straight to idct(), whose final stores go back
 * into @p block, so the transform is done in place.
 *
 * @param block coefficient block, overwritten with the transformed values
 *              (presumably 8x8 = 64 int16_t entries -- confirm against idct())
 */
void ff_simple_idct_mmx(int16_t *block)
{
    idct(block);
}
1302

    
1303
//FIXME merge add/put into the idct
1304

    
1305
/**
 * Inverse-transform a block, then hand it to put_pixels_clamped_mmx().
 *
 * Step 1 runs idct() on @p block in place; step 2 passes the transformed
 * block on together with the destination pointer and @p line_size.
 *
 * @param dest      destination buffer, forwarded to put_pixels_clamped_mmx()
 * @param line_size forwarded unchanged
 *                  (presumably the byte stride between rows of dest -- confirm)
 * @param block     coefficient block; overwritten by idct() before it is read
 *                  by put_pixels_clamped_mmx()
 */
void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    idct(block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
1310
/**
 * Inverse-transform a block, then hand it to add_pixels_clamped_mmx().
 *
 * Step 1 runs idct() on @p block in place; step 2 passes the transformed
 * block on together with the destination pointer and @p line_size.
 *
 * @param dest      destination buffer, forwarded to add_pixels_clamped_mmx()
 *                  (presumably the existing pixels the block is added to -- confirm)
 * @param line_size forwarded unchanged
 *                  (presumably the byte stride between rows of dest -- confirm)
 * @param block     coefficient block; overwritten by idct() before it is read
 *                  by add_pixels_clamped_mmx()
 */
void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    idct(block);
    add_pixels_clamped_mmx(block, dest, line_size);
}