Statistics
| Branch: | Revision:

ffmpeg / libavcodec / x86 / simple_idct_mmx.c @ 2912e87a

History | View | Annotate | Download (71.1 KB)

1
/*
2
 * Simple IDCT MMX
3
 *
4
 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * This file is part of Libav.
7
 *
8
 * Libav is free software; you can redistribute it and/or
9
 * modify it under the terms of the GNU Lesser General Public
10
 * License as published by the Free Software Foundation; either
11
 * version 2.1 of the License, or (at your option) any later version.
12
 *
13
 * Libav is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
 * Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public
19
 * License along with Libav; if not, write to the Free Software
20
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
 */
22
#include "libavcodec/dsputil.h"
23
#include "libavcodec/simple_idct.h"
24
#include "dsputil_mmx.h"
25

    
26
/*
 * Fixed-point cosine constants:
 *
 *     Ci = cos(i*M_PI/16) * sqrt(2) * (1<<14)
 *
 * Exact values before conversion to integers:
 *
 *     i = 0: 23170.475006
 *     i = 1: 22725.260826
 *     i = 2: 21406.727617
 *     i = 3: 19265.545870
 *     i = 4: 16384.000000
 *     i = 5: 12872.826198
 *     i = 6:  8866.956905
 *     i = 7:  4520.335430
 *
 * Every constant is rounded to nearest (+ 0.5) except C4, which is taken
 * one below its exact value (- 0.5). The exact 16384 was kept in this file
 * only as a disabled alternative.
 */
#define C0 23170
#define C1 22725
#define C2 21407
#define C3 19266
#define C4 16383 /* exact value is 16384; deliberately 16384 - 1 */
#define C5 12873
#define C6 8867
#define C7 4520

/* Right-shift amounts applied to the results of the row and column passes. */
#define ROW_SHIFT 11
/* Original annotation on this value was just "6"; meaning not documented here
 * (presumably COL_SHIFT minus the 14 fractional bits of Ci -- TODO confirm). */
#define COL_SHIFT 20
51

    
52
/*
 * 64-bit mask with the upper 16-bit word of each 32-bit half set.
 * The DC_COND_* asm macros AND it with the first source quadword before
 * OR-ing in the other three, so the lower word of each 32-bit half of that
 * first quadword (annotated r0 / R0 in the asm comments, assuming they list
 * words from most to least significant) does not take part in the
 * "everything else is zero" test.
 * NOTE(review): the leading 8 is presumably the alignment in bytes -- confirm
 * against the DECLARE_ASM_CONST definition.
 */
DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
53
/*
 * 0x40000 in the low 32 bits, zero in the high 32 bits.
 * DC_COND_IDCT adds it (paddd) on its DC-only path, after "pslld $16" and
 * before "psrad $13"; since 0x40000 is a multiple of 1<<13 this adds exactly
 * 0x40000 >> 13 = 32 to the low 32-bit lane only. In the disabled
 * DC_COND_ROW_IDCT variant the same paddd is commented out in the asm text.
 */
DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
54

    
55
/*
 * Constant table for the IDCT: 14 groups of four int16_t, i.e. 8 bytes per
 * group. The byte offset of each group is noted next to it; the asm in
 * idct() addresses the table with displacements such as 16(%2), 24(%2), ...
 * (presumably operand %2 is this array -- the operand list is not adjacent
 * to the table, verify there).
 *
 * Groups 0 and 1 are rounding terms for the row pass, the remaining groups
 * are coefficient pairs repeated twice so that one group multiplies two
 * coefficient pairs at once.
 */
DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
    /*   0 */ 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,

    /*   8: same rounder with an additional 1 in the second word;
     *      the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :) */
              1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,

    /*  16 */  C4,  C4,  C4,  C4,
    /*  24 */  C4, -C4,  C4, -C4,

    /*  32 */  C2,  C6,  C2,  C6,
    /*  40 */  C6, -C2,  C6, -C2,

    /*  48 */  C1,  C3,  C1,  C3,
    /*  56 */  C5,  C7,  C5,  C7,

    /*  64 */  C3, -C7,  C3, -C7,
    /*  72 */ -C1, -C5, -C1, -C5,

    /*  80 */  C5, -C1,  C5, -C1,
    /*  88 */  C7,  C3,  C7,  C3,

    /*  96 */  C7, -C5,  C7, -C5,
    /* 104 */  C3, -C1,  C3, -C1
};
82

    
83
#if 0
84
static void unused_var_killer(void)
85
{
86
        int a= wm1010 + d40000;
87
        temp[0]=a;
88
}
89

90
static void inline idctCol (int16_t * col, int16_t *input)
91
{
92
#undef C0
93
#undef C1
94
#undef C2
95
#undef C3
96
#undef C4
97
#undef C5
98
#undef C6
99
#undef C7
100
        int a0, a1, a2, a3, b0, b1, b2, b3;
101
        const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
102
        const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
103
        const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
104
        const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
105
        const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
106
        const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
107
        const int C6 = 8867;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
108
        const int C7 = 4520;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
109
/*
110
        if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
111
                col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
112
                        col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
113
                return;
114
        }*/
115

116
col[8*0] = input[8*0 + 0];
117
col[8*1] = input[8*2 + 0];
118
col[8*2] = input[8*0 + 1];
119
col[8*3] = input[8*2 + 1];
120
col[8*4] = input[8*4 + 0];
121
col[8*5] = input[8*6 + 0];
122
col[8*6] = input[8*4 + 1];
123
col[8*7] = input[8*6 + 1];
124

125
        a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
126
        a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
127
        a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
128
        a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
129

130
        b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
131
        b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
132
        b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
133
        b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
134

135
        col[8*0] = (a0 + b0) >> COL_SHIFT;
136
        col[8*1] = (a1 + b1) >> COL_SHIFT;
137
        col[8*2] = (a2 + b2) >> COL_SHIFT;
138
        col[8*3] = (a3 + b3) >> COL_SHIFT;
139
        col[8*4] = (a3 - b3) >> COL_SHIFT;
140
        col[8*5] = (a2 - b2) >> COL_SHIFT;
141
        col[8*6] = (a1 - b1) >> COL_SHIFT;
142
        col[8*7] = (a0 - b0) >> COL_SHIFT;
143
}
144

145
static void inline idctRow (int16_t * output, int16_t * input)
146
{
147
        int16_t row[8];
148

149
        int a0, a1, a2, a3, b0, b1, b2, b3;
150
        const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
151
        const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
152
        const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
153
        const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
154
        const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
155
        const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
156
        const int C6 = 8867;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
157
        const int C7 = 4520;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
158

159
row[0] = input[0];
160
row[2] = input[1];
161
row[4] = input[4];
162
row[6] = input[5];
163
row[1] = input[8];
164
row[3] = input[9];
165
row[5] = input[12];
166
row[7] = input[13];
167

168
        if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
169
                row[0] = row[1] = row[2] = row[3] = row[4] =
170
                        row[5] = row[6] = row[7] = row[0]<<3;
171
        output[0]  = row[0];
172
        output[2]  = row[1];
173
        output[4]  = row[2];
174
        output[6]  = row[3];
175
        output[8]  = row[4];
176
        output[10] = row[5];
177
        output[12] = row[6];
178
        output[14] = row[7];
179
                return;
180
        }
181

182
        a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
183
        a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
184
        a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
185
        a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
186

187
        b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
188
        b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
189
        b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
190
        b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
191

192
        row[0] = (a0 + b0) >> ROW_SHIFT;
193
        row[1] = (a1 + b1) >> ROW_SHIFT;
194
        row[2] = (a2 + b2) >> ROW_SHIFT;
195
        row[3] = (a3 + b3) >> ROW_SHIFT;
196
        row[4] = (a3 - b3) >> ROW_SHIFT;
197
        row[5] = (a2 - b2) >> ROW_SHIFT;
198
        row[6] = (a1 - b1) >> ROW_SHIFT;
199
        row[7] = (a0 - b0) >> ROW_SHIFT;
200

201
        output[0]  = row[0];
202
        output[2]  = row[1];
203
        output[4]  = row[2];
204
        output[6]  = row[3];
205
        output[8]  = row[4];
206
        output[10] = row[5];
207
        output[12] = row[6];
208
        output[14] = row[7];
209
}
210
#endif
211

    
212
static inline void idct(int16_t *block)
213
{
214
        DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
215
        int16_t * const temp= (int16_t*)align_tmp;
216

    
217
        __asm__ volatile(
218
#if 0 //Alternative, simpler variant
219

220
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
221
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
222
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
223
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
224
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
225
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
226
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
227
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
228
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
229
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
230
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
231
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
232
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
233
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
234
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
235
        #rounder ", %%mm4               \n\t"\
236
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
237
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
238
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
239
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
240
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
241
        #rounder ", %%mm0               \n\t"\
242
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
243
        "paddd %%mm0, %%mm0             \n\t" \
244
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
245
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
246
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
247
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
248
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
249
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
250
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
251
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
252
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
253
        "psrad $" #shift ", %%mm7       \n\t"\
254
        "psrad $" #shift ", %%mm4       \n\t"\
255
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
256
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
257
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
258
        "psrad $" #shift ", %%mm1       \n\t"\
259
        "psrad $" #shift ", %%mm2       \n\t"\
260
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
261
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
262
        "movq %%mm7, " #dst "           \n\t"\
263
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
264
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
265
        "movq %%mm2, 24+" #dst "        \n\t"\
266
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
267
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
268
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
269
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
270
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
271
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
272
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
273
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
274
        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
275
        "psrad $" #shift ", %%mm2       \n\t"\
276
        "psrad $" #shift ", %%mm0       \n\t"\
277
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
278
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
279
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
280
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
281
        "psrad $" #shift ", %%mm6       \n\t"\
282
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
283
        "movq %%mm2, 8+" #dst "         \n\t"\
284
        "psrad $" #shift ", %%mm4       \n\t"\
285
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
286
        "movq %%mm4, 16+" #dst "        \n\t"\
287

288
#define COL_IDCT(src0, src4, src1, src5, dst, shift) \
289
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
290
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
291
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
292
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
293
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
294
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
295
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
296
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
297
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
298
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
299
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
300
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
301
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
302
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
303
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
304
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
305
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
306
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
307
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
308
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
309
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
310
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
311
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
312
        "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
313
        "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
314
        "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
315
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
316
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
317
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
318
        "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
319
        "psrad $" #shift ", %%mm7       \n\t"\
320
        "psrad $" #shift ", %%mm4       \n\t"\
321
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
322
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
323
        "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
324
        "psrad $" #shift ", %%mm0       \n\t"\
325
        "psrad $" #shift ", %%mm2       \n\t"\
326
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
327
        "movd %%mm7, " #dst "           \n\t"\
328
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
329
        "movd %%mm0, 16+" #dst "        \n\t"\
330
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
331
        "movd %%mm2, 96+" #dst "        \n\t"\
332
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
333
        "movd %%mm4, 112+" #dst "       \n\t"\
334
        "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
335
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
336
        "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
337
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
338
        "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
339
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
340
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
341
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
342
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
343
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
344
        "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
345
        "psrad $" #shift ", %%mm2       \n\t"\
346
        "psrad $" #shift ", %%mm5       \n\t"\
347
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
348
        "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
349
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
350
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
351
        "psrad $" #shift ", %%mm6       \n\t"\
352
        "psrad $" #shift ", %%mm4       \n\t"\
353
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
354
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
355
        "movd %%mm2, 32+" #dst "        \n\t"\
356
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
357
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
358
        "movd %%mm6, 48+" #dst "        \n\t"\
359
        "movd %%mm4, 64+" #dst "        \n\t"\
360
        "movd %%mm5, 80+" #dst "        \n\t"\
361

362

363
#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
364
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
365
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
366
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
367
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
368
        "movq "MANGLE(wm1010)", %%mm4   \n\t"\
369
        "pand %%mm0, %%mm4              \n\t"\
370
        "por %%mm1, %%mm4               \n\t"\
371
        "por %%mm2, %%mm4               \n\t"\
372
        "por %%mm3, %%mm4               \n\t"\
373
        "packssdw %%mm4,%%mm4           \n\t"\
374
        "movd %%mm4, %%eax              \n\t"\
375
        "orl %%eax, %%eax               \n\t"\
376
        "jz 1f                          \n\t"\
377
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
378
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
379
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
380
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
381
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
382
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
383
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
384
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
385
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
386
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
387
        #rounder ", %%mm4               \n\t"\
388
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
389
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
390
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
391
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
392
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
393
        #rounder ", %%mm0               \n\t"\
394
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
395
        "paddd %%mm0, %%mm0             \n\t" \
396
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
397
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
398
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
399
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
400
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
401
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
402
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
403
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
404
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
405
        "psrad $" #shift ", %%mm7       \n\t"\
406
        "psrad $" #shift ", %%mm4       \n\t"\
407
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
408
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
409
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
410
        "psrad $" #shift ", %%mm1       \n\t"\
411
        "psrad $" #shift ", %%mm2       \n\t"\
412
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
413
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
414
        "movq %%mm7, " #dst "           \n\t"\
415
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
416
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
417
        "movq %%mm2, 24+" #dst "        \n\t"\
418
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
419
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
420
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
421
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
422
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
423
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
424
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
425
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
426
        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
427
        "psrad $" #shift ", %%mm2       \n\t"\
428
        "psrad $" #shift ", %%mm0       \n\t"\
429
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
430
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
431
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
432
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
433
        "psrad $" #shift ", %%mm6       \n\t"\
434
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
435
        "movq %%mm2, 8+" #dst "         \n\t"\
436
        "psrad $" #shift ", %%mm4       \n\t"\
437
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
438
        "movq %%mm4, 16+" #dst "        \n\t"\
439
        "jmp 2f                         \n\t"\
440
        "1:                             \n\t"\
441
        "pslld $16, %%mm0               \n\t"\
442
        "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
443
        "psrad $13, %%mm0               \n\t"\
444
        "packssdw %%mm0, %%mm0          \n\t"\
445
        "movq %%mm0, " #dst "           \n\t"\
446
        "movq %%mm0, 8+" #dst "         \n\t"\
447
        "movq %%mm0, 16+" #dst "        \n\t"\
448
        "movq %%mm0, 24+" #dst "        \n\t"\
449
        "2:                             \n\t"
450

451

452
//IDCT(      src0,   src4,   src1,   src5,    dst,    rounder, shift)
453
ROW_IDCT(    (%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
454
/*ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
455
ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
456
ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
457

458
DC_COND_ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
459
DC_COND_ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
460
DC_COND_ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
461

462

463
//IDCT(      src0,   src4,   src1,    src5,    dst, shift)
464
COL_IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
465
COL_IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
466
COL_IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
467
COL_IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
468

469
#else
470

    
471
#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
472
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
473
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
474
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
475
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
476
        "movq "MANGLE(wm1010)", %%mm4   \n\t"\
477
        "pand %%mm0, %%mm4              \n\t"\
478
        "por %%mm1, %%mm4               \n\t"\
479
        "por %%mm2, %%mm4               \n\t"\
480
        "por %%mm3, %%mm4               \n\t"\
481
        "packssdw %%mm4,%%mm4           \n\t"\
482
        "movd %%mm4, %%eax              \n\t"\
483
        "orl %%eax, %%eax               \n\t"\
484
        "jz 1f                          \n\t"\
485
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
486
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
487
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
488
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
489
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
490
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
491
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
492
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
493
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
494
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
495
        #rounder ", %%mm4               \n\t"\
496
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
497
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
498
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
499
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
500
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
501
        #rounder ", %%mm0               \n\t"\
502
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
503
        "paddd %%mm0, %%mm0             \n\t" \
504
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
505
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
506
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
507
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
508
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
509
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
510
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
511
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
512
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
513
        "psrad $" #shift ", %%mm7       \n\t"\
514
        "psrad $" #shift ", %%mm4       \n\t"\
515
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
516
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
517
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
518
        "psrad $" #shift ", %%mm1       \n\t"\
519
        "psrad $" #shift ", %%mm2       \n\t"\
520
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
521
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
522
        "movq %%mm7, " #dst "           \n\t"\
523
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
524
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
525
        "movq %%mm2, 24+" #dst "        \n\t"\
526
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
527
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
528
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
529
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
530
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
531
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
532
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
533
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
534
        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
535
        "psrad $" #shift ", %%mm2       \n\t"\
536
        "psrad $" #shift ", %%mm0       \n\t"\
537
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
538
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
539
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
540
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
541
        "psrad $" #shift ", %%mm6       \n\t"\
542
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
543
        "movq %%mm2, 8+" #dst "         \n\t"\
544
        "psrad $" #shift ", %%mm4       \n\t"\
545
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
546
        "movq %%mm4, 16+" #dst "        \n\t"\
547
        "jmp 2f                         \n\t"\
548
        "1:                             \n\t"\
549
        "pslld $16, %%mm0               \n\t"\
550
        "paddd "MANGLE(d40000)", %%mm0  \n\t"\
551
        "psrad $13, %%mm0               \n\t"\
552
        "packssdw %%mm0, %%mm0          \n\t"\
553
        "movq %%mm0, " #dst "           \n\t"\
554
        "movq %%mm0, 8+" #dst "         \n\t"\
555
        "movq %%mm0, 16+" #dst "        \n\t"\
556
        "movq %%mm0, 24+" #dst "        \n\t"\
557
        "2:                             \n\t"
558

    
559
/*
 * Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt)
 *
 * Expands to asm text for one transform pass over four quadword inputs.
 *
 * - Loads src0, src4, src1, src5 into mm0..mm3 and ORs them together.
 *   packssdw + movd fold the result into eax; when it is zero (all four
 *   inputs are zero) control jumps to label bt and nothing is stored.
 * - Otherwise the inputs are multiplied (pmaddwd) with the coefficient
 *   pairs read from 16(%2)..104(%2), giving the even-part sums A0..A3 and
 *   the odd-part sums B0..B3 named in the inline comments.
 * - rounder is pasted as "instruction source" (the callers pass
 *   paddd (%2)) and is applied to mm4 and mm0 before A0..A3 are formed.
 * - A2 is obtained without a spare register: mm1 += mm0 gives A1, then
 *   mm0 = 2*mm0 - mm1 gives A2.  A0-B0 is obtained the same way.
 * - Each A+B / A-B is shifted right arithmetically by shift, packed to
 *   16 bits with signed saturation and stored as four quadwords at
 *   dst, 8+dst, 16+dst and 24+dst.
 *
 * In the inline comments upper-case names refer to the high dword lane and
 * lower-case names to the low dword lane of each MMX register.
 * Clobbers eax and mm0..mm7.
 */
#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
560
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
561
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
562
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
563
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
564
        "movq %%mm0, %%mm4              \n\t"\
565
        "por %%mm1, %%mm4               \n\t"\
566
        "por %%mm2, %%mm4               \n\t"\
567
        "por %%mm3, %%mm4               \n\t"\
568
        "packssdw %%mm4,%%mm4           \n\t"\
569
        "movd %%mm4, %%eax              \n\t"\
570
        "orl %%eax, %%eax               \n\t"\
571
        "jz " #bt "                     \n\t"\
572
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
573
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
574
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
575
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
576
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
577
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
578
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
579
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
580
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
581
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
582
        #rounder ", %%mm4               \n\t"\
583
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
584
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
585
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
586
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
587
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
588
        #rounder ", %%mm0               \n\t"\
589
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
590
        "paddd %%mm0, %%mm0             \n\t" \
591
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
592
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
593
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
594
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
595
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
596
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
597
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
598
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
599
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
600
        "psrad $" #shift ", %%mm7       \n\t"\
601
        "psrad $" #shift ", %%mm4       \n\t"\
602
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
603
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
604
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
605
        "psrad $" #shift ", %%mm1       \n\t"\
606
        "psrad $" #shift ", %%mm2       \n\t"\
607
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
608
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
609
        "movq %%mm7, " #dst "           \n\t"\
610
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
611
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
612
        "movq %%mm2, 24+" #dst "        \n\t"\
613
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
614
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
615
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
616
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
617
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
618
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
619
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
620
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
621
        "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
622
        "psrad $" #shift ", %%mm2       \n\t"\
623
        "psrad $" #shift ", %%mm0       \n\t"\
624
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
625
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
626
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
627
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
628
        "psrad $" #shift ", %%mm6       \n\t"\
629
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
630
        "movq %%mm2, 8+" #dst "         \n\t"\
631
        "psrad $" #shift ", %%mm4       \n\t"\
632
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
633
        "movq %%mm4, 16+" #dst "        \n\t"\
634

    
635
/*
 * ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Expands to asm text for one unconditional transform pass over four
 * quadword inputs (no zero test, no branch).
 *
 * - Loads src0, src4, src1, src5 into mm0..mm3 and multiplies them
 *   (pmaddwd) with the coefficient pairs read from 16(%2)..104(%2), giving
 *   the even-part sums A0..A3 and odd-part sums B0..B3 named in the inline
 *   comments.
 * - rounder is pasted as "instruction source" in front of ", %%mm4" and
 *   ", %%mm0", so it is applied before A0..A3 are formed.
 * - A2 is obtained without a spare register: mm1 += mm0 gives A1, then
 *   mm0 = 2*mm0 - mm1 gives A2.  A0-B0 is obtained the same way.
 * - Each A+B / A-B is shifted right arithmetically by shift, packed to
 *   16 bits with signed saturation and stored as four quadwords at
 *   dst, 8+dst, 16+dst and 24+dst.
 *
 * In the inline comments upper-case names refer to the high dword lane and
 * lower-case names to the low dword lane of each MMX register.
 * Clobbers mm0..mm7.
 */
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
636
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
637
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
638
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
639
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
640
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
641
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
642
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
643
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
644
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
645
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
646
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
647
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
648
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
649
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
650
        #rounder ", %%mm4               \n\t"\
651
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
652
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
653
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
654
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
655
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
656
        #rounder ", %%mm0               \n\t"\
657
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
658
        "paddd %%mm0, %%mm0             \n\t" \
659
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
660
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
661
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
662
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
663
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
664
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
665
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
666
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
667
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
668
        "psrad $" #shift ", %%mm7       \n\t"\
669
        "psrad $" #shift ", %%mm4       \n\t"\
670
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
671
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
672
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
673
        "psrad $" #shift ", %%mm1       \n\t"\
674
        "psrad $" #shift ", %%mm2       \n\t"\
675
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
676
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
677
        "movq %%mm7, " #dst "           \n\t"\
678
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
679
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
680
        "movq %%mm2, 24+" #dst "        \n\t"\
681
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
682
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
683
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
684
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
685
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
686
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
687
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
688
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
689
        "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
690
        "psrad $" #shift ", %%mm2       \n\t"\
691
        "psrad $" #shift ", %%mm0       \n\t"\
692
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
693
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
694
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
695
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
696
        "psrad $" #shift ", %%mm6       \n\t"\
697
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
698
        "movq %%mm2, 8+" #dst "         \n\t"\
699
        "psrad $" #shift ", %%mm4       \n\t"\
700
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
701
        "movq %%mm4, 16+" #dst "        \n\t"\
702

    
703
/*
 * First pass.  Each call reads four quadwords at consecutive 8-byte offsets
 * from %0 and targets the same offset in %1, with shift 11.
 * A Z_COND_IDCT call whose four input quadwords are all zero stores nothing
 * and jumps to the label given as its last argument:
 *   inputs at 32(%0)..56(%0)  all zero -> 4f
 *   inputs at 64(%0)..88(%0)  all zero -> 2f
 *   inputs at 96(%0)..120(%0) all zero -> 1f
 * If none of them branches, execution falls through to the second pass
 * that uses all four intermediate groups.
 */
//x_COND_IDCT(  src0,   src4,   src1,   src5,    dst,   rounder, shift[, bt])
704
DC_COND_IDCT(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
705
Z_COND_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
706
Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
707
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
708

    
709
#undef IDCT
710
/*
 * IDCT(src0, src4, src1, src5, dst, shift) -- variant that reads all four
 * inputs.
 *
 * - Loads src0, src4, src1, src5 into mm0..mm3 and multiplies them
 *   (pmaddwd) with the coefficient pairs read from 16(%2)..104(%2), giving
 *   the even-part sums A0..A3 and odd-part sums B0..B3 named in the inline
 *   comments.  This macro adds no rounding term itself.
 * - Each A+B / A-B is shifted right arithmetically by shift and packed to
 *   16 bits with signed saturation; the low 32 bits (two results) of each
 *   are stored with movd:
 *     dst+0   A0+B0     dst+16  A1+B1     dst+32  A2+B2     dst+48  A3+B3
 *     dst+64  A3-B3     dst+80  A2-B2     dst+96  A1-B1     dst+112 A0-B0
 *
 * In the inline comments upper-case names refer to the high dword lane and
 * lower-case names to the low dword lane of each MMX register.
 * Clobbers mm0..mm7.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
711
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
712
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
713
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
714
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
715
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
716
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
717
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
718
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
719
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
720
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
721
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
722
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
723
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
724
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
725
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
726
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
727
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
728
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
729
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
730
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
731
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
732
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
733
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
734
        "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
735
        "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
736
        "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
737
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
738
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
739
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
740
        "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
741
        "psrad $" #shift ", %%mm7       \n\t"\
742
        "psrad $" #shift ", %%mm4       \n\t"\
743
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
744
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
745
        "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
746
        "psrad $" #shift ", %%mm0       \n\t"\
747
        "psrad $" #shift ", %%mm2       \n\t"\
748
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
749
        "movd %%mm7, " #dst "           \n\t"\
750
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
751
        "movd %%mm0, 16+" #dst "        \n\t"\
752
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
753
        "movd %%mm2, 96+" #dst "        \n\t"\
754
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
755
        "movd %%mm4, 112+" #dst "       \n\t"\
756
        "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
757
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
758
        "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
759
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
760
        "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
761
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
762
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
763
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
764
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
765
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
766
        "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
767
        "psrad $" #shift ", %%mm2       \n\t"\
768
        "psrad $" #shift ", %%mm5       \n\t"\
769
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
770
        "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
771
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
772
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
773
        "psrad $" #shift ", %%mm6       \n\t"\
774
        "psrad $" #shift ", %%mm4       \n\t"\
775
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
776
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
777
        "movd %%mm2, 32+" #dst "        \n\t"\
778
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
779
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
780
        "movd %%mm6, 48+" #dst "        \n\t"\
781
        "movd %%mm4, 64+" #dst "        \n\t"\
782
        "movd %%mm5, 80+" #dst "        \n\t"
783

    
784

    
785
/*
 * Second pass for the fall-through case (no first-pass group was all zero).
 * Four calls, each reading quadwords from %1 at a further 8-byte offset and
 * writing to %0 at a further 4-byte offset, with shift 20; then jump
 * forward to label 9.
 */
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
786
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
787
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
788
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
789
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
790
        "jmp 9f                         \n\t"
791

    
792
/*
 * Label 4: reached when the inputs at 32(%0)..56(%0) were all zero, so
 * nothing was stored at 32(%1)..56(%1).  The two remaining input groups are
 * tested next: 64(%0).. all zero -> 6f, 96(%0).. all zero -> 5f.
 * NOTE(review): the leading '#' presumably turns the .p2align directive
 * into an assembler comment, i.e. the alignment is disabled -- confirm.
 */
        "# .p2align 4                   \n\t"\
793
        "4:                             \n\t"
794
Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
795
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
796

    
797
#undef IDCT
798
/*
 * IDCT(src0, src4, src1, src5, dst, shift) -- variant that never reads src1.
 *
 * src1 is accepted only to keep the argument list uniform; every term that
 * would depend on it is left out, so B0..B3 are built from src5 alone:
 *   B0 = C7R7+C5R5   B1 = -C5R7-C1R5   B2 = C3R7+C7R5   B3 = -C1R7+C3R5
 * A0..A3 are built from src0 and src4 with the coefficient pairs at
 * 16(%2)..40(%2).  This macro adds no rounding term itself.
 *
 * Each A+B / A-B is shifted right arithmetically by shift and packed to
 * 16 bits with signed saturation; the low 32 bits (two results) of each are
 * stored with movd:
 *     dst+0   A0+B0     dst+16  A1+B1     dst+32  A2+B2     dst+48  A3+B3
 *     dst+64  A3-B3     dst+80  A2-B2     dst+96  A1-B1     dst+112 A0-B0
 *
 * In the inline comments upper-case names refer to the high dword lane and
 * lower-case names to the low dword lane of each MMX register.
 * Clobbers mm0..mm7.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
799
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
800
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
801
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
802
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
803
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
804
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
805
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
806
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
807
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
808
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
809
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
810
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
811
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
812
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
813
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
814
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
815
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
816
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
817
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
818
        "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
819
        "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
820
        "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
821
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
822
        "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
823
        "psrad $" #shift ", %%mm1       \n\t"\
824
        "psrad $" #shift ", %%mm4       \n\t"\
825
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
826
        "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
827
        "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
828
        "psrad $" #shift ", %%mm0       \n\t"\
829
        "psrad $" #shift ", %%mm2       \n\t"\
830
        "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
831
        "movd %%mm1, " #dst "           \n\t"\
832
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
833
        "movd %%mm0, 16+" #dst "        \n\t"\
834
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
835
        "movd %%mm2, 96+" #dst "        \n\t"\
836
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
837
        "movd %%mm4, 112+" #dst "       \n\t"\
838
        "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
839
        "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
840
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
841
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
842
        "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
843
        "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
844
        "psrad $" #shift ", %%mm2       \n\t"\
845
        "psrad $" #shift ", %%mm5       \n\t"\
846
        "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
847
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
848
        "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
849
        "psrad $" #shift ", %%mm6       \n\t"\
850
        "psrad $" #shift ", %%mm1       \n\t"\
851
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
852
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
853
        "movd %%mm2, 32+" #dst "        \n\t"\
854
        "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
855
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
856
        "movd %%mm6, 48+" #dst "        \n\t"\
857
        "movd %%mm1, 64+" #dst "        \n\t"\
858
        "movd %%mm5, 80+" #dst "        \n\t"
859

    
860
/*
 * Second pass for the case where only the first-pass inputs at
 * 32(%0)..56(%0) were all zero.  The IDCT variant in effect here does not
 * read its src1 argument (32(%1)..56(%1)), which was never written on this
 * path.  Shift is 20; afterwards jump forward to label 9.
 */
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
861
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
862
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
863
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
864
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
865
        "jmp 9f                         \n\t"
866

    
867
/*
 * Label 6: reached when the inputs at 32(%0)..56(%0) and 64(%0)..88(%0)
 * were both all zero.  The last input group is tested next:
 * 96(%0).. all zero -> 7f.
 */
        "# .p2align 4                   \n\t"\
868
        "6:                             \n\t"
869
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
870

    
871
#undef IDCT
872
/*
 * IDCT(src0, src4, src1, src5, dst, shift) -- variant that never reads src4
 * or src1.
 *
 * src4 and src1 are accepted only to keep the argument list uniform; every
 * term that would depend on them is left out:
 *   A0 = A3 = C4R4+C4R0          A1 = A2 = -C4R4+C4R0
 *   B0 = C7R7+C5R5   B1 = -C5R7-C1R5   B2 = C3R7+C7R5   B3 = -C1R7+C3R5
 * This macro adds no rounding term itself.
 *
 * Each A+B / A-B is shifted right arithmetically by shift and packed to
 * 16 bits with signed saturation; the low 32 bits (two results) of each are
 * stored with movd:
 *     dst+0   A0+B0     dst+16  A1+B1     dst+32  A2+B2     dst+48  A3+B3
 *     dst+64  A3-B3     dst+80  A2-B2     dst+96  A1-B1     dst+112 A0-B0
 *
 * In the inline comments upper-case names refer to the high dword lane and
 * lower-case names to the low dword lane of each MMX register.
 * Clobbers mm0..mm7.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
873
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
874
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
875
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
876
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
877
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
878
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
879
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
880
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
881
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
882
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
883
        "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
884
        "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
885
        "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
886
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
887
        "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
888
        "psrad $" #shift ", %%mm1       \n\t"\
889
        "psrad $" #shift ", %%mm4       \n\t"\
890
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
891
        "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
892
        "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
893
        "psrad $" #shift ", %%mm0       \n\t"\
894
        "psrad $" #shift ", %%mm2       \n\t"\
895
        "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
896
        "movd %%mm1, " #dst "           \n\t"\
897
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
898
        "movd %%mm0, 16+" #dst "        \n\t"\
899
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
900
        "movd %%mm2, 96+" #dst "        \n\t"\
901
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
902
        "movd %%mm4, 112+" #dst "       \n\t"\
903
        "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
904
        "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
905
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
906
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
907
        "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
908
        "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
909
        "psrad $" #shift ", %%mm2       \n\t"\
910
        "psrad $" #shift ", %%mm5       \n\t"\
911
        "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
912
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
913
        "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
914
        "psrad $" #shift ", %%mm6       \n\t"\
915
        "psrad $" #shift ", %%mm1       \n\t"\
916
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
917
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
918
        "movd %%mm2, 32+" #dst "        \n\t"\
919
        "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
920
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
921
        "movd %%mm6, 48+" #dst "        \n\t"\
922
        "movd %%mm1, 64+" #dst "        \n\t"\
923
        "movd %%mm5, 80+" #dst "        \n\t"
924

    
925

    
926
/*
 * Second pass for the case where the first-pass inputs at 32(%0)..56(%0)
 * and 64(%0)..88(%0) were all zero but those at 96(%0)..120(%0) were not.
 * The IDCT variant in effect here reads neither src4 (64(%1)..) nor src1
 * (32(%1)..), which were never written on this path.  Shift is 20;
 * afterwards jump forward to label 9.
 */
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
927
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
928
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
929
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
930
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
931
        "jmp 9f                         \n\t"
932

    
933
/*
 * Label 2: reached when the inputs at 32(%0)..56(%0) were not all zero but
 * those at 64(%0)..88(%0) were.  The last input group is tested next:
 * 96(%0).. all zero -> 3f.
 */
        "# .p2align 4                   \n\t"\
934
        "2:                             \n\t"
935
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
936

    
937
#undef IDCT
938
/*
 * IDCT(src0, src4, src1, src5, dst, shift) -- variant that never reads src4.
 *
 * src4 is accepted only to keep the argument list uniform; every term that
 * would depend on it is left out:
 *   A0 = A3 = C4R4+C4R0          A1 = A2 = -C4R4+C4R0
 * B0..B3 are built from src1 and src5 with the coefficient pairs at
 * 48(%2)..104(%2).  This macro adds no rounding term itself.
 *
 * Each A+B / A-B is shifted right arithmetically by shift and packed to
 * 16 bits with signed saturation; the low 32 bits (two results) of each are
 * stored with movd:
 *     dst+0   A0+B0     dst+16  A1+B1     dst+32  A2+B2     dst+48  A3+B3
 *     dst+64  A3-B3     dst+80  A2-B2     dst+96  A1-B1     dst+112 A0-B0
 *
 * In the inline comments upper-case names refer to the high dword lane and
 * lower-case names to the low dword lane of each MMX register.
 * Clobbers mm0..mm7.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
939
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
940
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
941
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
942
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
943
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
944
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
945
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
946
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
947
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
948
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
949
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
950
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
951
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
952
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
953
        "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
954
        "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
955
        "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
956
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
957
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
958
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
959
        "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
960
        "psrad $" #shift ", %%mm7       \n\t"\
961
        "psrad $" #shift ", %%mm4       \n\t"\
962
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
963
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
964
        "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
965
        "psrad $" #shift ", %%mm0       \n\t"\
966
        "psrad $" #shift ", %%mm2       \n\t"\
967
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
968
        "movd %%mm7, " #dst "           \n\t"\
969
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
970
        "movd %%mm0, 16+" #dst "        \n\t"\
971
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
972
        "movd %%mm2, 96+" #dst "        \n\t"\
973
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
974
        "movd %%mm4, 112+" #dst "       \n\t"\
975
        "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
976
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
977
        "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
978
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
979
        "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
980
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
981
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
982
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
983
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
984
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
985
        "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
986
        "psrad $" #shift ", %%mm2       \n\t"\
987
        "psrad $" #shift ", %%mm5       \n\t"\
988
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
989
        "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
990
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
991
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
992
        "psrad $" #shift ", %%mm6       \n\t"\
993
        "psrad $" #shift ", %%mm4       \n\t"\
994
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
995
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
996
        "movd %%mm2, 32+" #dst "        \n\t"\
997
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
998
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
999
        "movd %%mm6, 48+" #dst "        \n\t"\
1000
        "movd %%mm4, 64+" #dst "        \n\t"\
1001
        "movd %%mm5, 80+" #dst "        \n\t"
1002

    
1003
/*
 * Second pass for the case where only the first-pass inputs at
 * 64(%0)..88(%0) were all zero.  The IDCT variant in effect here does not
 * read its src4 argument (64(%1)..88(%1)), which was never written on this
 * path.  Shift is 20; afterwards jump forward to label 9.
 */
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1004
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1005
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1006
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1007
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1008
        "jmp 9f                         \n\t"
1009

    
1010
/*
 * Label 3: reached when the inputs at 32(%0)..56(%0) were not all zero but
 * those at 64(%0)..88(%0) and 96(%0)..120(%0) both were.
 */
        "# .p2align 4                   \n\t"\
1011
        "3:                             \n\t"
1012
#undef IDCT
1013
#define IDCT(src0, src4, src1, src5, dst, shift) \
1014
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1015
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
1016
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1017
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1018
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1019
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1020
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1021
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
1022
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1023
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1024
        "movq 64(%2), %%mm3             \n\t"\
1025
        "pmaddwd %%mm2, %%mm3           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1026
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
1027
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
1028
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
1029
        "psrad $" #shift ", %%mm7       \n\t"\
1030
        "psrad $" #shift ", %%mm4       \n\t"\
1031
        "movq %%mm0, %%mm1              \n\t" /* A1             a1 */\
1032
        "paddd %%mm3, %%mm0             \n\t" /* A1+B1          a1+b1 */\
1033
        "psubd %%mm3, %%mm1             \n\t" /* A1-B1          a1-b1 */\
1034
        "psrad $" #shift ", %%mm0       \n\t"\
1035
        "psrad $" #shift ", %%mm1       \n\t"\
1036
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
1037
        "movd %%mm7, " #dst "           \n\t"\
1038
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
1039
        "movd %%mm0, 16+" #dst "        \n\t"\
1040
        "packssdw %%mm1, %%mm1          \n\t" /* A1-B1  a1-b1 */\
1041
        "movd %%mm1, 96+" #dst "        \n\t"\
1042
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
1043
        "movd %%mm4, 112+" #dst "       \n\t"\
1044
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
1045
        "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1046
        "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1047
        "movq %%mm5, %%mm1              \n\t" /* A2             a2 */\
1048
        "paddd %%mm4, %%mm1             \n\t" /* A2+B2          a2+b2 */\
1049
        "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
1050
        "psrad $" #shift ", %%mm1       \n\t"\
1051
        "psrad $" #shift ", %%mm5       \n\t"\
1052
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
1053
        "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
1054
        "psubd %%mm2, %%mm4             \n\t" /* a3-B3          a3-b3 */\
1055
        "psrad $" #shift ", %%mm6       \n\t"\
1056
        "psrad $" #shift ", %%mm4       \n\t"\
1057
        "packssdw %%mm1, %%mm1          \n\t" /* A2+B2  a2+b2 */\
1058
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1059
        "movd %%mm1, 32+" #dst "        \n\t"\
1060
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
1061
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1062
        "movd %%mm6, 48+" #dst "        \n\t"\
1063
        "movd %%mm4, 64+" #dst "        \n\t"\
1064
        "movd %%mm5, 80+" #dst "        \n\t"
1065

    
1066

    
1067
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1068
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1069
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1070
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1071
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1072
        "jmp 9f                         \n\t"
1073

    
1074
        "# .p2align 4                   \n\t"\
1075
        "5:                             \n\t"
1076
#undef IDCT
1077
#define IDCT(src0, src4, src1, src5, dst, shift) \
1078
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1079
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
1080
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1081
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1082
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1083
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1084
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
1085
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1086
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
1087
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1088
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1089
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
1090
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
1091
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1092
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
1093
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
1094
        "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
1095
        "movq 8+" #src4 ", %%mm3        \n\t" /* R6     R2      r6      r2 */\
1096
        "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
1097
        "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1098
        "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
1099
        "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1100
        "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 */\
1101
        "pmaddwd %%mm3, %%mm7           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1102
        "pmaddwd 40(%2), %%mm3          \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1103
        "paddd %%mm1, %%mm7             \n\t" /* A0             a0 */\
1104
        "paddd %%mm1, %%mm1             \n\t" /* 2C0            2c0 */\
1105
        "psubd %%mm7, %%mm1             \n\t" /* A3             a3 */\
1106
        "paddd %%mm2, %%mm3             \n\t" /* A1             a1 */\
1107
        "paddd %%mm2, %%mm2             \n\t" /* 2C1            2c1 */\
1108
        "psubd %%mm3, %%mm2             \n\t" /* A2             a2 */\
1109
        "psrad $" #shift ", %%mm4       \n\t"\
1110
        "psrad $" #shift ", %%mm7       \n\t"\
1111
        "psrad $" #shift ", %%mm3       \n\t"\
1112
        "packssdw %%mm7, %%mm4          \n\t" /* A0     a0 */\
1113
        "movq %%mm4, " #dst "           \n\t"\
1114
        "psrad $" #shift ", %%mm0       \n\t"\
1115
        "packssdw %%mm3, %%mm0          \n\t" /* A1     a1 */\
1116
        "movq %%mm0, 16+" #dst "        \n\t"\
1117
        "movq %%mm0, 96+" #dst "        \n\t"\
1118
        "movq %%mm4, 112+" #dst "       \n\t"\
1119
        "psrad $" #shift ", %%mm5       \n\t"\
1120
        "psrad $" #shift ", %%mm6       \n\t"\
1121
        "psrad $" #shift ", %%mm2       \n\t"\
1122
        "packssdw %%mm2, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1123
        "movq %%mm5, 32+" #dst "        \n\t"\
1124
        "psrad $" #shift ", %%mm1       \n\t"\
1125
        "packssdw %%mm1, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1126
        "movq %%mm6, 48+" #dst "        \n\t"\
1127
        "movq %%mm6, 64+" #dst "        \n\t"\
1128
        "movq %%mm5, 80+" #dst "        \n\t"
1129

    
1130

    
1131
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1132
IDCT(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1133
//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1134
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1135
//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1136
        "jmp 9f                         \n\t"
1137

    
1138

    
1139
        "# .p2align 4                   \n\t"\
1140
        "1:                             \n\t"
1141
#undef IDCT
1142
#define IDCT(src0, src4, src1, src5, dst, shift) \
1143
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1144
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
1145
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
1146
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1147
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1148
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1149
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1150
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
1151
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1152
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
1153
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1154
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1155
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
1156
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1157
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
1158
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
1159
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1160
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
1161
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
1162
        "movq 64(%2), %%mm1             \n\t"\
1163
        "pmaddwd %%mm2, %%mm1           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1164
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
1165
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
1166
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
1167
        "psrad $" #shift ", %%mm7       \n\t"\
1168
        "psrad $" #shift ", %%mm4       \n\t"\
1169
        "movq %%mm0, %%mm3              \n\t" /* A1             a1 */\
1170
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
1171
        "psubd %%mm1, %%mm3             \n\t" /* A1-B1          a1-b1 */\
1172
        "psrad $" #shift ", %%mm0       \n\t"\
1173
        "psrad $" #shift ", %%mm3       \n\t"\
1174
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
1175
        "movd %%mm7, " #dst "           \n\t"\
1176
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
1177
        "movd %%mm0, 16+" #dst "        \n\t"\
1178
        "packssdw %%mm3, %%mm3          \n\t" /* A1-B1  a1-b1 */\
1179
        "movd %%mm3, 96+" #dst "        \n\t"\
1180
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
1181
        "movd %%mm4, 112+" #dst "       \n\t"\
1182
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
1183
        "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1184
        "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1185
        "movq %%mm5, %%mm3              \n\t" /* A2             a2 */\
1186
        "paddd %%mm4, %%mm3             \n\t" /* A2+B2          a2+b2 */\
1187
        "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
1188
        "psrad $" #shift ", %%mm3       \n\t"\
1189
        "psrad $" #shift ", %%mm5       \n\t"\
1190
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
1191
        "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
1192
        "psubd %%mm2, %%mm4             \n\t" /* a3-B3          a3-b3 */\
1193
        "psrad $" #shift ", %%mm6       \n\t"\
1194
        "packssdw %%mm3, %%mm3          \n\t" /* A2+B2  a2+b2 */\
1195
        "movd %%mm3, 32+" #dst "        \n\t"\
1196
        "psrad $" #shift ", %%mm4       \n\t"\
1197
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1198
        "movd %%mm6, 48+" #dst "        \n\t"\
1199
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
1200
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1201
        "movd %%mm4, 64+" #dst "        \n\t"\
1202
        "movd %%mm5, 80+" #dst "        \n\t"
1203

    
1204

    
1205
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1206
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1207
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1208
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1209
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1210
        "jmp 9f                         \n\t"
1211

    
1212

    
1213
        "# .p2align 4                   \n\t"
1214
        "7:                             \n\t"
1215
#undef IDCT
1216
#define IDCT(src0, src4, src1, src5, dst, shift) \
1217
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1218
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1219
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1220
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1221
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1222
        "psrad $" #shift ", %%mm4       \n\t"\
1223
        "psrad $" #shift ", %%mm0       \n\t"\
1224
        "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
1225
        "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
1226
        "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1227
        "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
1228
        "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1229
        "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 */\
1230
        "psrad $" #shift ", %%mm1       \n\t"\
1231
        "packssdw %%mm1, %%mm4          \n\t" /* A0     a0 */\
1232
        "movq %%mm4, " #dst "           \n\t"\
1233
        "psrad $" #shift ", %%mm2       \n\t"\
1234
        "packssdw %%mm2, %%mm0          \n\t" /* A1     a1 */\
1235
        "movq %%mm0, 16+" #dst "        \n\t"\
1236
        "movq %%mm0, 96+" #dst "        \n\t"\
1237
        "movq %%mm4, 112+" #dst "       \n\t"\
1238
        "movq %%mm0, 32+" #dst "        \n\t"\
1239
        "movq %%mm4, 48+" #dst "        \n\t"\
1240
        "movq %%mm4, 64+" #dst "        \n\t"\
1241
        "movq %%mm0, 80+" #dst "        \n\t"
1242

    
1243
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1244
IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1245
//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1246
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1247
//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1248

    
1249

    
1250
#endif
1251

    
1252
/*
1253
Input
1254
 00 40 04 44 20 60 24 64
1255
 10 30 14 34 50 70 54 74
1256
 01 41 03 43 21 61 23 63
1257
 11 31 13 33 51 71 53 73
1258
 02 42 06 46 22 62 26 66
1259
 12 32 16 36 52 72 56 76
1260
 05 45 07 47 25 65 27 67
1261
 15 35 17 37 55 75 57 77
1262

1263
Temp
1264
 00 04 10 14 20 24 30 34
1265
 40 44 50 54 60 64 70 74
1266
 01 03 11 13 21 23 31 33
1267
 41 43 51 53 61 63 71 73
1268
 02 06 12 16 22 26 32 36
1269
 42 46 52 56 62 66 72 76
1270
 05 07 15 17 25 27 35 37
1271
 45 47 55 57 65 67 75 77
1272
*/
1273

    
1274
"9: \n\t"
1275
                :: "r" (block), "r" (temp), "r" (coeffs)
1276
                : "%eax"
1277
        );
1278
}
1279

    
1280
/**
 * Public entry point for the MMX simple IDCT.
 *
 * Forwards to this file's idct() routine, which stores its results back
 * through the same pointer, so the transform is performed in place.
 *
 * @param block coefficient buffer, overwritten with the transform output
 *              (presumably one 8x8 block of 16-bit values, per the layout
 *              comment inside idct() -- confirm against callers)
 */
void ff_simple_idct_mmx(int16_t *block)
{
    /* No extra work here: the whole transform lives in idct(). */
    idct(block);
}
1284

    
1285
//FIXME merge add/put into the idct
1286

    
1287
/**
 * Run the MMX simple IDCT on a block, then hand the result to
 * ff_put_pixels_clamped_mmx() together with the destination.
 *
 * @param dest      destination pixel buffer passed on to
 *                  ff_put_pixels_clamped_mmx()
 * @param line_size forwarded unchanged to ff_put_pixels_clamped_mmx()
 *                  (presumably the byte stride between rows of dest --
 *                  confirm against that helper)
 * @param block     coefficient buffer; transformed in place by idct()
 *                  before being read by the put helper
 */
void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    /* Step 1: in-place inverse transform of the coefficients. */
    idct(block);

    /* Step 2: let the put helper write the transformed block to dest. */
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}
1292
/**
 * Run the MMX simple IDCT on a block, then hand the result to
 * ff_add_pixels_clamped_mmx() together with the destination.
 *
 * @param dest      destination pixel buffer passed on to
 *                  ff_add_pixels_clamped_mmx()
 * @param line_size forwarded unchanged to ff_add_pixels_clamped_mmx()
 *                  (presumably the byte stride between rows of dest --
 *                  confirm against that helper)
 * @param block     coefficient buffer; transformed in place by idct()
 *                  before being read by the add helper
 */
void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    /* Step 1: in-place inverse transform of the coefficients. */
    idct(block);

    /* Step 2: let the add helper combine the transformed block with dest. */
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}