/* ffmpeg / libavcodec / i386 / simple_idct_mmx.c (revision d2bb7db1) */
1 37e8dcda Arpi
/*
2 ff4ec49e Fabrice Bellard
 * Simple IDCT MMX
3
 *
4
 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * This library is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2 of the License, or (at your option) any later version.
10
 *
11
 * This library is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with this library; if not, write to the Free Software
18
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19
 */
20 37e8dcda Arpi
#include "../dsputil.h"
21 e96682e6 Michael Niedermayer
#include "../simple_idct.h"
22 ff4ec49e Fabrice Bellard
23 9e1795dd Michael Niedermayer
/*
24
23170.475006
25
22725.260826
26
21406.727617
27
19265.545870
28
16384.000000
29
12872.826198
30
8866.956905
31
4520.335430
32
*/
33 37e8dcda Arpi
#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
34
#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
35
#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
36
#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
37 9e1795dd Michael Niedermayer
#if 0
38 37e8dcda Arpi
#define C4 16384 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
39 9e1795dd Michael Niedermayer
#else
40
#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
41
#endif
42 37e8dcda Arpi
#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
43
#define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
44
#define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
45
46
#define ROW_SHIFT 11
47
#define COL_SHIFT 20 // 6
48
49 5c0513bd Dmitry Baryshkov
static const uint64_t attribute_used __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL;
50
static const uint64_t attribute_used __attribute__((aligned(8))) d40000= 0x0000000000040000ULL;
51 41338ac0 Michael Niedermayer
52
static const int16_t __attribute__((aligned(8))) coeffs[]= {
53 37e8dcda Arpi
        1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
54
//        1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
55
//        0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
56
        1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
57
        // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
58
//        0, 0, 0, 0,
59
//        0, 0, 0, 0,
60
61 0a8d8945 Michael Niedermayer
 C4,  C4,  C4,  C4,
62
 C4, -C4,  C4, -C4,
63
 
64
 C2,  C6,  C2,  C6,
65
 C6, -C2,  C6, -C2,
66
 
67
 C1,  C3,  C1,  C3,
68
 C5,  C7,  C5,  C7,
69
 
70
 C3, -C7,  C3, -C7,
71
-C1, -C5, -C1, -C5,
72
 
73
 C5, -C1,  C5, -C1,
74
 C7,  C3,  C7,  C3,
75
 
76
 C7, -C5,  C7, -C5,
77
 C3, -C1,  C3, -C1
78
};
79
80 ef5b1b5a Juanjo
#if 0
81 0a8d8945 Michael Niedermayer
static void unused_var_killer(){
82
        int a= wm1010 + d40000;
83
        temp[0]=a;
84
}
85

86 37e8dcda Arpi
static void inline idctCol (int16_t * col, int16_t *input)
87
{
88
#undef C0
89
#undef C1
90
#undef C2
91
#undef C3
92
#undef C4
93
#undef C5
94
#undef C6
95
#undef C7
96
        int a0, a1, a2, a3, b0, b1, b2, b3;
97
        const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
98
        const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
99
        const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
100
        const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
101 ccf589a8 Michael Niedermayer
        const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
102 37e8dcda Arpi
        const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
103
        const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
104
        const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
105
/*
106
        if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
107
                col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
108
                        col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
109
                return;
110
        }*/
111

112
col[8*0] = input[8*0 + 0];
113
col[8*1] = input[8*2 + 0];
114
col[8*2] = input[8*0 + 1];
115
col[8*3] = input[8*2 + 1];
116
col[8*4] = input[8*4 + 0];
117
col[8*5] = input[8*6 + 0];
118
col[8*6] = input[8*4 + 1];
119
col[8*7] = input[8*6 + 1];
120

121
        a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
122
        a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
123
        a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
124
        a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
125

126
        b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
127
        b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
128
        b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
129
        b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
130

131
        col[8*0] = (a0 + b0) >> COL_SHIFT;
132
        col[8*1] = (a1 + b1) >> COL_SHIFT;
133
        col[8*2] = (a2 + b2) >> COL_SHIFT;
134
        col[8*3] = (a3 + b3) >> COL_SHIFT;
135
        col[8*4] = (a3 - b3) >> COL_SHIFT;
136
        col[8*5] = (a2 - b2) >> COL_SHIFT;
137
        col[8*6] = (a1 - b1) >> COL_SHIFT;
138
        col[8*7] = (a0 - b0) >> COL_SHIFT;
139
}
140

141
static void inline idctRow (int16_t * output, int16_t * input)
142
{
143
        int16_t row[8];
144

145
        int a0, a1, a2, a3, b0, b1, b2, b3;
146
        const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
147
        const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
148
        const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
149
        const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
150 ccf589a8 Michael Niedermayer
        const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
151 37e8dcda Arpi
        const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
152
        const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
153
        const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
154

155
row[0] = input[0];
156
row[2] = input[1];
157
row[4] = input[4];
158
row[6] = input[5];
159
row[1] = input[8];
160
row[3] = input[9];
161
row[5] = input[12];
162
row[7] = input[13];
163

164
        if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
165
                row[0] = row[1] = row[2] = row[3] = row[4] =
166
                        row[5] = row[6] = row[7] = row[0]<<3;
167
        output[0] = row[0];
168
        output[2] = row[1];
169
        output[4] = row[2];
170
        output[6] = row[3];
171
        output[8] = row[4];
172
        output[10] = row[5];
173
        output[12] = row[6];
174
        output[14] = row[7];
175
                return;
176
        }
177

178
        a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
179
        a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
180
        a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
181
        a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
182

183
        b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
184
        b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
185
        b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
186
        b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
187

188
        row[0] = (a0 + b0) >> ROW_SHIFT;
189
        row[1] = (a1 + b1) >> ROW_SHIFT;
190
        row[2] = (a2 + b2) >> ROW_SHIFT;
191
        row[3] = (a3 + b3) >> ROW_SHIFT;
192
        row[4] = (a3 - b3) >> ROW_SHIFT;
193
        row[5] = (a2 - b2) >> ROW_SHIFT;
194
        row[6] = (a1 - b1) >> ROW_SHIFT;
195
        row[7] = (a0 - b0) >> ROW_SHIFT;
196

197
        output[0] = row[0];
198
        output[2] = row[1];
199
        output[4] = row[2];
200
        output[6] = row[3];
201
        output[8] = row[4];
202
        output[10] = row[5];
203
        output[12] = row[6];
204
        output[14] = row[7];
205
}
206
#endif
207
208
static inline void idct(int16_t *block)
209
{
210 41338ac0 Michael Niedermayer
        int64_t __attribute__((aligned(8))) align_tmp[16];
211
        int16_t * const temp= (int16_t*)align_tmp;
212
213 37e8dcda Arpi
        asm volatile(
214
#if 0 //Alternative, simpler variant
215 0a8d8945 Michael Niedermayer

216
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
217
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
218
        "movq " #src4 ", %%mm1                        \n\t" /* R6        R2        r6        r2 */\
219 37e8dcda Arpi
        "movq " #src1 ", %%mm2                        \n\t" /* R3        R1        r3        r1 */\
220
        "movq " #src5 ", %%mm3                        \n\t" /* R7        R5        r7        r5 */\
221 0a8d8945 Michael Niedermayer
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
222
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
223
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
224
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
225
        "movq 32(%2), %%mm5                        \n\t" /* C6        C2        C6        C2 */\
226
        "pmaddwd %%mm1, %%mm5                        \n\t" /* C6R6+C2R2        C6r6+C2r2 */\
227
        "movq 40(%2), %%mm6                        \n\t" /* -C2        C6        -C2        C6 */\
228
        "pmaddwd %%mm6, %%mm1                        \n\t" /* -C2R6+C6R2        -C2r6+C6r2 */\
229
        "movq 48(%2), %%mm7                        \n\t" /* C3        C1        C3        C1 */\
230
        "pmaddwd %%mm2, %%mm7                        \n\t" /* C3R3+C1R1        C3r3+C1r1 */\
231 37e8dcda Arpi
        #rounder ", %%mm4                        \n\t"\
232 0a8d8945 Michael Niedermayer
        "movq %%mm4, %%mm6                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
233
        "paddd %%mm5, %%mm4                        \n\t" /* A0                a0 */\
234
        "psubd %%mm5, %%mm6                        \n\t" /* A3                a3 */\
235
        "movq 56(%2), %%mm5                        \n\t" /* C7        C5        C7        C5 */\
236
        "pmaddwd %%mm3, %%mm5                        \n\t" /* C7R7+C5R5        C7r7+C5r5 */\
237
        #rounder ", %%mm0                        \n\t"\
238
        "paddd %%mm0, %%mm1                        \n\t" /* A1                a1 */\
239
        "paddd %%mm0, %%mm0                        \n\t" \
240
        "psubd %%mm1, %%mm0                        \n\t" /* A2                a2 */\
241
        "pmaddwd 64(%2), %%mm2                        \n\t" /* -C7R3+C3R1        -C7r3+C3r1 */\
242
        "paddd %%mm5, %%mm7                        \n\t" /* B0                b0 */\
243
        "movq 72(%2), %%mm5                        \n\t" /* -C5        -C1        -C5        -C1 */\
244
        "pmaddwd %%mm3, %%mm5                        \n\t" /* -C5R7-C1R5        -C5r7-C1r5 */\
245
        "paddd %%mm4, %%mm7                        \n\t" /* A0+B0                a0+b0 */\
246 37e8dcda Arpi
        "paddd %%mm4, %%mm4                        \n\t" /* 2A0                2a0 */\
247 0a8d8945 Michael Niedermayer
        "psubd %%mm7, %%mm4                        \n\t" /* A0-B0                a0-b0 */\
248
        "paddd %%mm2, %%mm5                        \n\t" /* B1                b1 */\
249
        "psrad $" #shift ", %%mm7                \n\t"\
250
        "psrad $" #shift ", %%mm4                \n\t"\
251
        "movq %%mm1, %%mm2                        \n\t" /* A1                a1 */\
252
        "paddd %%mm5, %%mm1                        \n\t" /* A1+B1                a1+b1 */\
253
        "psubd %%mm5, %%mm2                        \n\t" /* A1-B1                a1-b1 */\
254
        "psrad $" #shift ", %%mm1                \n\t"\
255
        "psrad $" #shift ", %%mm2                \n\t"\
256
        "packssdw %%mm1, %%mm7                        \n\t" /* A1+B1        a1+b1        A0+B0        a0+b0 */\
257
        "packssdw %%mm4, %%mm2                        \n\t" /* A0-B0        a0-b0        A1-B1        a1-b1 */\
258
        "movq %%mm7, " #dst "                        \n\t"\
259
        "movq " #src1 ", %%mm1                        \n\t" /* R3        R1        r3        r1 */\
260
        "movq 80(%2), %%mm4                        \n\t" /* -C1        C5        -C1         C5 */\
261
        "movq %%mm2, 24+" #dst "                \n\t"\
262
        "pmaddwd %%mm1, %%mm4                        \n\t" /* -C1R3+C5R1        -C1r3+C5r1 */\
263
        "movq 88(%2), %%mm7                        \n\t" /* C3        C7        C3         C7 */\
264
        "pmaddwd 96(%2), %%mm1                        \n\t" /* -C5R3+C7R1        -C5r3+C7r1 */\
265
        "pmaddwd %%mm3, %%mm7                        \n\t" /* C3R7+C7R5        C3r7+C7r5 */\
266
        "movq %%mm0, %%mm2                        \n\t" /* A2                a2 */\
267
        "pmaddwd 104(%2), %%mm3                        \n\t" /* -C1R7+C3R5        -C1r7+C3r5 */\
268
        "paddd %%mm7, %%mm4                        \n\t" /* B2                b2 */\
269
        "paddd %%mm4, %%mm2                        \n\t" /* A2+B2                a2+b2 */\
270
        "psubd %%mm4, %%mm0                        \n\t" /* a2-B2                a2-b2 */\
271
        "psrad $" #shift ", %%mm2                \n\t"\
272
        "psrad $" #shift ", %%mm0                \n\t"\
273
        "movq %%mm6, %%mm4                        \n\t" /* A3                a3 */\
274
        "paddd %%mm1, %%mm3                        \n\t" /* B3                b3 */\
275
        "paddd %%mm3, %%mm6                        \n\t" /* A3+B3                a3+b3 */\
276
        "psubd %%mm3, %%mm4                        \n\t" /* a3-B3                a3-b3 */\
277 37e8dcda Arpi
        "psrad $" #shift ", %%mm6                \n\t"\
278 0a8d8945 Michael Niedermayer
        "packssdw %%mm6, %%mm2                        \n\t" /* A3+B3        a3+b3        A2+B2        a2+b2 */\
279
        "movq %%mm2, 8+" #dst "                        \n\t"\
280 37e8dcda Arpi
        "psrad $" #shift ", %%mm4                \n\t"\
281 0a8d8945 Michael Niedermayer
        "packssdw %%mm0, %%mm4                        \n\t" /* A2-B2        a2-b2        A3-B3        a3-b3 */\
282
        "movq %%mm4, 16+" #dst "                \n\t"\
283

284
#define COL_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
285
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
286
        "movq " #src4 ", %%mm1                        \n\t" /* R6        R2        r6        r2 */\
287
        "movq " #src1 ", %%mm2                        \n\t" /* R3        R1        r3        r1 */\
288
        "movq " #src5 ", %%mm3                        \n\t" /* R7        R5        r7        r5 */\
289
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
290
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
291
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
292
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
293
        "movq 32(%2), %%mm5                        \n\t" /* C6        C2        C6        C2 */\
294
        "pmaddwd %%mm1, %%mm5                        \n\t" /* C6R6+C2R2        C6r6+C2r2 */\
295
        "movq 40(%2), %%mm6                        \n\t" /* -C2        C6        -C2        C6 */\
296
        "pmaddwd %%mm6, %%mm1                        \n\t" /* -C2R6+C6R2        -C2r6+C6r2 */\
297 37e8dcda Arpi
        #rounder ", %%mm4                        \n\t"\
298 0a8d8945 Michael Niedermayer
        "movq %%mm4, %%mm6                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
299
        "movq 48(%2), %%mm7                        \n\t" /* C3        C1        C3        C1 */\
300
        #rounder ", %%mm0                        \n\t"\
301
        "pmaddwd %%mm2, %%mm7                        \n\t" /* C3R3+C1R1        C3r3+C1r1 */\
302
        "paddd %%mm5, %%mm4                        \n\t" /* A0                a0 */\
303
        "psubd %%mm5, %%mm6                        \n\t" /* A3                a3 */\
304
        "movq %%mm0, %%mm5                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
305
        "paddd %%mm1, %%mm0                        \n\t" /* A1                a1 */\
306
        "psubd %%mm1, %%mm5                        \n\t" /* A2                a2 */\
307
        "movq 56(%2), %%mm1                        \n\t" /* C7        C5        C7        C5 */\
308
        "pmaddwd %%mm3, %%mm1                        \n\t" /* C7R7+C5R5        C7r7+C5r5 */\
309
        "pmaddwd 64(%2), %%mm2                        \n\t" /* -C7R3+C3R1        -C7r3+C3r1 */\
310
        "paddd %%mm1, %%mm7                        \n\t" /* B0                b0 */\
311
        "movq 72(%2), %%mm1                        \n\t" /* -C5        -C1        -C5        -C1 */\
312
        "pmaddwd %%mm3, %%mm1                        \n\t" /* -C5R7-C1R5        -C5r7-C1r5 */\
313
        "paddd %%mm4, %%mm7                        \n\t" /* A0+B0                a0+b0 */\
314
        "paddd %%mm4, %%mm4                        \n\t" /* 2A0                2a0 */\
315
        "psubd %%mm7, %%mm4                        \n\t" /* A0-B0                a0-b0 */\
316
        "paddd %%mm2, %%mm1                        \n\t" /* B1                b1 */\
317
        "psrad $" #shift ", %%mm7                \n\t"\
318 37e8dcda Arpi
        "psrad $" #shift ", %%mm4                \n\t"\
319 0a8d8945 Michael Niedermayer
        "movq %%mm0, %%mm2                        \n\t" /* A1                a1 */\
320
        "paddd %%mm1, %%mm0                        \n\t" /* A1+B1                a1+b1 */\
321
        "psubd %%mm1, %%mm2                        \n\t" /* A1-B1                a1-b1 */\
322
        "psrad $" #shift ", %%mm0                \n\t"\
323
        "psrad $" #shift ", %%mm2                \n\t"\
324
        "packssdw %%mm7, %%mm7                        \n\t" /* A0+B0        a0+b0 */\
325
        "movd %%mm7, " #dst "                        \n\t"\
326
        "packssdw %%mm0, %%mm0                        \n\t" /* A1+B1        a1+b1 */\
327
        "movd %%mm0, 16+" #dst "                \n\t"\
328
        "packssdw %%mm2, %%mm2                        \n\t" /* A1-B1        a1-b1 */\
329
        "movd %%mm2, 96+" #dst "                \n\t"\
330
        "packssdw %%mm4, %%mm4                        \n\t" /* A0-B0        a0-b0 */\
331
        "movd %%mm4, 112+" #dst "                \n\t"\
332
        "movq " #src1 ", %%mm0                        \n\t" /* R3        R1        r3        r1 */\
333
        "movq 80(%2), %%mm4                        \n\t" /* -C1        C5        -C1         C5 */\
334
        "pmaddwd %%mm0, %%mm4                        \n\t" /* -C1R3+C5R1        -C1r3+C5r1 */\
335
        "movq 88(%2), %%mm7                        \n\t" /* C3        C7        C3         C7 */\
336
        "pmaddwd 96(%2), %%mm0                        \n\t" /* -C5R3+C7R1        -C5r3+C7r1 */\
337 37e8dcda Arpi
        "pmaddwd %%mm3, %%mm7                        \n\t" /* C3R7+C7R5        C3r7+C7r5 */\
338 0a8d8945 Michael Niedermayer
        "movq %%mm5, %%mm2                        \n\t" /* A2                a2 */\
339
        "pmaddwd 104(%2), %%mm3                        \n\t" /* -C1R7+C3R5        -C1r7+C3r5 */\
340
        "paddd %%mm7, %%mm4                        \n\t" /* B2                b2 */\
341
        "paddd %%mm4, %%mm2                        \n\t" /* A2+B2                a2+b2 */\
342
        "psubd %%mm4, %%mm5                        \n\t" /* a2-B2                a2-b2 */\
343
        "psrad $" #shift ", %%mm2                \n\t"\
344
        "psrad $" #shift ", %%mm5                \n\t"\
345
        "movq %%mm6, %%mm4                        \n\t" /* A3                a3 */\
346
        "paddd %%mm0, %%mm3                        \n\t" /* B3                b3 */\
347
        "paddd %%mm3, %%mm6                        \n\t" /* A3+B3                a3+b3 */\
348
        "psubd %%mm3, %%mm4                        \n\t" /* a3-B3                a3-b3 */\
349 37e8dcda Arpi
        "psrad $" #shift ", %%mm6                \n\t"\
350
        "psrad $" #shift ", %%mm4                \n\t"\
351 0a8d8945 Michael Niedermayer
        "packssdw %%mm2, %%mm2                        \n\t" /* A2+B2        a2+b2 */\
352
        "packssdw %%mm6, %%mm6                        \n\t" /* A3+B3        a3+b3 */\
353
        "movd %%mm2, 32+" #dst "                \n\t"\
354
        "packssdw %%mm4, %%mm4                        \n\t" /* A3-B3        a3-b3 */\
355
        "packssdw %%mm5, %%mm5                        \n\t" /* A2-B2        a2-b2 */\
356
        "movd %%mm6, 48+" #dst "                \n\t"\
357
        "movd %%mm4, 64+" #dst "                \n\t"\
358
        "movd %%mm5, 80+" #dst "                \n\t"\
359

360
        
361
#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
362
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
363
        "movq " #src4 ", %%mm1                        \n\t" /* R6        R2        r6        r2 */\
364 37e8dcda Arpi
        "movq " #src1 ", %%mm2                        \n\t" /* R3        R1        r3        r1 */\
365
        "movq " #src5 ", %%mm3                        \n\t" /* R7        R5        r7        r5 */\
366 4bdd9157 Nick Kurshev
        "movq "MANGLE(wm1010)", %%mm4                \n\t"\
367 37e8dcda Arpi
        "pand %%mm0, %%mm4                        \n\t"\
368
        "por %%mm1, %%mm4                        \n\t"\
369
        "por %%mm2, %%mm4                        \n\t"\
370
        "por %%mm3, %%mm4                        \n\t"\
371
        "packssdw %%mm4,%%mm4                        \n\t"\
372
        "movd %%mm4, %%eax                        \n\t"\
373
        "orl %%eax, %%eax                        \n\t"\
374
        "jz 1f                                        \n\t"\
375 0a8d8945 Michael Niedermayer
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
376
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
377
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
378
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
379
        "movq 32(%2), %%mm5                        \n\t" /* C6        C2        C6        C2 */\
380
        "pmaddwd %%mm1, %%mm5                        \n\t" /* C6R6+C2R2        C6r6+C2r2 */\
381
        "movq 40(%2), %%mm6                        \n\t" /* -C2        C6        -C2        C6 */\
382
        "pmaddwd %%mm6, %%mm1                        \n\t" /* -C2R6+C6R2        -C2r6+C6r2 */\
383
        "movq 48(%2), %%mm7                        \n\t" /* C3        C1        C3        C1 */\
384
        "pmaddwd %%mm2, %%mm7                        \n\t" /* C3R3+C1R1        C3r3+C1r1 */\
385 37e8dcda Arpi
        #rounder ", %%mm4                        \n\t"\
386 0a8d8945 Michael Niedermayer
        "movq %%mm4, %%mm6                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
387
        "paddd %%mm5, %%mm4                        \n\t" /* A0                a0 */\
388
        "psubd %%mm5, %%mm6                        \n\t" /* A3                a3 */\
389
        "movq 56(%2), %%mm5                        \n\t" /* C7        C5        C7        C5 */\
390
        "pmaddwd %%mm3, %%mm5                        \n\t" /* C7R7+C5R5        C7r7+C5r5 */\
391
        #rounder ", %%mm0                        \n\t"\
392
        "paddd %%mm0, %%mm1                        \n\t" /* A1                a1 */\
393
        "paddd %%mm0, %%mm0                        \n\t" \
394
        "psubd %%mm1, %%mm0                        \n\t" /* A2                a2 */\
395
        "pmaddwd 64(%2), %%mm2                        \n\t" /* -C7R3+C3R1        -C7r3+C3r1 */\
396
        "paddd %%mm5, %%mm7                        \n\t" /* B0                b0 */\
397
        "movq 72(%2), %%mm5                        \n\t" /* -C5        -C1        -C5        -C1 */\
398
        "pmaddwd %%mm3, %%mm5                        \n\t" /* -C5R7-C1R5        -C5r7-C1r5 */\
399
        "paddd %%mm4, %%mm7                        \n\t" /* A0+B0                a0+b0 */\
400 37e8dcda Arpi
        "paddd %%mm4, %%mm4                        \n\t" /* 2A0                2a0 */\
401 0a8d8945 Michael Niedermayer
        "psubd %%mm7, %%mm4                        \n\t" /* A0-B0                a0-b0 */\
402
        "paddd %%mm2, %%mm5                        \n\t" /* B1                b1 */\
403
        "psrad $" #shift ", %%mm7                \n\t"\
404 37e8dcda Arpi
        "psrad $" #shift ", %%mm4                \n\t"\
405 0a8d8945 Michael Niedermayer
        "movq %%mm1, %%mm2                        \n\t" /* A1                a1 */\
406
        "paddd %%mm5, %%mm1                        \n\t" /* A1+B1                a1+b1 */\
407
        "psubd %%mm5, %%mm2                        \n\t" /* A1-B1                a1-b1 */\
408
        "psrad $" #shift ", %%mm1                \n\t"\
409
        "psrad $" #shift ", %%mm2                \n\t"\
410
        "packssdw %%mm1, %%mm7                        \n\t" /* A1+B1        a1+b1        A0+B0        a0+b0 */\
411
        "packssdw %%mm4, %%mm2                        \n\t" /* A0-B0        a0-b0        A1-B1        a1-b1 */\
412
        "movq %%mm7, " #dst "                        \n\t"\
413
        "movq " #src1 ", %%mm1                        \n\t" /* R3        R1        r3        r1 */\
414
        "movq 80(%2), %%mm4                        \n\t" /* -C1        C5        -C1         C5 */\
415
        "movq %%mm2, 24+" #dst "                \n\t"\
416
        "pmaddwd %%mm1, %%mm4                        \n\t" /* -C1R3+C5R1        -C1r3+C5r1 */\
417
        "movq 88(%2), %%mm7                        \n\t" /* C3        C7        C3         C7 */\
418
        "pmaddwd 96(%2), %%mm1                        \n\t" /* -C5R3+C7R1        -C5r3+C7r1 */\
419 37e8dcda Arpi
        "pmaddwd %%mm3, %%mm7                        \n\t" /* C3R7+C7R5        C3r7+C7r5 */\
420 0a8d8945 Michael Niedermayer
        "movq %%mm0, %%mm2                        \n\t" /* A2                a2 */\
421
        "pmaddwd 104(%2), %%mm3                        \n\t" /* -C1R7+C3R5        -C1r7+C3r5 */\
422
        "paddd %%mm7, %%mm4                        \n\t" /* B2                b2 */\
423
        "paddd %%mm4, %%mm2                        \n\t" /* A2+B2                a2+b2 */\
424
        "psubd %%mm4, %%mm0                        \n\t" /* a2-B2                a2-b2 */\
425 37e8dcda Arpi
        "psrad $" #shift ", %%mm2                \n\t"\
426
        "psrad $" #shift ", %%mm0                \n\t"\
427 0a8d8945 Michael Niedermayer
        "movq %%mm6, %%mm4                        \n\t" /* A3                a3 */\
428
        "paddd %%mm1, %%mm3                        \n\t" /* B3                b3 */\
429
        "paddd %%mm3, %%mm6                        \n\t" /* A3+B3                a3+b3 */\
430
        "psubd %%mm3, %%mm4                        \n\t" /* a3-B3                a3-b3 */\
431
        "psrad $" #shift ", %%mm6                \n\t"\
432
        "packssdw %%mm6, %%mm2                        \n\t" /* A3+B3        a3+b3        A2+B2        a2+b2 */\
433
        "movq %%mm2, 8+" #dst "                        \n\t"\
434
        "psrad $" #shift ", %%mm4                \n\t"\
435
        "packssdw %%mm0, %%mm4                        \n\t" /* A2-B2        a2-b2        A3-B3        a3-b3 */\
436
        "movq %%mm4, 16+" #dst "                \n\t"\
437 37e8dcda Arpi
        "jmp 2f                                        \n\t"\
438
        "1:                                        \n\t"\
439 0a8d8945 Michael Niedermayer
        "pslld $16, %%mm0                        \n\t"\
440 4bdd9157 Nick Kurshev
        "#paddd "MANGLE(d40000)", %%mm0                \n\t"\
441 0a8d8945 Michael Niedermayer
        "psrad $13, %%mm0                        \n\t"\
442
        "packssdw %%mm0, %%mm0                        \n\t"\
443
        "movq %%mm0, " #dst "                        \n\t"\
444
        "movq %%mm0, 8+" #dst "                        \n\t"\
445
        "movq %%mm0, 16+" #dst "                \n\t"\
446
        "movq %%mm0, 24+" #dst "                \n\t"\
447
        "2:                                        \n\t"
448 37e8dcda Arpi

449

450 0a8d8945 Michael Niedermayer
//IDCT(      src0,   src4,   src1,   src5,    dst,    rounder, shift)
451
ROW_IDCT(    (%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
452
/*ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
453
ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
454
ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
455

456
DC_COND_ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
457
DC_COND_ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
458
DC_COND_ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
459

460

461
//IDCT(      src0,   src4,   src1,    src5,    dst, rounder, shift)
462
COL_IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
463
COL_IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
464
COL_IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
465
COL_IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
466 37e8dcda Arpi

467 0a8d8945 Michael Niedermayer
#else
468
469
/* Row IDCT pass with a DC-only shortcut.
 * Zero test: jumps to local label "1:" when (src0 & wm1010) | src4 | src1 | src5
 * is all-zero (wm1010 masks part of src0 — presumably the DC words; TODO confirm
 * against the wm1010 definition).  The shortcut path computes
 * packssdw(((dc << 16) + 0x40000) >> 13) and replicates it to all four output
 * qwords.  %2 points to the coefficient table (16(%2) = C4 ..., see the per-line
 * comments).  Clobbers mm0-mm7 and eax; uses local asm labels 1 and 2. */
#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
        "movq " #src4 ", %%mm1                        \n\t" /* R6        R2        r6        r2 */\
        "movq " #src1 ", %%mm2                        \n\t" /* R3        R1        r3        r1 */\
        "movq " #src5 ", %%mm3                        \n\t" /* R7        R5        r7        r5 */\
        "movq "MANGLE(wm1010)", %%mm4                \n\t"\
        "pand %%mm0, %%mm4                        \n\t"\
        "por %%mm1, %%mm4                        \n\t"\
        "por %%mm2, %%mm4                        \n\t"\
        "por %%mm3, %%mm4                        \n\t"\
        "packssdw %%mm4,%%mm4                        \n\t"\
        "movd %%mm4, %%eax                        \n\t"\
        "orl %%eax, %%eax                        \n\t"\
        "jz 1f                                        \n\t"\
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
        "movq 32(%2), %%mm5                        \n\t" /* C6        C2        C6        C2 */\
        "pmaddwd %%mm1, %%mm5                        \n\t" /* C6R6+C2R2        C6r6+C2r2 */\
        "movq 40(%2), %%mm6                        \n\t" /* -C2        C6        -C2        C6 */\
        "pmaddwd %%mm6, %%mm1                        \n\t" /* -C2R6+C6R2        -C2r6+C6r2 */\
        "movq 48(%2), %%mm7                        \n\t" /* C3        C1        C3        C1 */\
        "pmaddwd %%mm2, %%mm7                        \n\t" /* C3R3+C1R1        C3r3+C1r1 */\
        #rounder ", %%mm4                        \n\t"\
        "movq %%mm4, %%mm6                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
        "paddd %%mm5, %%mm4                        \n\t" /* A0                a0 */\
        "psubd %%mm5, %%mm6                        \n\t" /* A3                a3 */\
        "movq 56(%2), %%mm5                        \n\t" /* C7        C5        C7        C5 */\
        "pmaddwd %%mm3, %%mm5                        \n\t" /* C7R7+C5R5        C7r7+C5r5 */\
        #rounder ", %%mm0                        \n\t"\
        "paddd %%mm0, %%mm1                        \n\t" /* A1                a1 */\
        "paddd %%mm0, %%mm0                        \n\t" \
        "psubd %%mm1, %%mm0                        \n\t" /* A2                a2 */\
        "pmaddwd 64(%2), %%mm2                        \n\t" /* -C7R3+C3R1        -C7r3+C3r1 */\
        "paddd %%mm5, %%mm7                        \n\t" /* B0                b0 */\
        "movq 72(%2), %%mm5                        \n\t" /* -C5        -C1        -C5        -C1 */\
        "pmaddwd %%mm3, %%mm5                        \n\t" /* -C5R7-C1R5        -C5r7-C1r5 */\
        "paddd %%mm4, %%mm7                        \n\t" /* A0+B0                a0+b0 */\
        "paddd %%mm4, %%mm4                        \n\t" /* 2A0                2a0 */\
        "psubd %%mm7, %%mm4                        \n\t" /* A0-B0                a0-b0 */\
        "paddd %%mm2, %%mm5                        \n\t" /* B1                b1 */\
        "psrad $" #shift ", %%mm7                \n\t"\
        "psrad $" #shift ", %%mm4                \n\t"\
        "movq %%mm1, %%mm2                        \n\t" /* A1                a1 */\
        "paddd %%mm5, %%mm1                        \n\t" /* A1+B1                a1+b1 */\
        "psubd %%mm5, %%mm2                        \n\t" /* A1-B1                a1-b1 */\
        "psrad $" #shift ", %%mm1                \n\t"\
        "psrad $" #shift ", %%mm2                \n\t"\
        "packssdw %%mm1, %%mm7                        \n\t" /* A1+B1        a1+b1        A0+B0        a0+b0 */\
        "packssdw %%mm4, %%mm2                        \n\t" /* A0-B0        a0-b0        A1-B1        a1-b1 */\
        "movq %%mm7, " #dst "                        \n\t"\
        "movq " #src1 ", %%mm1                        \n\t" /* R3        R1        r3        r1 */\
        "movq 80(%2), %%mm4                        \n\t" /* -C1        C5        -C1         C5 */\
        "movq %%mm2, 24+" #dst "                \n\t"\
        "pmaddwd %%mm1, %%mm4                        \n\t" /* -C1R3+C5R1        -C1r3+C5r1 */\
        "movq 88(%2), %%mm7                        \n\t" /* C3        C7        C3         C7 */\
        "pmaddwd 96(%2), %%mm1                        \n\t" /* -C5R3+C7R1        -C5r3+C7r1 */\
        "pmaddwd %%mm3, %%mm7                        \n\t" /* C3R7+C7R5        C3r7+C7r5 */\
        "movq %%mm0, %%mm2                        \n\t" /* A2                a2 */\
        "pmaddwd 104(%2), %%mm3                        \n\t" /* -C1R7+C3R5        -C1r7+C3r5 */\
        "paddd %%mm7, %%mm4                        \n\t" /* B2                b2 */\
        "paddd %%mm4, %%mm2                        \n\t" /* A2+B2                a2+b2 */\
        "psubd %%mm4, %%mm0                        \n\t" /* a2-B2                a2-b2 */\
        "psrad $" #shift ", %%mm2                \n\t"\
        "psrad $" #shift ", %%mm0                \n\t"\
        "movq %%mm6, %%mm4                        \n\t" /* A3                a3 */\
        "paddd %%mm1, %%mm3                        \n\t" /* B3                b3 */\
        "paddd %%mm3, %%mm6                        \n\t" /* A3+B3                a3+b3 */\
        "psubd %%mm3, %%mm4                        \n\t" /* a3-B3                a3-b3 */\
        "psrad $" #shift ", %%mm6                \n\t"\
        "packssdw %%mm6, %%mm2                        \n\t" /* A3+B3        a3+b3        A2+B2        a2+b2 */\
        "movq %%mm2, 8+" #dst "                        \n\t"\
        "psrad $" #shift ", %%mm4                \n\t"\
        "packssdw %%mm0, %%mm4                        \n\t" /* A2-B2        a2-b2        A3-B3        a3-b3 */\
        "movq %%mm4, 16+" #dst "                \n\t"\
        "jmp 2f                                        \n\t"\
        "1:                                        \n\t"\
        "pslld $16, %%mm0                        \n\t"\
        "paddd "MANGLE(d40000)", %%mm0                \n\t"\
        "psrad $13, %%mm0                        \n\t"\
        "packssdw %%mm0, %%mm0                        \n\t"\
        "movq %%mm0, " #dst "                        \n\t"\
        "movq %%mm0, 8+" #dst "                        \n\t"\
        "movq %%mm0, 16+" #dst "                \n\t"\
        "movq %%mm0, 24+" #dst "                \n\t"\
        "2:                                        \n\t"
556 37e8dcda Arpi
557 0a8d8945 Michael Niedermayer
/* Row IDCT pass with an all-zero-row bailout.
 * Zero test: ORs all four input qwords together and jumps to the caller-supplied
 * local label `bt` when the whole row is zero, skipping the row entirely.
 * Otherwise performs the same arithmetic as DC_COND_IDCT's main path and writes
 * four packed-word qwords to dst, 8+dst, 16+dst, 24+dst.
 * %2 points to the coefficient table; clobbers mm0-mm7 and eax. */
#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
        "movq " #src4 ", %%mm1                        \n\t" /* R6        R2        r6        r2 */\
        "movq " #src1 ", %%mm2                        \n\t" /* R3        R1        r3        r1 */\
        "movq " #src5 ", %%mm3                        \n\t" /* R7        R5        r7        r5 */\
        "movq %%mm0, %%mm4                        \n\t"\
        "por %%mm1, %%mm4                        \n\t"\
        "por %%mm2, %%mm4                        \n\t"\
        "por %%mm3, %%mm4                        \n\t"\
        "packssdw %%mm4,%%mm4                        \n\t"\
        "movd %%mm4, %%eax                        \n\t"\
        "orl %%eax, %%eax                        \n\t"\
        "jz " #bt "                                \n\t"\
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
        "movq 32(%2), %%mm5                        \n\t" /* C6        C2        C6        C2 */\
        "pmaddwd %%mm1, %%mm5                        \n\t" /* C6R6+C2R2        C6r6+C2r2 */\
        "movq 40(%2), %%mm6                        \n\t" /* -C2        C6        -C2        C6 */\
        "pmaddwd %%mm6, %%mm1                        \n\t" /* -C2R6+C6R2        -C2r6+C6r2 */\
        "movq 48(%2), %%mm7                        \n\t" /* C3        C1        C3        C1 */\
        "pmaddwd %%mm2, %%mm7                        \n\t" /* C3R3+C1R1        C3r3+C1r1 */\
        #rounder ", %%mm4                        \n\t"\
        "movq %%mm4, %%mm6                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
        "paddd %%mm5, %%mm4                        \n\t" /* A0                a0 */\
        "psubd %%mm5, %%mm6                        \n\t" /* A3                a3 */\
        "movq 56(%2), %%mm5                        \n\t" /* C7        C5        C7        C5 */\
        "pmaddwd %%mm3, %%mm5                        \n\t" /* C7R7+C5R5        C7r7+C5r5 */\
        #rounder ", %%mm0                        \n\t"\
        "paddd %%mm0, %%mm1                        \n\t" /* A1                a1 */\
        "paddd %%mm0, %%mm0                        \n\t" \
        "psubd %%mm1, %%mm0                        \n\t" /* A2                a2 */\
        "pmaddwd 64(%2), %%mm2                        \n\t" /* -C7R3+C3R1        -C7r3+C3r1 */\
        "paddd %%mm5, %%mm7                        \n\t" /* B0                b0 */\
        "movq 72(%2), %%mm5                        \n\t" /* -C5        -C1        -C5        -C1 */\
        "pmaddwd %%mm3, %%mm5                        \n\t" /* -C5R7-C1R5        -C5r7-C1r5 */\
        "paddd %%mm4, %%mm7                        \n\t" /* A0+B0                a0+b0 */\
        "paddd %%mm4, %%mm4                        \n\t" /* 2A0                2a0 */\
        "psubd %%mm7, %%mm4                        \n\t" /* A0-B0                a0-b0 */\
        "paddd %%mm2, %%mm5                        \n\t" /* B1                b1 */\
        "psrad $" #shift ", %%mm7                \n\t"\
        "psrad $" #shift ", %%mm4                \n\t"\
        "movq %%mm1, %%mm2                        \n\t" /* A1                a1 */\
        "paddd %%mm5, %%mm1                        \n\t" /* A1+B1                a1+b1 */\
        "psubd %%mm5, %%mm2                        \n\t" /* A1-B1                a1-b1 */\
        "psrad $" #shift ", %%mm1                \n\t"\
        "psrad $" #shift ", %%mm2                \n\t"\
        "packssdw %%mm1, %%mm7                        \n\t" /* A1+B1        a1+b1        A0+B0        a0+b0 */\
        "packssdw %%mm4, %%mm2                        \n\t" /* A0-B0        a0-b0        A1-B1        a1-b1 */\
        "movq %%mm7, " #dst "                        \n\t"\
        "movq " #src1 ", %%mm1                        \n\t" /* R3        R1        r3        r1 */\
        "movq 80(%2), %%mm4                        \n\t" /* -C1        C5        -C1         C5 */\
        "movq %%mm2, 24+" #dst "                \n\t"\
        "pmaddwd %%mm1, %%mm4                        \n\t" /* -C1R3+C5R1        -C1r3+C5r1 */\
        "movq 88(%2), %%mm7                        \n\t" /* C3        C7        C3         C7 */\
        "pmaddwd 96(%2), %%mm1                        \n\t" /* -C5R3+C7R1        -C5r3+C7r1 */\
        "pmaddwd %%mm3, %%mm7                        \n\t" /* C3R7+C7R5        C3r7+C7r5 */\
        "movq %%mm0, %%mm2                        \n\t" /* A2                a2 */\
        "pmaddwd 104(%2), %%mm3                        \n\t" /* -C1R7+C3R5        -C1r7+C3r5 */\
        "paddd %%mm7, %%mm4                        \n\t" /* B2                b2 */\
        "paddd %%mm4, %%mm2                        \n\t" /* A2+B2                a2+b2 */\
        "psubd %%mm4, %%mm0                        \n\t" /* a2-B2                a2-b2 */\
        "psrad $" #shift ", %%mm2                \n\t"\
        "psrad $" #shift ", %%mm0                \n\t"\
        "movq %%mm6, %%mm4                        \n\t" /* A3                a3 */\
        "paddd %%mm1, %%mm3                        \n\t" /* B3                b3 */\
        "paddd %%mm3, %%mm6                        \n\t" /* A3+B3                a3+b3 */\
        "psubd %%mm3, %%mm4                        \n\t" /* a3-B3                a3-b3 */\
        "psrad $" #shift ", %%mm6                \n\t"\
        "packssdw %%mm6, %%mm2                        \n\t" /* A3+B3        a3+b3        A2+B2        a2+b2 */\
        "movq %%mm2, 8+" #dst "                        \n\t"\
        "psrad $" #shift ", %%mm4                \n\t"\
        "packssdw %%mm0, %%mm4                        \n\t" /* A2-B2        a2-b2        A3-B3        a3-b3 */\
        "movq %%mm4, 16+" #dst "                \n\t"
632
633
/* Unconditional row IDCT pass (no zero test, no branches).
 * Same arithmetic as the main path of DC_COND_IDCT/Z_COND_IDCT: computes the
 * even-part terms A0..A3 and odd-part terms B0..B3, shifts by `shift`, and
 * writes the saturated packed-word results to dst, 8+dst, 16+dst, 24+dst.
 * %2 points to the coefficient table; clobbers mm0-mm7. */
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
        "movq " #src4 ", %%mm1                        \n\t" /* R6        R2        r6        r2 */\
        "movq " #src1 ", %%mm2                        \n\t" /* R3        R1        r3        r1 */\
        "movq " #src5 ", %%mm3                        \n\t" /* R7        R5        r7        r5 */\
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
        "movq 32(%2), %%mm5                        \n\t" /* C6        C2        C6        C2 */\
        "pmaddwd %%mm1, %%mm5                        \n\t" /* C6R6+C2R2        C6r6+C2r2 */\
        "movq 40(%2), %%mm6                        \n\t" /* -C2        C6        -C2        C6 */\
        "pmaddwd %%mm6, %%mm1                        \n\t" /* -C2R6+C6R2        -C2r6+C6r2 */\
        "movq 48(%2), %%mm7                        \n\t" /* C3        C1        C3        C1 */\
        "pmaddwd %%mm2, %%mm7                        \n\t" /* C3R3+C1R1        C3r3+C1r1 */\
        #rounder ", %%mm4                        \n\t"\
        "movq %%mm4, %%mm6                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
        "paddd %%mm5, %%mm4                        \n\t" /* A0                a0 */\
        "psubd %%mm5, %%mm6                        \n\t" /* A3                a3 */\
        "movq 56(%2), %%mm5                        \n\t" /* C7        C5        C7        C5 */\
        "pmaddwd %%mm3, %%mm5                        \n\t" /* C7R7+C5R5        C7r7+C5r5 */\
        #rounder ", %%mm0                        \n\t"\
        "paddd %%mm0, %%mm1                        \n\t" /* A1                a1 */\
        "paddd %%mm0, %%mm0                        \n\t" \
        "psubd %%mm1, %%mm0                        \n\t" /* A2                a2 */\
        "pmaddwd 64(%2), %%mm2                        \n\t" /* -C7R3+C3R1        -C7r3+C3r1 */\
        "paddd %%mm5, %%mm7                        \n\t" /* B0                b0 */\
        "movq 72(%2), %%mm5                        \n\t" /* -C5        -C1        -C5        -C1 */\
        "pmaddwd %%mm3, %%mm5                        \n\t" /* -C5R7-C1R5        -C5r7-C1r5 */\
        "paddd %%mm4, %%mm7                        \n\t" /* A0+B0                a0+b0 */\
        "paddd %%mm4, %%mm4                        \n\t" /* 2A0                2a0 */\
        "psubd %%mm7, %%mm4                        \n\t" /* A0-B0                a0-b0 */\
        "paddd %%mm2, %%mm5                        \n\t" /* B1                b1 */\
        "psrad $" #shift ", %%mm7                \n\t"\
        "psrad $" #shift ", %%mm4                \n\t"\
        "movq %%mm1, %%mm2                        \n\t" /* A1                a1 */\
        "paddd %%mm5, %%mm1                        \n\t" /* A1+B1                a1+b1 */\
        "psubd %%mm5, %%mm2                        \n\t" /* A1-B1                a1-b1 */\
        "psrad $" #shift ", %%mm1                \n\t"\
        "psrad $" #shift ", %%mm2                \n\t"\
        "packssdw %%mm1, %%mm7                        \n\t" /* A1+B1        a1+b1        A0+B0        a0+b0 */\
        "packssdw %%mm4, %%mm2                        \n\t" /* A0-B0        a0-b0        A1-B1        a1-b1 */\
        "movq %%mm7, " #dst "                        \n\t"\
        "movq " #src1 ", %%mm1                        \n\t" /* R3        R1        r3        r1 */\
        "movq 80(%2), %%mm4                        \n\t" /* -C1        C5        -C1         C5 */\
        "movq %%mm2, 24+" #dst "                \n\t"\
        "pmaddwd %%mm1, %%mm4                        \n\t" /* -C1R3+C5R1        -C1r3+C5r1 */\
        "movq 88(%2), %%mm7                        \n\t" /* C3        C7        C3         C7 */\
        "pmaddwd 96(%2), %%mm1                        \n\t" /* -C5R3+C7R1        -C5r3+C7r1 */\
        "pmaddwd %%mm3, %%mm7                        \n\t" /* C3R7+C7R5        C3r7+C7r5 */\
        "movq %%mm0, %%mm2                        \n\t" /* A2                a2 */\
        "pmaddwd 104(%2), %%mm3                        \n\t" /* -C1R7+C3R5        -C1r7+C3r5 */\
        "paddd %%mm7, %%mm4                        \n\t" /* B2                b2 */\
        "paddd %%mm4, %%mm2                        \n\t" /* A2+B2                a2+b2 */\
        "psubd %%mm4, %%mm0                        \n\t" /* a2-B2                a2-b2 */\
        "psrad $" #shift ", %%mm2                \n\t"\
        "psrad $" #shift ", %%mm0                \n\t"\
        "movq %%mm6, %%mm4                        \n\t" /* A3                a3 */\
        "paddd %%mm1, %%mm3                        \n\t" /* B3                b3 */\
        "paddd %%mm3, %%mm6                        \n\t" /* A3+B3                a3+b3 */\
        "psubd %%mm3, %%mm4                        \n\t" /* a3-B3                a3-b3 */\
        "psrad $" #shift ", %%mm6                \n\t"\
        "packssdw %%mm6, %%mm2                        \n\t" /* A3+B3        a3+b3        A2+B2        a2+b2 */\
        "movq %%mm2, 8+" #dst "                        \n\t"\
        "psrad $" #shift ", %%mm4                \n\t"\
        "packssdw %%mm0, %%mm4                        \n\t" /* A2-B2        a2-b2        A3-B3        a3-b3 */\
        "movq %%mm4, 16+" #dst "                \n\t"
700
701
//IDCT(         src0,   src4,   src1,   src5,    dst,   rounder, shift)
702
DC_COND_IDCT(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
703
Z_COND_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
704
Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
705
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
706
707
/* Column (vertical) IDCT pass over one pair of columns, all four source
 * qwords live.  Results are saturated to 16 bits (packssdw) and stored as
 * single dwords with a 16-byte stride — dst, 16+dst, 32+dst, ..., 112+dst —
 * i.e. one result per output row.  %2 points to the coefficient table;
 * clobbers mm0-mm7. */
#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
        "movq " #src4 ", %%mm1                        \n\t" /* R6        R2        r6        r2 */\
        "movq " #src1 ", %%mm2                        \n\t" /* R3        R1        r3        r1 */\
        "movq " #src5 ", %%mm3                        \n\t" /* R7        R5        r7        r5 */\
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
        "movq 32(%2), %%mm5                        \n\t" /* C6        C2        C6        C2 */\
        "pmaddwd %%mm1, %%mm5                        \n\t" /* C6R6+C2R2        C6r6+C2r2 */\
        "movq 40(%2), %%mm6                        \n\t" /* -C2        C6        -C2        C6 */\
        "pmaddwd %%mm6, %%mm1                        \n\t" /* -C2R6+C6R2        -C2r6+C6r2 */\
        #rounder ", %%mm4                        \n\t"\
        "movq %%mm4, %%mm6                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
        "movq 48(%2), %%mm7                        \n\t" /* C3        C1        C3        C1 */\
        #rounder ", %%mm0                        \n\t"\
        "pmaddwd %%mm2, %%mm7                        \n\t" /* C3R3+C1R1        C3r3+C1r1 */\
        "paddd %%mm5, %%mm4                        \n\t" /* A0                a0 */\
        "psubd %%mm5, %%mm6                        \n\t" /* A3                a3 */\
        "movq %%mm0, %%mm5                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
        "paddd %%mm1, %%mm0                        \n\t" /* A1                a1 */\
        "psubd %%mm1, %%mm5                        \n\t" /* A2                a2 */\
        "movq 56(%2), %%mm1                        \n\t" /* C7        C5        C7        C5 */\
        "pmaddwd %%mm3, %%mm1                        \n\t" /* C7R7+C5R5        C7r7+C5r5 */\
        "pmaddwd 64(%2), %%mm2                        \n\t" /* -C7R3+C3R1        -C7r3+C3r1 */\
        "paddd %%mm1, %%mm7                        \n\t" /* B0                b0 */\
        "movq 72(%2), %%mm1                        \n\t" /* -C5        -C1        -C5        -C1 */\
        "pmaddwd %%mm3, %%mm1                        \n\t" /* -C5R7-C1R5        -C5r7-C1r5 */\
        "paddd %%mm4, %%mm7                        \n\t" /* A0+B0                a0+b0 */\
        "paddd %%mm4, %%mm4                        \n\t" /* 2A0                2a0 */\
        "psubd %%mm7, %%mm4                        \n\t" /* A0-B0                a0-b0 */\
        "paddd %%mm2, %%mm1                        \n\t" /* B1                b1 */\
        "psrad $" #shift ", %%mm7                \n\t"\
        "psrad $" #shift ", %%mm4                \n\t"\
        "movq %%mm0, %%mm2                        \n\t" /* A1                a1 */\
        "paddd %%mm1, %%mm0                        \n\t" /* A1+B1                a1+b1 */\
        "psubd %%mm1, %%mm2                        \n\t" /* A1-B1                a1-b1 */\
        "psrad $" #shift ", %%mm0                \n\t"\
        "psrad $" #shift ", %%mm2                \n\t"\
        "packssdw %%mm7, %%mm7                        \n\t" /* A0+B0        a0+b0 */\
        "movd %%mm7, " #dst "                        \n\t"\
        "packssdw %%mm0, %%mm0                        \n\t" /* A1+B1        a1+b1 */\
        "movd %%mm0, 16+" #dst "                \n\t"\
        "packssdw %%mm2, %%mm2                        \n\t" /* A1-B1        a1-b1 */\
        "movd %%mm2, 96+" #dst "                \n\t"\
        "packssdw %%mm4, %%mm4                        \n\t" /* A0-B0        a0-b0 */\
        "movd %%mm4, 112+" #dst "                \n\t"\
        "movq " #src1 ", %%mm0                        \n\t" /* R3        R1        r3        r1 */\
        "movq 80(%2), %%mm4                        \n\t" /* -C1        C5        -C1         C5 */\
        "pmaddwd %%mm0, %%mm4                        \n\t" /* -C1R3+C5R1        -C1r3+C5r1 */\
        "movq 88(%2), %%mm7                        \n\t" /* C3        C7        C3         C7 */\
        "pmaddwd 96(%2), %%mm0                        \n\t" /* -C5R3+C7R1        -C5r3+C7r1 */\
        "pmaddwd %%mm3, %%mm7                        \n\t" /* C3R7+C7R5        C3r7+C7r5 */\
        "movq %%mm5, %%mm2                        \n\t" /* A2                a2 */\
        "pmaddwd 104(%2), %%mm3                        \n\t" /* -C1R7+C3R5        -C1r7+C3r5 */\
        "paddd %%mm7, %%mm4                        \n\t" /* B2                b2 */\
        "paddd %%mm4, %%mm2                        \n\t" /* A2+B2                a2+b2 */\
        "psubd %%mm4, %%mm5                        \n\t" /* a2-B2                a2-b2 */\
        "psrad $" #shift ", %%mm2                \n\t"\
        "psrad $" #shift ", %%mm5                \n\t"\
        "movq %%mm6, %%mm4                        \n\t" /* A3                a3 */\
        "paddd %%mm0, %%mm3                        \n\t" /* B3                b3 */\
        "paddd %%mm3, %%mm6                        \n\t" /* A3+B3                a3+b3 */\
        "psubd %%mm3, %%mm4                        \n\t" /* a3-B3                a3-b3 */\
        "psrad $" #shift ", %%mm6                \n\t"\
        "psrad $" #shift ", %%mm4                \n\t"\
        "packssdw %%mm2, %%mm2                        \n\t" /* A2+B2        a2+b2 */\
        "packssdw %%mm6, %%mm6                        \n\t" /* A3+B3        a3+b3 */\
        "movd %%mm2, 32+" #dst "                \n\t"\
        "packssdw %%mm4, %%mm4                        \n\t" /* A3-B3        a3-b3 */\
        "packssdw %%mm5, %%mm5                        \n\t" /* A2-B2        a2-b2 */\
        "movd %%mm6, 48+" #dst "                \n\t"\
        "movd %%mm4, 64+" #dst "                \n\t"\
        "movd %%mm5, 80+" #dst "                \n\t"
783
784
785
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
786
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
787
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
788
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
789
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
790 37e8dcda Arpi
        "jmp 9f                                        \n\t"
791
792
        "#.balign 16                                \n\t"\
793
        "4:                                        \n\t"
794 0a8d8945 Michael Niedermayer
Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
795
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
796 37e8dcda Arpi
797 0a8d8945 Michael Niedermayer
/* Column IDCT variant for the case where the coefficients src1 would supply
 * are zero: src1 is accepted for signature compatibility with the other IDCT
 * definitions but never read, and every B term is built from src5 (mm3) only.
 * Output layout matches the full column IDCT above: saturated dwords at
 * dst, 16+dst, ..., 112+dst.  %2 points to the coefficient table;
 * clobbers mm0-mm7. */
#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
        "movq " #src4 ", %%mm1                        \n\t" /* R6        R2        r6        r2 */\
        "movq " #src5 ", %%mm3                        \n\t" /* R7        R5        r7        r5 */\
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
        "movq 32(%2), %%mm5                        \n\t" /* C6        C2        C6        C2 */\
        "pmaddwd %%mm1, %%mm5                        \n\t" /* C6R6+C2R2        C6r6+C2r2 */\
        "movq 40(%2), %%mm6                        \n\t" /* -C2        C6        -C2        C6 */\
        "pmaddwd %%mm6, %%mm1                        \n\t" /* -C2R6+C6R2        -C2r6+C6r2 */\
        #rounder ", %%mm4                        \n\t"\
        "movq %%mm4, %%mm6                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
        #rounder ", %%mm0                        \n\t"\
        "paddd %%mm5, %%mm4                        \n\t" /* A0                a0 */\
        "psubd %%mm5, %%mm6                        \n\t" /* A3                a3 */\
        "movq %%mm0, %%mm5                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
        "paddd %%mm1, %%mm0                        \n\t" /* A1                a1 */\
        "psubd %%mm1, %%mm5                        \n\t" /* A2                a2 */\
        "movq 56(%2), %%mm1                        \n\t" /* C7        C5        C7        C5 */\
        "pmaddwd %%mm3, %%mm1                        \n\t" /* C7R7+C5R5        C7r7+C5r5 */\
        "movq 72(%2), %%mm7                        \n\t" /* -C5        -C1        -C5        -C1 */\
        "pmaddwd %%mm3, %%mm7                        \n\t" /* -C5R7-C1R5        -C5r7-C1r5 */\
        "paddd %%mm4, %%mm1                        \n\t" /* A0+B0                a0+b0 */\
        "paddd %%mm4, %%mm4                        \n\t" /* 2A0                2a0 */\
        "psubd %%mm1, %%mm4                        \n\t" /* A0-B0                a0-b0 */\
        "psrad $" #shift ", %%mm1                \n\t"\
        "psrad $" #shift ", %%mm4                \n\t"\
        "movq %%mm0, %%mm2                        \n\t" /* A1                a1 */\
        "paddd %%mm7, %%mm0                        \n\t" /* A1+B1                a1+b1 */\
        "psubd %%mm7, %%mm2                        \n\t" /* A1-B1                a1-b1 */\
        "psrad $" #shift ", %%mm0                \n\t"\
        "psrad $" #shift ", %%mm2                \n\t"\
        "packssdw %%mm1, %%mm1                        \n\t" /* A0+B0        a0+b0 */\
        "movd %%mm1, " #dst "                        \n\t"\
        "packssdw %%mm0, %%mm0                        \n\t" /* A1+B1        a1+b1 */\
        "movd %%mm0, 16+" #dst "                \n\t"\
        "packssdw %%mm2, %%mm2                        \n\t" /* A1-B1        a1-b1 */\
        "movd %%mm2, 96+" #dst "                \n\t"\
        "packssdw %%mm4, %%mm4                        \n\t" /* A0-B0        a0-b0 */\
        "movd %%mm4, 112+" #dst "                \n\t"\
        "movq 88(%2), %%mm1                        \n\t" /* C3        C7        C3         C7 */\
        "pmaddwd %%mm3, %%mm1                        \n\t" /* C3R7+C7R5        C3r7+C7r5 */\
        "movq %%mm5, %%mm2                        \n\t" /* A2                a2 */\
        "pmaddwd 104(%2), %%mm3                        \n\t" /* -C1R7+C3R5        -C1r7+C3r5 */\
        "paddd %%mm1, %%mm2                        \n\t" /* A2+B2                a2+b2 */\
        "psubd %%mm1, %%mm5                        \n\t" /* a2-B2                a2-b2 */\
        "psrad $" #shift ", %%mm2                \n\t"\
        "psrad $" #shift ", %%mm5                \n\t"\
        "movq %%mm6, %%mm1                        \n\t" /* A3                a3 */\
        "paddd %%mm3, %%mm6                        \n\t" /* A3+B3                a3+b3 */\
        "psubd %%mm3, %%mm1                        \n\t" /* a3-B3                a3-b3 */\
        "psrad $" #shift ", %%mm6                \n\t"\
        "psrad $" #shift ", %%mm1                \n\t"\
        "packssdw %%mm2, %%mm2                        \n\t" /* A2+B2        a2+b2 */\
        "packssdw %%mm6, %%mm6                        \n\t" /* A3+B3        a3+b3 */\
        "movd %%mm2, 32+" #dst "                \n\t"\
        "packssdw %%mm1, %%mm1                        \n\t" /* A3-B3        a3-b3 */\
        "packssdw %%mm5, %%mm5                        \n\t" /* A2-B2        a2-b2 */\
        "movd %%mm6, 48+" #dst "                \n\t"\
        "movd %%mm1, 64+" #dst "                \n\t"\
        "movd %%mm5, 80+" #dst "                \n\t"
861
862
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
863
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
864
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
865
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
866
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
867 37e8dcda Arpi
        "jmp 9f                                        \n\t"
868
869
        "#.balign 16                                \n\t"\
870
        "6:                                        \n\t"
871 0a8d8945 Michael Niedermayer
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
872 37e8dcda Arpi
873 0a8d8945 Michael Niedermayer
#undef IDCT
874
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
875
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
876 37e8dcda Arpi
        "movq " #src5 ", %%mm3                        \n\t" /* R7        R5        r7        r5 */\
877 0a8d8945 Michael Niedermayer
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
878
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
879
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
880
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
881
        #rounder ", %%mm4                        \n\t"\
882
        "movq %%mm4, %%mm6                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
883
        #rounder ", %%mm0                        \n\t"\
884
        "movq %%mm0, %%mm5                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
885
        "movq 56(%2), %%mm1                        \n\t" /* C7        C5        C7        C5 */\
886
        "pmaddwd %%mm3, %%mm1                        \n\t" /* C7R7+C5R5        C7r7+C5r5 */\
887 37e8dcda Arpi
        "movq 72(%2), %%mm7                        \n\t" /* -C5        -C1        -C5        -C1 */\
888
        "pmaddwd %%mm3, %%mm7                        \n\t" /* -C5R7-C1R5        -C5r7-C1r5 */\
889 0a8d8945 Michael Niedermayer
        "paddd %%mm4, %%mm1                        \n\t" /* A0+B0                a0+b0 */\
890
        "paddd %%mm4, %%mm4                        \n\t" /* 2A0                2a0 */\
891
        "psubd %%mm1, %%mm4                        \n\t" /* A0-B0                a0-b0 */\
892
        "psrad $" #shift ", %%mm1                \n\t"\
893 37e8dcda Arpi
        "psrad $" #shift ", %%mm4                \n\t"\
894 0a8d8945 Michael Niedermayer
        "movq %%mm0, %%mm2                        \n\t" /* A1                a1 */\
895
        "paddd %%mm7, %%mm0                        \n\t" /* A1+B1                a1+b1 */\
896
        "psubd %%mm7, %%mm2                        \n\t" /* A1-B1                a1-b1 */\
897 37e8dcda Arpi
        "psrad $" #shift ", %%mm0                \n\t"\
898 0a8d8945 Michael Niedermayer
        "psrad $" #shift ", %%mm2                \n\t"\
899
        "packssdw %%mm1, %%mm1                        \n\t" /* A0+B0        a0+b0 */\
900
        "movd %%mm1, " #dst "                        \n\t"\
901
        "packssdw %%mm0, %%mm0                        \n\t" /* A1+B1        a1+b1 */\
902
        "movd %%mm0, 16+" #dst "                \n\t"\
903
        "packssdw %%mm2, %%mm2                        \n\t" /* A1-B1        a1-b1 */\
904
        "movd %%mm2, 96+" #dst "                \n\t"\
905
        "packssdw %%mm4, %%mm4                        \n\t" /* A0-B0        a0-b0 */\
906
        "movd %%mm4, 112+" #dst "                \n\t"\
907
        "movq 88(%2), %%mm1                        \n\t" /* C3        C7        C3         C7 */\
908
        "pmaddwd %%mm3, %%mm1                        \n\t" /* C3R7+C7R5        C3r7+C7r5 */\
909
        "movq %%mm5, %%mm2                        \n\t" /* A2                a2 */\
910
        "pmaddwd 104(%2), %%mm3                        \n\t" /* -C1R7+C3R5        -C1r7+C3r5 */\
911
        "paddd %%mm1, %%mm2                        \n\t" /* A2+B2                a2+b2 */\
912
        "psubd %%mm1, %%mm5                        \n\t" /* a2-B2                a2-b2 */\
913
        "psrad $" #shift ", %%mm2                \n\t"\
914
        "psrad $" #shift ", %%mm5                \n\t"\
915
        "movq %%mm6, %%mm1                        \n\t" /* A3                a3 */\
916
        "paddd %%mm3, %%mm6                        \n\t" /* A3+B3                a3+b3 */\
917
        "psubd %%mm3, %%mm1                        \n\t" /* a3-B3                a3-b3 */\
918
        "psrad $" #shift ", %%mm6                \n\t"\
919
        "psrad $" #shift ", %%mm1                \n\t"\
920
        "packssdw %%mm2, %%mm2                        \n\t" /* A2+B2        a2+b2 */\
921
        "packssdw %%mm6, %%mm6                        \n\t" /* A3+B3        a3+b3 */\
922
        "movd %%mm2, 32+" #dst "                \n\t"\
923
        "packssdw %%mm1, %%mm1                        \n\t" /* A3-B3        a3-b3 */\
924
        "packssdw %%mm5, %%mm5                        \n\t" /* A2-B2        a2-b2 */\
925
        "movd %%mm6, 48+" #dst "                \n\t"\
926
        "movd %%mm1, 64+" #dst "                \n\t"\
927
        "movd %%mm5, 80+" #dst "                \n\t"        
928
929
930
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
931
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
932
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
933
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
934
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
935 37e8dcda Arpi
        "jmp 9f                                        \n\t"
936
937
        "#.balign 16                                \n\t"\
938
        "2:                                        \n\t"
939 0a8d8945 Michael Niedermayer
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
940 37e8dcda Arpi
941 0a8d8945 Michael Niedermayer
#undef IDCT
942
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
943
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
944 37e8dcda Arpi
        "movq " #src1 ", %%mm2                        \n\t" /* R3        R1        r3        r1 */\
945
        "movq " #src5 ", %%mm3                        \n\t" /* R7        R5        r7        r5 */\
946 0a8d8945 Michael Niedermayer
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
947
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
948
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
949
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
950
        #rounder ", %%mm4                        \n\t"\
951
        "movq %%mm4, %%mm6                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
952
        "movq 48(%2), %%mm7                        \n\t" /* C3        C1        C3        C1 */\
953
        #rounder ", %%mm0                        \n\t"\
954
        "pmaddwd %%mm2, %%mm7                        \n\t" /* C3R3+C1R1        C3r3+C1r1 */\
955
        "movq %%mm0, %%mm5                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
956
        "movq 56(%2), %%mm1                        \n\t" /* C7        C5        C7        C5 */\
957
        "pmaddwd %%mm3, %%mm1                        \n\t" /* C7R7+C5R5        C7r7+C5r5 */\
958
        "pmaddwd 64(%2), %%mm2                        \n\t" /* -C7R3+C3R1        -C7r3+C3r1 */\
959
        "paddd %%mm1, %%mm7                        \n\t" /* B0                b0 */\
960
        "movq 72(%2), %%mm1                        \n\t" /* -C5        -C1        -C5        -C1 */\
961
        "pmaddwd %%mm3, %%mm1                        \n\t" /* -C5R7-C1R5        -C5r7-C1r5 */\
962
        "paddd %%mm4, %%mm7                        \n\t" /* A0+B0                a0+b0 */\
963 37e8dcda Arpi
        "paddd %%mm4, %%mm4                        \n\t" /* 2A0                2a0 */\
964 0a8d8945 Michael Niedermayer
        "psubd %%mm7, %%mm4                        \n\t" /* A0-B0                a0-b0 */\
965
        "paddd %%mm2, %%mm1                        \n\t" /* B1                b1 */\
966
        "psrad $" #shift ", %%mm7                \n\t"\
967 37e8dcda Arpi
        "psrad $" #shift ", %%mm4                \n\t"\
968 0a8d8945 Michael Niedermayer
        "movq %%mm0, %%mm2                        \n\t" /* A1                a1 */\
969
        "paddd %%mm1, %%mm0                        \n\t" /* A1+B1                a1+b1 */\
970
        "psubd %%mm1, %%mm2                        \n\t" /* A1-B1                a1-b1 */\
971
        "psrad $" #shift ", %%mm0                \n\t"\
972
        "psrad $" #shift ", %%mm2                \n\t"\
973
        "packssdw %%mm7, %%mm7                        \n\t" /* A0+B0        a0+b0 */\
974
        "movd %%mm7, " #dst "                        \n\t"\
975
        "packssdw %%mm0, %%mm0                        \n\t" /* A1+B1        a1+b1 */\
976
        "movd %%mm0, 16+" #dst "                \n\t"\
977
        "packssdw %%mm2, %%mm2                        \n\t" /* A1-B1        a1-b1 */\
978
        "movd %%mm2, 96+" #dst "                \n\t"\
979
        "packssdw %%mm4, %%mm4                        \n\t" /* A0-B0        a0-b0 */\
980
        "movd %%mm4, 112+" #dst "                \n\t"\
981
        "movq " #src1 ", %%mm0                        \n\t" /* R3        R1        r3        r1 */\
982
        "movq 80(%2), %%mm4                        \n\t" /* -C1        C5        -C1         C5 */\
983
        "pmaddwd %%mm0, %%mm4                        \n\t" /* -C1R3+C5R1        -C1r3+C5r1 */\
984
        "movq 88(%2), %%mm7                        \n\t" /* C3        C7        C3         C7 */\
985
        "pmaddwd 96(%2), %%mm0                        \n\t" /* -C5R3+C7R1        -C5r3+C7r1 */\
986 37e8dcda Arpi
        "pmaddwd %%mm3, %%mm7                        \n\t" /* C3R7+C7R5        C3r7+C7r5 */\
987 0a8d8945 Michael Niedermayer
        "movq %%mm5, %%mm2                        \n\t" /* A2                a2 */\
988
        "pmaddwd 104(%2), %%mm3                        \n\t" /* -C1R7+C3R5        -C1r7+C3r5 */\
989
        "paddd %%mm7, %%mm4                        \n\t" /* B2                b2 */\
990
        "paddd %%mm4, %%mm2                        \n\t" /* A2+B2                a2+b2 */\
991
        "psubd %%mm4, %%mm5                        \n\t" /* a2-B2                a2-b2 */\
992
        "psrad $" #shift ", %%mm2                \n\t"\
993
        "psrad $" #shift ", %%mm5                \n\t"\
994
        "movq %%mm6, %%mm4                        \n\t" /* A3                a3 */\
995
        "paddd %%mm0, %%mm3                        \n\t" /* B3                b3 */\
996
        "paddd %%mm3, %%mm6                        \n\t" /* A3+B3                a3+b3 */\
997
        "psubd %%mm3, %%mm4                        \n\t" /* a3-B3                a3-b3 */\
998 37e8dcda Arpi
        "psrad $" #shift ", %%mm6                \n\t"\
999
        "psrad $" #shift ", %%mm4                \n\t"\
1000 0a8d8945 Michael Niedermayer
        "packssdw %%mm2, %%mm2                        \n\t" /* A2+B2        a2+b2 */\
1001
        "packssdw %%mm6, %%mm6                        \n\t" /* A3+B3        a3+b3 */\
1002
        "movd %%mm2, 32+" #dst "                \n\t"\
1003
        "packssdw %%mm4, %%mm4                        \n\t" /* A3-B3        a3-b3 */\
1004
        "packssdw %%mm5, %%mm5                        \n\t" /* A2-B2        a2-b2 */\
1005
        "movd %%mm6, 48+" #dst "                \n\t"\
1006
        "movd %%mm4, 64+" #dst "                \n\t"\
1007
        "movd %%mm5, 80+" #dst "                \n\t"
1008
1009
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1010
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1011
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1012
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1013
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1014 37e8dcda Arpi
        "jmp 9f                                        \n\t"
1015
1016
        "#.balign 16                                \n\t"\
1017
        "3:                                        \n\t"
1018 0a8d8945 Michael Niedermayer
#undef IDCT
1019
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1020
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
1021 37e8dcda Arpi
        "movq " #src1 ", %%mm2                        \n\t" /* R3        R1        r3        r1 */\
1022 0a8d8945 Michael Niedermayer
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
1023
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
1024
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
1025
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
1026
        #rounder ", %%mm4                        \n\t"\
1027
        "movq %%mm4, %%mm6                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
1028
        "movq 48(%2), %%mm7                        \n\t" /* C3        C1        C3        C1 */\
1029
        #rounder ", %%mm0                        \n\t"\
1030
        "pmaddwd %%mm2, %%mm7                        \n\t" /* C3R3+C1R1        C3r3+C1r1 */\
1031
        "movq %%mm0, %%mm5                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
1032
        "movq 64(%2), %%mm3                        \n\t"\
1033
        "pmaddwd %%mm2, %%mm3                        \n\t" /* -C7R3+C3R1        -C7r3+C3r1 */\
1034
        "paddd %%mm4, %%mm7                        \n\t" /* A0+B0                a0+b0 */\
1035 37e8dcda Arpi
        "paddd %%mm4, %%mm4                        \n\t" /* 2A0                2a0 */\
1036 0a8d8945 Michael Niedermayer
        "psubd %%mm7, %%mm4                        \n\t" /* A0-B0                a0-b0 */\
1037
        "psrad $" #shift ", %%mm7                \n\t"\
1038 37e8dcda Arpi
        "psrad $" #shift ", %%mm4                \n\t"\
1039 0a8d8945 Michael Niedermayer
        "movq %%mm0, %%mm1                        \n\t" /* A1                a1 */\
1040
        "paddd %%mm3, %%mm0                        \n\t" /* A1+B1                a1+b1 */\
1041
        "psubd %%mm3, %%mm1                        \n\t" /* A1-B1                a1-b1 */\
1042
        "psrad $" #shift ", %%mm0                \n\t"\
1043
        "psrad $" #shift ", %%mm1                \n\t"\
1044
        "packssdw %%mm7, %%mm7                        \n\t" /* A0+B0        a0+b0 */\
1045
        "movd %%mm7, " #dst "                        \n\t"\
1046
        "packssdw %%mm0, %%mm0                        \n\t" /* A1+B1        a1+b1 */\
1047
        "movd %%mm0, 16+" #dst "                \n\t"\
1048
        "packssdw %%mm1, %%mm1                        \n\t" /* A1-B1        a1-b1 */\
1049
        "movd %%mm1, 96+" #dst "                \n\t"\
1050
        "packssdw %%mm4, %%mm4                        \n\t" /* A0-B0        a0-b0 */\
1051
        "movd %%mm4, 112+" #dst "                \n\t"\
1052
        "movq 80(%2), %%mm4                        \n\t" /* -C1        C5        -C1         C5 */\
1053
        "pmaddwd %%mm2, %%mm4                        \n\t" /* -C1R3+C5R1        -C1r3+C5r1 */\
1054
        "pmaddwd 96(%2), %%mm2                        \n\t" /* -C5R3+C7R1        -C5r3+C7r1 */\
1055
        "movq %%mm5, %%mm1                        \n\t" /* A2                a2 */\
1056
        "paddd %%mm4, %%mm1                        \n\t" /* A2+B2                a2+b2 */\
1057
        "psubd %%mm4, %%mm5                        \n\t" /* a2-B2                a2-b2 */\
1058
        "psrad $" #shift ", %%mm1                \n\t"\
1059 37e8dcda Arpi
        "psrad $" #shift ", %%mm5                \n\t"\
1060 0a8d8945 Michael Niedermayer
        "movq %%mm6, %%mm4                        \n\t" /* A3                a3 */\
1061
        "paddd %%mm2, %%mm6                        \n\t" /* A3+B3                a3+b3 */\
1062
        "psubd %%mm2, %%mm4                        \n\t" /* a3-B3                a3-b3 */\
1063 37e8dcda Arpi
        "psrad $" #shift ", %%mm6                \n\t"\
1064
        "psrad $" #shift ", %%mm4                \n\t"\
1065 0a8d8945 Michael Niedermayer
        "packssdw %%mm1, %%mm1                        \n\t" /* A2+B2        a2+b2 */\
1066
        "packssdw %%mm6, %%mm6                        \n\t" /* A3+B3        a3+b3 */\
1067
        "movd %%mm1, 32+" #dst "                \n\t"\
1068
        "packssdw %%mm4, %%mm4                        \n\t" /* A3-B3        a3-b3 */\
1069
        "packssdw %%mm5, %%mm5                        \n\t" /* A2-B2        a2-b2 */\
1070
        "movd %%mm6, 48+" #dst "                \n\t"\
1071
        "movd %%mm4, 64+" #dst "                \n\t"\
1072
        "movd %%mm5, 80+" #dst "                \n\t"
1073
1074
1075
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1076
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1077
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1078
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1079
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1080 37e8dcda Arpi
        "jmp 9f                                        \n\t"
1081
1082
        "#.balign 16                                \n\t"\
1083
        "5:                                        \n\t"
1084 0a8d8945 Michael Niedermayer
#undef IDCT
1085
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1086
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
1087
        "movq " #src4 ", %%mm1                        \n\t" /* R6        R2        r6        r2 */\
1088
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
1089
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
1090
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
1091
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
1092
        "movq 32(%2), %%mm5                        \n\t" /* C6        C2        C6        C2 */\
1093
        "pmaddwd %%mm1, %%mm5                        \n\t" /* C6R6+C2R2        C6r6+C2r2 */\
1094
        "movq 40(%2), %%mm6                        \n\t" /* -C2        C6        -C2        C6 */\
1095
        "pmaddwd %%mm6, %%mm1                        \n\t" /* -C2R6+C6R2        -C2r6+C6r2 */\
1096
        #rounder ", %%mm4                        \n\t"\
1097
        "movq %%mm4, %%mm6                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
1098 37e8dcda Arpi
        "paddd %%mm5, %%mm4                        \n\t" /* A0                a0 */\
1099 0a8d8945 Michael Niedermayer
        #rounder ", %%mm0                        \n\t"\
1100
        "psubd %%mm5, %%mm6                        \n\t" /* A3                a3 */\
1101
        "movq %%mm0, %%mm5                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
1102
        "paddd %%mm1, %%mm0                        \n\t" /* A1                a1 */\
1103
        "psubd %%mm1, %%mm5                        \n\t" /* A2                a2 */\
1104
        "movq 8+" #src0 ", %%mm2                \n\t" /* R4        R0        r4        r0 */\
1105
        "movq 8+" #src4 ", %%mm3                \n\t" /* R6        R2        r6        r2 */\
1106
        "movq 16(%2), %%mm1                        \n\t" /* C4        C4        C4        C4 */\
1107
        "pmaddwd %%mm2, %%mm1                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
1108
        "movq 24(%2), %%mm7                        \n\t" /* -C4        C4        -C4        C4 */\
1109
        "pmaddwd %%mm7, %%mm2                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
1110
        "movq 32(%2), %%mm7                        \n\t" /* C6        C2        C6        C2 */\
1111
        "pmaddwd %%mm3, %%mm7                        \n\t" /* C6R6+C2R2        C6r6+C2r2 */\
1112
        "pmaddwd 40(%2), %%mm3                        \n\t" /* -C2R6+C6R2        -C2r6+C6r2 */\
1113
        #rounder ", %%mm1                        \n\t"\
1114
        "paddd %%mm1, %%mm7                        \n\t" /* A0                a0 */\
1115
        "paddd %%mm1, %%mm1                        \n\t" /* 2C0                2c0 */\
1116
        #rounder ", %%mm2                        \n\t"\
1117
        "psubd %%mm7, %%mm1                        \n\t" /* A3                a3 */\
1118
        "paddd %%mm2, %%mm3                        \n\t" /* A1                a1 */\
1119
        "paddd %%mm2, %%mm2                        \n\t" /* 2C1                2c1 */\
1120
        "psubd %%mm3, %%mm2                        \n\t" /* A2                a2 */\
1121 37e8dcda Arpi
        "psrad $" #shift ", %%mm4                \n\t"\
1122
        "psrad $" #shift ", %%mm7                \n\t"\
1123 0a8d8945 Michael Niedermayer
        "psrad $" #shift ", %%mm3                \n\t"\
1124
        "packssdw %%mm7, %%mm4                        \n\t" /* A0        a0 */\
1125
        "movq %%mm4, " #dst "                        \n\t"\
1126 37e8dcda Arpi
        "psrad $" #shift ", %%mm0                \n\t"\
1127 0a8d8945 Michael Niedermayer
        "packssdw %%mm3, %%mm0                        \n\t" /* A1        a1 */\
1128
        "movq %%mm0, 16+" #dst "                \n\t"\
1129
        "movq %%mm0, 96+" #dst "                \n\t"\
1130
        "movq %%mm4, 112+" #dst "                \n\t"\
1131
        "psrad $" #shift ", %%mm5                \n\t"\
1132
        "psrad $" #shift ", %%mm6                \n\t"\
1133 37e8dcda Arpi
        "psrad $" #shift ", %%mm2                \n\t"\
1134 0a8d8945 Michael Niedermayer
        "packssdw %%mm2, %%mm5                        \n\t" /* A2-B2        a2-b2 */\
1135
        "movq %%mm5, 32+" #dst "                \n\t"\
1136
        "psrad $" #shift ", %%mm1                \n\t"\
1137
        "packssdw %%mm1, %%mm6                        \n\t" /* A3+B3        a3+b3 */\
1138
        "movq %%mm6, 48+" #dst "                \n\t"\
1139
        "movq %%mm6, 64+" #dst "                \n\t"\
1140
        "movq %%mm5, 80+" #dst "                \n\t"        
1141
        
1142
1143
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1144
IDCT(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1145
//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1146
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1147
//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1148 37e8dcda Arpi
        "jmp 9f                                        \n\t"
1149
1150
1151
        "#.balign 16                                \n\t"\
1152
        "1:                                        \n\t"
1153 0a8d8945 Michael Niedermayer
#undef IDCT
1154
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1155
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
1156
        "movq " #src4 ", %%mm1                        \n\t" /* R6        R2        r6        r2 */\
1157 37e8dcda Arpi
        "movq " #src1 ", %%mm2                        \n\t" /* R3        R1        r3        r1 */\
1158 0a8d8945 Michael Niedermayer
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
1159
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
1160
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
1161
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
1162
        "movq 32(%2), %%mm5                        \n\t" /* C6        C2        C6        C2 */\
1163
        "pmaddwd %%mm1, %%mm5                        \n\t" /* C6R6+C2R2        C6r6+C2r2 */\
1164
        "movq 40(%2), %%mm6                        \n\t" /* -C2        C6        -C2        C6 */\
1165
        "pmaddwd %%mm6, %%mm1                        \n\t" /* -C2R6+C6R2        -C2r6+C6r2 */\
1166
        #rounder ", %%mm4                        \n\t"\
1167
        "movq %%mm4, %%mm6                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
1168
        "movq 48(%2), %%mm7                        \n\t" /* C3        C1        C3        C1 */\
1169
        #rounder ", %%mm0                        \n\t"\
1170
        "pmaddwd %%mm2, %%mm7                        \n\t" /* C3R3+C1R1        C3r3+C1r1 */\
1171 37e8dcda Arpi
        "paddd %%mm5, %%mm4                        \n\t" /* A0                a0 */\
1172 0a8d8945 Michael Niedermayer
        "psubd %%mm5, %%mm6                        \n\t" /* A3                a3 */\
1173
        "movq %%mm0, %%mm5                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
1174
        "paddd %%mm1, %%mm0                        \n\t" /* A1                a1 */\
1175
        "psubd %%mm1, %%mm5                        \n\t" /* A2                a2 */\
1176
        "movq 64(%2), %%mm1                        \n\t"\
1177
        "pmaddwd %%mm2, %%mm1                        \n\t" /* -C7R3+C3R1        -C7r3+C3r1 */\
1178
        "paddd %%mm4, %%mm7                        \n\t" /* A0+B0                a0+b0 */\
1179 37e8dcda Arpi
        "paddd %%mm4, %%mm4                        \n\t" /* 2A0                2a0 */\
1180 0a8d8945 Michael Niedermayer
        "psubd %%mm7, %%mm4                        \n\t" /* A0-B0                a0-b0 */\
1181
        "psrad $" #shift ", %%mm7                \n\t"\
1182 37e8dcda Arpi
        "psrad $" #shift ", %%mm4                \n\t"\
1183 0a8d8945 Michael Niedermayer
        "movq %%mm0, %%mm3                        \n\t" /* A1                a1 */\
1184
        "paddd %%mm1, %%mm0                        \n\t" /* A1+B1                a1+b1 */\
1185
        "psubd %%mm1, %%mm3                        \n\t" /* A1-B1                a1-b1 */\
1186
        "psrad $" #shift ", %%mm0                \n\t"\
1187
        "psrad $" #shift ", %%mm3                \n\t"\
1188
        "packssdw %%mm7, %%mm7                        \n\t" /* A0+B0        a0+b0 */\
1189
        "movd %%mm7, " #dst "                        \n\t"\
1190
        "packssdw %%mm0, %%mm0                        \n\t" /* A1+B1        a1+b1 */\
1191
        "movd %%mm0, 16+" #dst "                \n\t"\
1192
        "packssdw %%mm3, %%mm3                        \n\t" /* A1-B1        a1-b1 */\
1193
        "movd %%mm3, 96+" #dst "                \n\t"\
1194
        "packssdw %%mm4, %%mm4                        \n\t" /* A0-B0        a0-b0 */\
1195
        "movd %%mm4, 112+" #dst "                \n\t"\
1196
        "movq 80(%2), %%mm4                        \n\t" /* -C1        C5        -C1         C5 */\
1197
        "pmaddwd %%mm2, %%mm4                        \n\t" /* -C1R3+C5R1        -C1r3+C5r1 */\
1198
        "pmaddwd 96(%2), %%mm2                        \n\t" /* -C5R3+C7R1        -C5r3+C7r1 */\
1199
        "movq %%mm5, %%mm3                        \n\t" /* A2                a2 */\
1200
        "paddd %%mm4, %%mm3                        \n\t" /* A2+B2                a2+b2 */\
1201
        "psubd %%mm4, %%mm5                        \n\t" /* a2-B2                a2-b2 */\
1202
        "psrad $" #shift ", %%mm3                \n\t"\
1203
        "psrad $" #shift ", %%mm5                \n\t"\
1204
        "movq %%mm6, %%mm4                        \n\t" /* A3                a3 */\
1205
        "paddd %%mm2, %%mm6                        \n\t" /* A3+B3                a3+b3 */\
1206
        "psubd %%mm2, %%mm4                        \n\t" /* a3-B3                a3-b3 */\
1207 37e8dcda Arpi
        "psrad $" #shift ", %%mm6                \n\t"\
1208 0a8d8945 Michael Niedermayer
        "packssdw %%mm3, %%mm3                        \n\t" /* A2+B2        a2+b2 */\
1209
        "movd %%mm3, 32+" #dst "                \n\t"\
1210 37e8dcda Arpi
        "psrad $" #shift ", %%mm4                \n\t"\
1211 0a8d8945 Michael Niedermayer
        "packssdw %%mm6, %%mm6                        \n\t" /* A3+B3        a3+b3 */\
1212
        "movd %%mm6, 48+" #dst "                \n\t"\
1213
        "packssdw %%mm4, %%mm4                        \n\t" /* A3-B3        a3-b3 */\
1214
        "packssdw %%mm5, %%mm5                        \n\t" /* A2-B2        a2-b2 */\
1215
        "movd %%mm4, 64+" #dst "                \n\t"\
1216
        "movd %%mm5, 80+" #dst "                \n\t"
1217
        
1218
1219
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1220
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1221
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1222
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1223
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1224 37e8dcda Arpi
        "jmp 9f                                        \n\t"
1225
1226
1227
        "#.balign 16                                \n\t"
1228
        "7:                                        \n\t"
1229 0a8d8945 Michael Niedermayer
#undef IDCT
1230
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1231
        "movq " #src0 ", %%mm0                        \n\t" /* R4        R0        r4        r0 */\
1232
        "movq 16(%2), %%mm4                        \n\t" /* C4        C4        C4        C4 */\
1233
        "pmaddwd %%mm0, %%mm4                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
1234
        "movq 24(%2), %%mm5                        \n\t" /* -C4        C4        -C4        C4 */\
1235
        "pmaddwd %%mm5, %%mm0                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
1236
        #rounder ", %%mm4                        \n\t"\
1237
        #rounder ", %%mm0                        \n\t"\
1238 37e8dcda Arpi
        "psrad $" #shift ", %%mm4                \n\t"\
1239
        "psrad $" #shift ", %%mm0                \n\t"\
1240 0a8d8945 Michael Niedermayer
        "movq 8+" #src0 ", %%mm2                \n\t" /* R4        R0        r4        r0 */\
1241
        "movq 16(%2), %%mm1                        \n\t" /* C4        C4        C4        C4 */\
1242
        "pmaddwd %%mm2, %%mm1                        \n\t" /* C4R4+C4R0        C4r4+C4r0 */\
1243
        "movq 24(%2), %%mm7                        \n\t" /* -C4        C4        -C4        C4 */\
1244
        "pmaddwd %%mm7, %%mm2                        \n\t" /* -C4R4+C4R0        -C4r4+C4r0 */\
1245
        "movq 32(%2), %%mm7                        \n\t" /* C6        C2        C6        C2 */\
1246
        #rounder ", %%mm1                        \n\t"\
1247
        #rounder ", %%mm2                        \n\t"\
1248 37e8dcda Arpi
        "psrad $" #shift ", %%mm1                \n\t"\
1249 0a8d8945 Michael Niedermayer
        "packssdw %%mm1, %%mm4                        \n\t" /* A0        a0 */\
1250
        "movq %%mm4, " #dst "                        \n\t"\
1251
        "psrad $" #shift ", %%mm2                \n\t"\
1252
        "packssdw %%mm2, %%mm0                        \n\t" /* A1        a1 */\
1253
        "movq %%mm0, 16+" #dst "                \n\t"\
1254
        "movq %%mm0, 96+" #dst "                \n\t"\
1255
        "movq %%mm4, 112+" #dst "                \n\t"\
1256
        "movq %%mm0, 32+" #dst "                \n\t"\
1257
        "movq %%mm4, 48+" #dst "                \n\t"\
1258
        "movq %%mm4, 64+" #dst "                \n\t"\
1259
        "movq %%mm0, 80+" #dst "                \n\t"        
1260
1261
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1262
IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1263
//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1264
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1265
//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1266 37e8dcda Arpi
1267
1268
#endif
1269
1270
/*
1271
Input
1272 0a8d8945 Michael Niedermayer
 00 40 04 44 20 60 24 64
1273
 10 30 14 34 50 70 54 74
1274
 01 41 03 43 21 61 23 63
1275 37e8dcda Arpi
 11 31 13 33 51 71 53 73
1276 0a8d8945 Michael Niedermayer
 02 42 06 46 22 62 26 66
1277
 12 32 16 36 52 72 56 76
1278
 05 45 07 47 25 65 27 67
1279
 15 35 17 37 55 75 57 77
1280
  
1281 37e8dcda Arpi
Temp
1282 0a8d8945 Michael Niedermayer
 00 04 10 14 20 24 30 34
1283
 40 44 50 54 60 64 70 74
1284 37e8dcda Arpi
 01 03 11 13 21 23 31 33
1285
 41 43 51 53 61 63 71 73
1286 0a8d8945 Michael Niedermayer
 02 06 12 16 22 26 32 36
1287
 42 46 52 56 62 66 72 76
1288 37e8dcda Arpi
 05 07 15 17 25 27 35 37
1289
 45 47 55 57 65 67 75 77
1290
*/
1291
1292
"9: \n\t"
1293
                :: "r" (block), "r" (temp), "r" (coeffs)
1294
                : "%eax"
1295
        );
1296
}
1297
1298 2ad1516a Michael Niedermayer
/**
 * Public entry point: compute the MMX simple inverse DCT of an 8x8
 * coefficient block, in place.
 *
 * @param block  pointer to 64 int16_t DCT coefficients; overwritten
 *               with the spatial-domain result by the static idct()
 *               kernel defined above.
 */
void ff_simple_idct_mmx(int16_t *block)
{
    idct(block);
}

//FIXME: merge the add/put pixel stage into the idct itself to save one pass over the block

/**
 * Inverse-transform an 8x8 block and store the clamped result.
 *
 * Runs the MMX simple IDCT in place on @p block, then writes the
 * samples into the destination picture, clamped to the 0..255 byte
 * range by put_pixels_clamped_mmx().
 *
 * @param dest      destination pixel buffer (8x8 region is written)
 * @param line_size byte stride between destination rows
 * @param block     64 DCT coefficients; clobbered by the transform
 */
void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    idct(block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
1310 0c1a9eda Zdenek Kabelac
/**
 * Inverse-transform an 8x8 block and add the result to the destination.
 *
 * Runs the MMX simple IDCT in place on @p block, then adds the samples
 * to the existing destination pixels with saturation handled by
 * add_pixels_clamped_mmx() (the motion-compensation residual path).
 *
 * @param dest      destination pixel buffer (8x8 region is updated)
 * @param line_size byte stride between destination rows
 * @param block     64 DCT coefficients; clobbered by the transform
 */
void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    idct(block);
    add_pixels_clamped_mmx(block, dest, line_size);
}