Statistics
| Branch: | Revision:

ffmpeg / libavcodec / x86 / simple_idct_mmx.c @ 4eca52ed

History | View | Annotate | Download (71 KB)

1 37e8dcda Arpi
/*
 * Simple IDCT MMX
 *
 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
22 245976da Diego Biurrun
#include "libavcodec/dsputil.h"
23
#include "libavcodec/simple_idct.h"
24 4e36a5b4 Måns Rullgård
#include "dsputil_mmx.h"
25 ff4ec49e Fabrice Bellard
26 9e1795dd Michael Niedermayer
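/*
 * Exact (unrounded) values of cos(k*M_PI/16)*sqrt(2)*(1<<14) for k = 0..7:
 *   k = 0: 23170.475006
 *   k = 1: 22725.260826
 *   k = 2: 21406.727617
 *   k = 3: 19265.545870
 *   k = 4: 16384.000000
 *   k = 5: 12872.826198
 *   k = 6:  8866.956905
 *   k = 7:  4520.335430
 */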
36 37e8dcda Arpi
/*
 * 14-bit fixed-point IDCT coefficients:
 *     Ck = cos(k*M_PI/16) * sqrt(2) * (1 << 14)
 * Every value is rounded to nearest (+ 0.5) except C4: the variant in use is
 * 16383, i.e. the exact value 16384 rounded down (- 0.5). The round-to-nearest
 * alternative is kept under "#if 0".
 */
#define C0 23170 /* k = 0 */
#define C1 22725 /* k = 1 */
#define C2 21407 /* k = 2 */
#define C3 19266 /* k = 3 */
#if 0
#define C4 16384 /* k = 4, + 0.5 */
#else
#define C4 16383 /* k = 4, - 0.5 */
#endif
#define C5 12873 /* k = 5 */
#define C6 8867  /* k = 6 */
#define C7 4520  /* k = 7 */
48 37e8dcda Arpi
49
/* Right-shift applied to the results of the row pass. */
#define ROW_SHIFT 11
/*
 * Right-shift applied to the results of the column pass.
 * NOTE(review): the original line carried a bare trailing "6"; its meaning is
 * not evident from this file, it is preserved below as found.
 */
#define COL_SHIFT 20 // 6
51
52 766324fc Reimar Döffinger
DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
53
DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
54 41338ac0 Michael Niedermayer
55 c6727809 Måns Rullgård
/*
 * Constant table for the MMX IDCT, organised as 8-byte rows of four int16_t.
 * The byte offset of each row is noted on the left. The asm macros in this
 * file load the rows at offsets 16..104 as N(%2) and their lane comments match
 * the contents below (operand binding is not visible here; presumably %2 is
 * this table).
 *
 * Rows 0 and 8 are rounding terms: 1 << (ROW_SHIFT - 1) in each 32-bit lane.
 * Row 8 additionally has a 1 in the upper 16-bit half of its first lane, i.e.
 * it adds 1 << 16 there; per the original note this
 *     1 = ((1 << (COL_SHIFT - 1)) / C4) << ROW_SHIFT
 * (apparently the column-pass rounding term folded into the row pass).
 */
DECLARE_ALIGNED(8, static const int16_t, coeffs)[] = {
    /*   0 */ 1 << (ROW_SHIFT - 1), 0, 1 << (ROW_SHIFT - 1), 0,
    /*   8 */ 1 << (ROW_SHIFT - 1), 1, 1 << (ROW_SHIFT - 1), 0,

    /*  16 */  C4,  C4,  C4,  C4,
    /*  24 */  C4, -C4,  C4, -C4,

    /*  32 */  C2,  C6,  C2,  C6,
    /*  40 */  C6, -C2,  C6, -C2,

    /*  48 */  C1,  C3,  C1,  C3,
    /*  56 */  C5,  C7,  C5,  C7,

    /*  64 */  C3, -C7,  C3, -C7,
    /*  72 */ -C1, -C5, -C1, -C5,

    /*  80 */  C5, -C1,  C5, -C1,
    /*  88 */  C7,  C3,  C7,  C3,

    /*  96 */  C7, -C5,  C7, -C5,
    /* 104 */  C3, -C1,  C3, -C1
};
82
83 ef5b1b5a Juanjo
#if 0
84 c47d146b Diego Biurrun
static void unused_var_killer(void)
85
{
86 bb270c08 Diego Biurrun
        int a= wm1010 + d40000;
87
        temp[0]=a;
88 0a8d8945 Michael Niedermayer
}
89

90 37e8dcda Arpi
static void inline idctCol (int16_t * col, int16_t *input)
91
{
92
#undef C0
93
#undef C1
94
#undef C2
95
#undef C3
96
#undef C4
97
#undef C5
98
#undef C6
99
#undef C7
100 bb270c08 Diego Biurrun
        int a0, a1, a2, a3, b0, b1, b2, b3;
101
        const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
102
        const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
103
        const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
104
        const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
105
        const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
106
        const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
107
        const int C6 = 8867;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
108
        const int C7 = 4520;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
109 37e8dcda Arpi
/*
110 bb270c08 Diego Biurrun
        if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
111
                col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
112
                        col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
113
                return;
114
        }*/
115 37e8dcda Arpi

116
col[8*0] = input[8*0 + 0];
117
col[8*1] = input[8*2 + 0];
118
col[8*2] = input[8*0 + 1];
119
col[8*3] = input[8*2 + 1];
120
col[8*4] = input[8*4 + 0];
121
col[8*5] = input[8*6 + 0];
122
col[8*6] = input[8*4 + 1];
123
col[8*7] = input[8*6 + 1];
124

125 bb270c08 Diego Biurrun
        a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
126
        a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
127
        a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
128
        a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
129

130
        b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
131
        b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
132
        b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
133
        b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
134

135
        col[8*0] = (a0 + b0) >> COL_SHIFT;
136
        col[8*1] = (a1 + b1) >> COL_SHIFT;
137
        col[8*2] = (a2 + b2) >> COL_SHIFT;
138
        col[8*3] = (a3 + b3) >> COL_SHIFT;
139
        col[8*4] = (a3 - b3) >> COL_SHIFT;
140
        col[8*5] = (a2 - b2) >> COL_SHIFT;
141
        col[8*6] = (a1 - b1) >> COL_SHIFT;
142
        col[8*7] = (a0 - b0) >> COL_SHIFT;
143 37e8dcda Arpi
}
144

145
static void inline idctRow (int16_t * output, int16_t * input)
146
{
147 bb270c08 Diego Biurrun
        int16_t row[8];
148

149
        int a0, a1, a2, a3, b0, b1, b2, b3;
150
        const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
151
        const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
152
        const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
153
        const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
154
        const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
155
        const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
156
        const int C6 = 8867;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
157
        const int C7 = 4520;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
158 37e8dcda Arpi

159
row[0] = input[0];
160
row[2] = input[1];
161
row[4] = input[4];
162
row[6] = input[5];
163
row[1] = input[8];
164
row[3] = input[9];
165
row[5] = input[12];
166
row[7] = input[13];
167

168 bb270c08 Diego Biurrun
        if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
169
                row[0] = row[1] = row[2] = row[3] = row[4] =
170
                        row[5] = row[6] = row[7] = row[0]<<3;
171
        output[0]  = row[0];
172
        output[2]  = row[1];
173
        output[4]  = row[2];
174
        output[6]  = row[3];
175
        output[8]  = row[4];
176
        output[10] = row[5];
177
        output[12] = row[6];
178
        output[14] = row[7];
179
                return;
180
        }
181

182
        a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
183
        a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
184
        a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
185
        a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
186

187
        b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
188
        b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
189
        b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
190
        b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
191

192
        row[0] = (a0 + b0) >> ROW_SHIFT;
193
        row[1] = (a1 + b1) >> ROW_SHIFT;
194
        row[2] = (a2 + b2) >> ROW_SHIFT;
195
        row[3] = (a3 + b3) >> ROW_SHIFT;
196
        row[4] = (a3 - b3) >> ROW_SHIFT;
197
        row[5] = (a2 - b2) >> ROW_SHIFT;
198
        row[6] = (a1 - b1) >> ROW_SHIFT;
199
        row[7] = (a0 - b0) >> ROW_SHIFT;
200

201
        output[0]  = row[0];
202
        output[2]  = row[1];
203
        output[4]  = row[2];
204
        output[6]  = row[3];
205
        output[8]  = row[4];
206
        output[10] = row[5];
207
        output[12] = row[6];
208
        output[14] = row[7];
209 37e8dcda Arpi
}
210
#endif
211
212
static inline void idct(int16_t *block)
213
{
214 c6727809 Måns Rullgård
        DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
215 bb270c08 Diego Biurrun
        int16_t * const temp= (int16_t*)align_tmp;
216 41338ac0 Michael Niedermayer
217 be449fca Diego Pettenò
        __asm__ volatile(
218 37e8dcda Arpi
#if 0 //Alternative, simpler variant
219 0a8d8945 Michael Niedermayer

220
/*
 * ROW_IDCT: expands to the asm text (adjacent string literals) of one
 * unconditional row-pass IDCT step in MMX.
 *
 *   src0, src4, src1, src5  memory operands of the four source quadwords; the
 *                           lane comments name their words r0/r4, r2/r6,
 *                           r1/r3 and r5/r7 (lower case = low 32-bit lane,
 *                           upper case = high 32-bit lane)
 *   dst                     memory operand; quadwords are stored at dst,
 *                           8+dst, 16+dst and 24+dst
 *   rounder                 instruction and source operand (e.g.
 *                           "paddd 8(%2)") applied to %mm4 and %mm0 to add
 *                           the rounding term
 *   shift                   immediate count for the final psrad
 *
 * Reads the constant rows at 16(%2) .. 104(%2) and uses %mm0-%mm7.
 * The instruction order is unchanged from the original.
 */
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
        #rounder ", %%mm4               \n\t"\
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
        #rounder ", %%mm0               \n\t"\
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
        "paddd %%mm0, %%mm0             \n\t" \
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
        "psrad $" #shift ", %%mm7       \n\t"\
        "psrad $" #shift ", %%mm4       \n\t"\
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
        "psrad $" #shift ", %%mm1       \n\t"\
        "psrad $" #shift ", %%mm2       \n\t"\
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
        "movq %%mm7, " #dst "           \n\t"\
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
        "movq %%mm2, 24+" #dst "        \n\t"\
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
        "psrad $" #shift ", %%mm2       \n\t"\
        "psrad $" #shift ", %%mm0       \n\t"\
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
        "psrad $" #shift ", %%mm6       \n\t"\
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
        "movq %%mm2, 8+" #dst "         \n\t"\
        "psrad $" #shift ", %%mm4       \n\t"\
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
        "movq %%mm4, 16+" #dst "        \n\t"
287 0a8d8945 Michael Niedermayer

288 347be472 John Dalgliesh
/*
 * COL_IDCT: expands to the asm text (adjacent string literals) of one
 * column-pass IDCT step in MMX.
 *
 *   src0, src4, src1, src5  memory operands of the four source quadwords; the
 *                           lane comments name their words r0/r4, r2/r6,
 *                           r1/r3 and r5/r7 (lower case = low 32-bit lane,
 *                           upper case = high 32-bit lane)
 *   dst                     memory operand; eight 32-bit results are stored
 *                           with movd at dst, 16+dst, 32+dst, ..., 112+dst
 *   shift                   immediate count for the final psrad
 *
 * No rounding term is added here (there is no rounder argument). Reads the
 * constant rows at 16(%2) .. 104(%2) and uses %mm0-%mm7.
 * The instruction order is unchanged from the original.
 */
#define COL_IDCT(src0, src4, src1, src5, dst, shift) \
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
        "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
        "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
        "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
        "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
        "psrad $" #shift ", %%mm7       \n\t"\
        "psrad $" #shift ", %%mm4       \n\t"\
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
        "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
        "psrad $" #shift ", %%mm0       \n\t"\
        "psrad $" #shift ", %%mm2       \n\t"\
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
        "movd %%mm7, " #dst "           \n\t"\
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
        "movd %%mm0, 16+" #dst "        \n\t"\
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
        "movd %%mm2, 96+" #dst "        \n\t"\
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
        "movd %%mm4, 112+" #dst "       \n\t"\
        "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
        "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
        "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
        "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
        "psrad $" #shift ", %%mm2       \n\t"\
        "psrad $" #shift ", %%mm5       \n\t"\
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
        "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
        "psrad $" #shift ", %%mm6       \n\t"\
        "psrad $" #shift ", %%mm4       \n\t"\
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
        "movd %%mm2, 32+" #dst "        \n\t"\
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
        "movd %%mm6, 48+" #dst "        \n\t"\
        "movd %%mm4, 64+" #dst "        \n\t"\
        "movd %%mm5, 80+" #dst "        \n\t"
361 0a8d8945 Michael Niedermayer

362 115329f1 Diego Biurrun

363 0a8d8945 Michael Niedermayer
/*
 * DC_COND_ROW_IDCT: row-pass IDCT step with a shortcut for inputs whose only
 * non-zero words are words 0 and 2 of src0. Expands to asm text (adjacent
 * string literals).
 *
 *   src0, src4, src1, src5  memory operands of the four source quadwords; the
 *                           lane comments name their words r0/r4, r2/r6,
 *                           r1/r3 and r5/r7 (lower case = low 32-bit lane,
 *                           upper case = high 32-bit lane)
 *   dst                     memory operand; quadwords are stored at dst,
 *                           8+dst, 16+dst and 24+dst
 *   rounder                 instruction and source operand applied to %mm4
 *                           and %mm0 to add the rounding term (full path only)
 *   shift                   immediate count for the final psrad (full path)
 *
 * Test: src0 masked with wm1010 is ORed with src4, src1 and src5; the result
 * is packed and moved to %eax. If it is zero, control jumps to local label 1,
 * where each 32-bit lane of src0 is shifted left by 16 and arithmetically
 * right by 13 (low word times 8), packed, and stored to all four destination
 * quadwords. The d40000 add on that path starts with '#' inside the asm text,
 * so the assembler sees it as a comment (assuming GAS x86 syntax).
 *
 * Writes %eax, uses %mm0-%mm7, reads the constant rows at 16(%2) .. 104(%2),
 * and defines numeric local labels 1 and 2.
 * The instruction order is unchanged from the original.
 */
#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
        "movq " MANGLE(wm1010) ", %%mm4   \n\t"\
        "pand %%mm0, %%mm4              \n\t"\
        "por %%mm1, %%mm4               \n\t"\
        "por %%mm2, %%mm4               \n\t"\
        "por %%mm3, %%mm4               \n\t"\
        "packssdw %%mm4,%%mm4           \n\t"\
        "movd %%mm4, %%eax              \n\t"\
        "orl %%eax, %%eax               \n\t"\
        "jz 1f                          \n\t"\
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
        #rounder ", %%mm4               \n\t"\
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
        #rounder ", %%mm0               \n\t"\
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
        "paddd %%mm0, %%mm0             \n\t" \
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
        "psrad $" #shift ", %%mm7       \n\t"\
        "psrad $" #shift ", %%mm4       \n\t"\
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
        "psrad $" #shift ", %%mm1       \n\t"\
        "psrad $" #shift ", %%mm2       \n\t"\
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
        "movq %%mm7, " #dst "           \n\t"\
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
        "movq %%mm2, 24+" #dst "        \n\t"\
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
        "psrad $" #shift ", %%mm2       \n\t"\
        "psrad $" #shift ", %%mm0       \n\t"\
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
        "psrad $" #shift ", %%mm6       \n\t"\
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
        "movq %%mm2, 8+" #dst "         \n\t"\
        "psrad $" #shift ", %%mm4       \n\t"\
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
        "movq %%mm4, 16+" #dst "        \n\t"\
        "jmp 2f                         \n\t"\
        "1:                             \n\t"\
        "pslld $16, %%mm0               \n\t"\
        "#paddd " MANGLE(d40000) ", %%mm0 \n\t"\
        "psrad $13, %%mm0               \n\t"\
        "packssdw %%mm0, %%mm0          \n\t"\
        "movq %%mm0, " #dst "           \n\t"\
        "movq %%mm0, 8+" #dst "         \n\t"\
        "movq %%mm0, 16+" #dst "        \n\t"\
        "movq %%mm0, 24+" #dst "        \n\t"\
        "2:                             \n\t"
450 37e8dcda Arpi

451

452 0a8d8945 Michael Niedermayer
//IDCT(      src0,   src4,   src1,   src5,    dst,    rounder, shift)
453
ROW_IDCT(    (%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
454
/*ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
455
ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
456
ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
457

458
DC_COND_ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
459
DC_COND_ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
460
DC_COND_ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
461

462

463 347be472 John Dalgliesh
//IDCT(      src0,   src4,   src1,    src5,    dst, shift)
464
COL_IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
465
COL_IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
466
COL_IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
467
COL_IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
468 37e8dcda Arpi

469 0a8d8945 Michael Niedermayer
#else
470
471
/*
 * DC_COND_IDCT: row-pass IDCT step with a shortcut for inputs whose only
 * non-zero words are words 0 and 2 of src0. Expands to asm text (adjacent
 * string literals).
 *
 *   src0, src4, src1, src5  memory operands of the four source quadwords; the
 *                           lane comments name their words r0/r4, r2/r6,
 *                           r1/r3 and r5/r7 (lower case = low 32-bit lane,
 *                           upper case = high 32-bit lane)
 *   dst                     memory operand; quadwords are stored at dst,
 *                           8+dst, 16+dst and 24+dst
 *   rounder                 instruction and source operand applied to %mm4
 *                           and %mm0 to add the rounding term (full path only)
 *   shift                   immediate count for the final psrad (full path)
 *
 * Test: src0 masked with wm1010 is ORed with src4, src1 and src5; the result
 * is packed and moved to %eax. If it is zero, control jumps to local label 1,
 * where each 32-bit lane of src0 is shifted left by 16, d40000 is added
 * (0x40000, low lane only), the lanes are shifted arithmetically right by 13
 * and packed, and the result is stored to all four destination quadwords.
 *
 * Writes %eax, uses %mm0-%mm7, reads the constant rows at 16(%2) .. 104(%2),
 * and defines numeric local labels 1 and 2.
 * The instruction order is unchanged from the original.
 */
#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
        "movq " MANGLE(wm1010) ", %%mm4   \n\t"\
        "pand %%mm0, %%mm4              \n\t"\
        "por %%mm1, %%mm4               \n\t"\
        "por %%mm2, %%mm4               \n\t"\
        "por %%mm3, %%mm4               \n\t"\
        "packssdw %%mm4,%%mm4           \n\t"\
        "movd %%mm4, %%eax              \n\t"\
        "orl %%eax, %%eax               \n\t"\
        "jz 1f                          \n\t"\
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
        #rounder ", %%mm4               \n\t"\
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
        #rounder ", %%mm0               \n\t"\
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
        "paddd %%mm0, %%mm0             \n\t" \
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
        "psrad $" #shift ", %%mm7       \n\t"\
        "psrad $" #shift ", %%mm4       \n\t"\
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
        "psrad $" #shift ", %%mm1       \n\t"\
        "psrad $" #shift ", %%mm2       \n\t"\
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
        "movq %%mm7, " #dst "           \n\t"\
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
        "movq %%mm2, 24+" #dst "        \n\t"\
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
        "psrad $" #shift ", %%mm2       \n\t"\
        "psrad $" #shift ", %%mm0       \n\t"\
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
        "psrad $" #shift ", %%mm6       \n\t"\
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
        "movq %%mm2, 8+" #dst "         \n\t"\
        "psrad $" #shift ", %%mm4       \n\t"\
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
        "movq %%mm4, 16+" #dst "        \n\t"\
        "jmp 2f                         \n\t"\
        "1:                             \n\t"\
        "pslld $16, %%mm0               \n\t"\
        "paddd " MANGLE(d40000) ", %%mm0  \n\t"\
        "psrad $13, %%mm0               \n\t"\
        "packssdw %%mm0, %%mm0          \n\t"\
        "movq %%mm0, " #dst "           \n\t"\
        "movq %%mm0, 8+" #dst "         \n\t"\
        "movq %%mm0, 16+" #dst "        \n\t"\
        "movq %%mm0, 24+" #dst "        \n\t"\
        "2:                             \n\t"
558 37e8dcda Arpi
559 0a8d8945 Michael Niedermayer
/*
 * Z_COND_IDCT: emit the asm text for one IDCT pass with an "all inputs are
 * zero" shortcut.
 *
 * src0, src4, src1, src5: memory operands; each is loaded as one quadword into
 *          mm0..mm3. The inline comments name the packed words
 *          (R4 R0 r4 r0 / R6 R2 r6 r2 / R3 R1 r3 r1 / R7 R5 r7 r5), i.e. two
 *          interleaved sets of coefficients, upper-case and lower-case.
 * dst:     memory operand; four quadwords are stored at dst, 8+dst, 16+dst
 *          and 24+dst.
 * rounder: instruction mnemonic plus its source operand (the visible callers
 *          pass "paddd (%2)"); it is pasted in front of ", %%mm4" and
 *          ", %%mm0", so it is applied to both C4 products before the A terms
 *          are formed.
 * shift:   immediate used by every psrad before the results are packed with
 *          signed saturation.
 * bt:      label to jump to when the four input quadwords are all zero; in
 *          that case nothing is stored to dst.
 *
 * Coefficient vectors are read from fixed offsets 16..104 of operand %2.
 * Clobbers eax and mm0-mm7.
 */
#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
560 bb270c08 Diego Biurrun
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
561
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
562
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
563
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
564
        "movq %%mm0, %%mm4              \n\t" /* mm4 accumulates the OR of all four inputs */\
565
        "por %%mm1, %%mm4               \n\t"\
566
        "por %%mm2, %%mm4               \n\t"\
567
        "por %%mm3, %%mm4               \n\t"\
568
        "packssdw %%mm4,%%mm4           \n\t" /* saturating pack: both dwords now fit in the low 32 bits */\
569
        "movd %%mm4, %%eax              \n\t"\
570
        "orl %%eax, %%eax               \n\t" /* sets ZF when every input bit was zero */\
571
        "jz " #bt "                     \n\t" /* all-zero input: skip the transform and the stores */\
572
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
573
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
574
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
575
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
576
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
577
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
578
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
579
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
580
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
581
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
582
        #rounder ", %%mm4               \n\t"\
583
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
584
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
585
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
586
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
587
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
588
        #rounder ", %%mm0               \n\t"\
589
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
590
        "paddd %%mm0, %%mm0             \n\t" \
591
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
592
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
593
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
594
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
595
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
596
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
597
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
598
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
599
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
600
        "psrad $" #shift ", %%mm7       \n\t"\
601
        "psrad $" #shift ", %%mm4       \n\t"\
602
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
603
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
604
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
605
        "psrad $" #shift ", %%mm1       \n\t"\
606
        "psrad $" #shift ", %%mm2       \n\t"\
607
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
608
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
609
        "movq %%mm7, " #dst "           \n\t"\
610
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
611
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
612
        "movq %%mm2, 24+" #dst "        \n\t"\
613
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
614
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
615
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
616
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
617
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
618
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
619
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
620
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
621
        "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
622
        "psrad $" #shift ", %%mm2       \n\t"\
623
        "psrad $" #shift ", %%mm0       \n\t"\
624
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
625
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
626
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
627
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
628
        "psrad $" #shift ", %%mm6       \n\t"\
629
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
630
        "movq %%mm2, 8+" #dst "         \n\t"\
631
        "psrad $" #shift ", %%mm4       \n\t"\
632
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
633
        "movq %%mm4, 16+" #dst "        \n\t"\
634 0a8d8945 Michael Niedermayer
635
/*
 * ROW_IDCT: emit the asm text for one unconditional IDCT pass (no zero test,
 * no branch).
 *
 * src0, src4, src1, src5: memory operands; each is loaded as one quadword into
 *          mm0..mm3 (packed words R4 R0 r4 r0 / R6 R2 r6 r2 / R3 R1 r3 r1 /
 *          R7 R5 r7 r5 according to the inline comments).
 * dst:     memory operand; four quadwords are stored at dst, 8+dst, 16+dst
 *          and 24+dst.
 * rounder: instruction mnemonic plus its source operand; it is pasted in front
 *          of ", %%mm4" and ", %%mm0", so it is applied to both C4 products
 *          before the A terms are formed.
 * shift:   immediate used by every psrad before the results are packed with
 *          signed saturation.
 *
 * Coefficient vectors are read from fixed offsets 16..104 of operand %2.
 * Clobbers mm0-mm7.
 */
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
636 bb270c08 Diego Biurrun
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
637
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
638
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
639
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
640
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
641
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
642
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
643
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
644
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
645
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
646
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
647
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
648
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
649
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
650
        #rounder ", %%mm4               \n\t"\
651
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
652
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
653
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
654
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
655
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
656
        #rounder ", %%mm0               \n\t"\
657
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
658
        "paddd %%mm0, %%mm0             \n\t" \
659
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
660
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
661
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
662
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
663
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
664
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
665
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
666
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
667
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
668
        "psrad $" #shift ", %%mm7       \n\t"\
669
        "psrad $" #shift ", %%mm4       \n\t"\
670
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
671
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
672
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
673
        "psrad $" #shift ", %%mm1       \n\t"\
674
        "psrad $" #shift ", %%mm2       \n\t"\
675
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
676
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
677
        "movq %%mm7, " #dst "           \n\t"\
678
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
679
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
680
        "movq %%mm2, 24+" #dst "        \n\t"\
681
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
682
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
683
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
684
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
685
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
686
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
687
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
688
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
689
        "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
690
        "psrad $" #shift ", %%mm2       \n\t"\
691
        "psrad $" #shift ", %%mm0       \n\t"\
692
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
693
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
694
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
695
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
696
        "psrad $" #shift ", %%mm6       \n\t"\
697
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
698
        "movq %%mm2, 8+" #dst "         \n\t"\
699
        "psrad $" #shift ", %%mm4       \n\t"\
700
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
701
        "movq %%mm4, 16+" #dst "        \n\t"\
702 0a8d8945 Michael Niedermayer
703
//IDCT(         src0,   src4,   src1,   src5,    dst,   rounder, shift)
704
DC_COND_IDCT(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
705
Z_COND_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
706
Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
707
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
708
709
#undef IDCT
710 347be472 John Dalgliesh
/*
 * IDCT (full variant): emit the asm text for one IDCT pass that reads all four
 * inputs and applies no rounder.
 *
 * src0, src4, src1, src5: memory operands; each is loaded as one quadword
 *          (src1 is loaded a second time for the B2/B3 terms).
 * dst:     memory operand; eight 32-bit stores (movd) are issued at
 *          dst + 0, 16, 32, 48, 64, 80, 96 and 112. Each store holds two
 *          saturated 16-bit results. In order they are A0+B0, A1+B1, A2+B2,
 *          A3+B3, A3-B3, A2-B2, A1-B1 and A0-B0.
 * shift:   immediate used by every psrad before packing.
 *
 * Coefficient vectors are read from fixed offsets 16..104 of operand %2.
 * Clobbers mm0-mm7.
 * NOTE(review): the 16-byte output stride suggests this is the column pass of
 * an 8x8 block of 16-bit values - confirm against the enclosing function.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
711 bb270c08 Diego Biurrun
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
712
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
713
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
714
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
715
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
716
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
717
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
718
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
719
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
720
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
721
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
722
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
723
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
724
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
725
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
726
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
727
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
728
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
729
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
730
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
731
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
732
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
733
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
734
        "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
735
        "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
736
        "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
737
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
738
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
739
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
740
        "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
741
        "psrad $" #shift ", %%mm7       \n\t"\
742
        "psrad $" #shift ", %%mm4       \n\t"\
743
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
744
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
745
        "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
746
        "psrad $" #shift ", %%mm0       \n\t"\
747
        "psrad $" #shift ", %%mm2       \n\t"\
748
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
749
        "movd %%mm7, " #dst "           \n\t"\
750
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
751
        "movd %%mm0, 16+" #dst "        \n\t"\
752
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
753
        "movd %%mm2, 96+" #dst "        \n\t"\
754
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
755
        "movd %%mm4, 112+" #dst "       \n\t"\
756
        "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
757
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
758
        "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
759
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
760
        "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
761
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
762
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
763
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
764
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
765
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
766
        "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
767
        "psrad $" #shift ", %%mm2       \n\t"\
768
        "psrad $" #shift ", %%mm5       \n\t"\
769
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
770
        "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
771
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
772
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
773
        "psrad $" #shift ", %%mm6       \n\t"\
774
        "psrad $" #shift ", %%mm4       \n\t"\
775
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
776
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
777
        "movd %%mm2, 32+" #dst "        \n\t"\
778
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
779
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
780
        "movd %%mm6, 48+" #dst "        \n\t"\
781
        "movd %%mm4, 64+" #dst "        \n\t"\
782
        "movd %%mm5, 80+" #dst "        \n\t"
783 0a8d8945 Michael Niedermayer
784
785 347be472 John Dalgliesh
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
786
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
787
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
788
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
789
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
790 bb270c08 Diego Biurrun
        "jmp 9f                         \n\t"
791 37e8dcda Arpi
792 4454dc1b John Dalgliesh
        "#" ASMALIGN(4)                      \
793 bb270c08 Diego Biurrun
        "4:                             \n\t"
794 0a8d8945 Michael Niedermayer
Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
795
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
796 37e8dcda Arpi
797 0a8d8945 Michael Niedermayer
#undef IDCT
798 347be472 John Dalgliesh
/*
 * IDCT (variant that never reads src1): emit the asm text for one IDCT pass
 * in which the odd-part terms B0..B3 are built from src5 alone.
 *
 * src0, src4, src5: memory operands; each is loaded as one quadword.
 * src1:    accepted so the parameter list matches the other IDCT variants,
 *          but never referenced in the body.
 *          NOTE(review): presumably this variant is only reached when the
 *          data behind src1 is known to be zero - confirm against the
 *          branch structure of the enclosing asm.
 * dst:     memory operand; eight 32-bit stores (movd) are issued at
 *          dst + 0, 16, 32, 48, 64, 80, 96 and 112. Each store holds two
 *          saturated 16-bit results.
 * shift:   immediate used by every psrad before packing.
 *
 * Coefficient vectors are read from offsets 16, 24, 32, 40, 56, 72, 88 and
 * 104 of operand %2. Clobbers mm0-mm7.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
799 bb270c08 Diego Biurrun
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
800
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
801
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
802
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
803
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
804
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
805
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
806
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
807
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
808
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
809
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
810
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
811
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
812
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
813
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
814
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
815
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
816
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
817
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
818
        "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
819
        "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
820
        "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
821
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
822
        "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
823
        "psrad $" #shift ", %%mm1       \n\t"\
824
        "psrad $" #shift ", %%mm4       \n\t"\
825
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
826
        "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
827
        "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
828
        "psrad $" #shift ", %%mm0       \n\t"\
829
        "psrad $" #shift ", %%mm2       \n\t"\
830
        "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
831
        "movd %%mm1, " #dst "           \n\t"\
832
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
833
        "movd %%mm0, 16+" #dst "        \n\t"\
834
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
835
        "movd %%mm2, 96+" #dst "        \n\t"\
836
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
837
        "movd %%mm4, 112+" #dst "       \n\t"\
838
        "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
839
        "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
840
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
841
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
842
        "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
843
        "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
844
        "psrad $" #shift ", %%mm2       \n\t"\
845
        "psrad $" #shift ", %%mm5       \n\t"\
846
        "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
847
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
848
        "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
849
        "psrad $" #shift ", %%mm6       \n\t"\
850
        "psrad $" #shift ", %%mm1       \n\t"\
851
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
852
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
853
        "movd %%mm2, 32+" #dst "        \n\t"\
854
        "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
855
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
856
        "movd %%mm6, 48+" #dst "        \n\t"\
857
        "movd %%mm1, 64+" #dst "        \n\t"\
858
        "movd %%mm5, 80+" #dst "        \n\t"
859 0a8d8945 Michael Niedermayer
860 347be472 John Dalgliesh
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
861
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
862
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
863
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
864
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
865 bb270c08 Diego Biurrun
        "jmp 9f                         \n\t"
866 37e8dcda Arpi
867 4454dc1b John Dalgliesh
        "#" ASMALIGN(4)                      \
868 bb270c08 Diego Biurrun
        "6:                             \n\t"
869 0a8d8945 Michael Niedermayer
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
870 37e8dcda Arpi
871 0a8d8945 Michael Niedermayer
#undef IDCT
872 347be472 John Dalgliesh
/*
 * IDCT (variant that reads only src0 and src5): emit the asm text for one
 * IDCT pass in which the even part comes from src0 alone and the odd part
 * from src5 alone.
 *
 * src0, src5: memory operands; each is loaded as one quadword.
 * src4, src1: accepted so the parameter list matches the other IDCT variants,
 *          but never referenced in the body.
 *          NOTE(review): presumably this variant is only reached when the
 *          data behind src4 and src1 is known to be zero - confirm against
 *          the branch structure of the enclosing asm.
 * dst:     memory operand; eight 32-bit stores (movd) are issued at
 *          dst + 0, 16, 32, 48, 64, 80, 96 and 112. Each store holds two
 *          saturated 16-bit results.
 * shift:   immediate used by every psrad before packing.
 *
 * With no src4 contribution, A0 and A3 are both the C4R4+C4R0 product
 * (mm4, copied to mm6) and A1 and A2 are both the -C4R4+C4R0 product
 * (mm0, copied to mm5).
 *
 * Coefficient vectors are read from offsets 16, 24, 56, 72, 88 and 104 of
 * operand %2. Clobbers mm0-mm7.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
873 bb270c08 Diego Biurrun
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
874
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
875
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
876
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
877
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
878
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
879
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
880
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
881
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
882
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
883
        "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
884
        "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
885
        "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
886
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
887
        "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
888
        "psrad $" #shift ", %%mm1       \n\t"\
889
        "psrad $" #shift ", %%mm4       \n\t"\
890
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
891
        "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
892
        "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
893
        "psrad $" #shift ", %%mm0       \n\t"\
894
        "psrad $" #shift ", %%mm2       \n\t"\
895
        "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
896
        "movd %%mm1, " #dst "           \n\t"\
897
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
898
        "movd %%mm0, 16+" #dst "        \n\t"\
899
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
900
        "movd %%mm2, 96+" #dst "        \n\t"\
901
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
902
        "movd %%mm4, 112+" #dst "       \n\t"\
903
        "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
904
        "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
905
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
906
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
907
        "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
908
        "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
909
        "psrad $" #shift ", %%mm2       \n\t"\
910
        "psrad $" #shift ", %%mm5       \n\t"\
911
        "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
912
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
913
        "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
914
        "psrad $" #shift ", %%mm6       \n\t"\
915
        "psrad $" #shift ", %%mm1       \n\t"\
916
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
917
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
918
        "movd %%mm2, 32+" #dst "        \n\t"\
919
        "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
920
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
921
        "movd %%mm6, 48+" #dst "        \n\t"\
922
        "movd %%mm1, 64+" #dst "        \n\t"\
923
        "movd %%mm5, 80+" #dst "        \n\t"
924 0a8d8945 Michael Niedermayer
925
926 347be472 John Dalgliesh
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
927
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
928
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
929
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
930
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
931 bb270c08 Diego Biurrun
        "jmp 9f                         \n\t"
932 37e8dcda Arpi
933 4454dc1b John Dalgliesh
        "#" ASMALIGN(4)                      \
934 bb270c08 Diego Biurrun
        "2:                             \n\t"
935 0a8d8945 Michael Niedermayer
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
936 37e8dcda Arpi
937 0a8d8945 Michael Niedermayer
#undef IDCT
938 347be472 John Dalgliesh
/*
 * IDCT (variant that never reads src4): emit the asm text for one IDCT pass
 * in which the even part comes from src0 alone, while the odd part uses both
 * src1 and src5.
 *
 * src0, src1, src5: memory operands; each is loaded as one quadword
 *          (src1 is loaded a second time for the B2/B3 terms).
 * src4:    accepted so the parameter list matches the other IDCT variants,
 *          but never referenced in the body.
 *          NOTE(review): presumably this variant is only reached when the
 *          data behind src4 is known to be zero - confirm against the
 *          branch structure of the enclosing asm.
 * dst:     memory operand; eight 32-bit stores (movd) are issued at
 *          dst + 0, 16, 32, 48, 64, 80, 96 and 112. Each store holds two
 *          saturated 16-bit results.
 * shift:   immediate used by every psrad before packing.
 *
 * With no src4 contribution, A0 and A3 are both the C4R4+C4R0 product
 * (mm4, copied to mm6) and A1 and A2 are both the -C4R4+C4R0 product
 * (mm0, copied to mm5).
 *
 * Coefficient vectors are read from offsets 16, 24 and 48..104 of operand %2.
 * Clobbers mm0-mm7.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
939 bb270c08 Diego Biurrun
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
940
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
941
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
942
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
943
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
944
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
945
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
946
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
947
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
948
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
949
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
950
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
951
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
952
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
953
        "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
954
        "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
955
        "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
956
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
957
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
958
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
959
        "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
960
        "psrad $" #shift ", %%mm7       \n\t"\
961
        "psrad $" #shift ", %%mm4       \n\t"\
962
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
963
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
964
        "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
965
        "psrad $" #shift ", %%mm0       \n\t"\
966
        "psrad $" #shift ", %%mm2       \n\t"\
967
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
968
        "movd %%mm7, " #dst "           \n\t"\
969
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
970
        "movd %%mm0, 16+" #dst "        \n\t"\
971
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
972
        "movd %%mm2, 96+" #dst "        \n\t"\
973
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
974
        "movd %%mm4, 112+" #dst "       \n\t"\
975
        "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
976
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
977
        "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
978
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
979
        "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
980
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
981
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
982
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
983
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
984
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
985
        "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
986
        "psrad $" #shift ", %%mm2       \n\t"\
987
        "psrad $" #shift ", %%mm5       \n\t"\
988
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
989
        "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
990
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
991
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
992
        "psrad $" #shift ", %%mm6       \n\t"\
993
        "psrad $" #shift ", %%mm4       \n\t"\
994
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
995
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
996
        "movd %%mm2, 32+" #dst "        \n\t"\
997
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
998
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
999
        "movd %%mm6, 48+" #dst "        \n\t"\
1000
        "movd %%mm4, 64+" #dst "        \n\t"\
1001
        "movd %%mm5, 80+" #dst "        \n\t"
1002 0a8d8945 Michael Niedermayer
1003 347be472 John Dalgliesh
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1004
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1005
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1006
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1007
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1008 bb270c08 Diego Biurrun
        "jmp 9f                         \n\t"
1009 37e8dcda Arpi
1010 4454dc1b John Dalgliesh
        "#" ASMALIGN(4)                      \
1011 bb270c08 Diego Biurrun
        "3:                             \n\t"
1012 0a8d8945 Michael Niedermayer
#undef IDCT
1013 347be472 John Dalgliesh
/*
 * IDCT() variant expanded for the code path that follows label 3 above.
 *
 * Only src0 and src1 are stringified into the instruction stream; src4 and
 * src5 are accepted so that the invocation shape matches the other variants,
 * but they are never read here (presumably because those inputs are known to
 * be zero on this path -- confirm against the branch logic earlier in the
 * asm statement).
 *
 * With src4 absent, the even part consists of the two C4 products only: the
 * sum (mm4, copied to mm6) is used as A0 and A3, the difference (mm0, copied
 * to mm5) as A1 and A2.  The odd terms B0..B3 are src1 multiplied by the
 * constants at 48, 64, 80 and 96(%2).
 *
 * Every sum/difference is shifted right arithmetically by `shift`, packed to
 * signed 16 bit with saturation (packssdw) and stored with movd, i.e. 32 bits
 * per store, to dst + 0, 16, 32, 48, 64, 80, 96 and 112.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
1014 bb270c08 Diego Biurrun
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1015
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
1016
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1017
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1018
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1019
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1020
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1021
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
1022
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1023
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1024
        "movq 64(%2), %%mm3             \n\t"\
1025
        "pmaddwd %%mm2, %%mm3           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1026
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
1027
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
1028
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
1029
        "psrad $" #shift ", %%mm7       \n\t"\
1030
        "psrad $" #shift ", %%mm4       \n\t"\
1031
        "movq %%mm0, %%mm1              \n\t" /* A1             a1 */\
1032
        "paddd %%mm3, %%mm0             \n\t" /* A1+B1          a1+b1 */\
1033
        "psubd %%mm3, %%mm1             \n\t" /* A1-B1          a1-b1 */\
1034
        "psrad $" #shift ", %%mm0       \n\t"\
1035
        "psrad $" #shift ", %%mm1       \n\t"\
1036
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
1037
        "movd %%mm7, " #dst "           \n\t"\
1038
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
1039
        "movd %%mm0, 16+" #dst "        \n\t"\
1040
        "packssdw %%mm1, %%mm1          \n\t" /* A1-B1  a1-b1 */\
1041
        "movd %%mm1, 96+" #dst "        \n\t"\
1042
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
1043
        "movd %%mm4, 112+" #dst "       \n\t"\
1044
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
1045
        "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1046
        "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1047
        "movq %%mm5, %%mm1              \n\t" /* A2             a2 */\
1048
        "paddd %%mm4, %%mm1             \n\t" /* A2+B2          a2+b2 */\
1049
        "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
1050
        "psrad $" #shift ", %%mm1       \n\t"\
1051
        "psrad $" #shift ", %%mm5       \n\t"\
1052
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
1053
        "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
1054
        "psubd %%mm2, %%mm4             \n\t" /* a3-B3          a3-b3 */\
1055
        "psrad $" #shift ", %%mm6       \n\t"\
1056
        "psrad $" #shift ", %%mm4       \n\t"\
1057
        "packssdw %%mm1, %%mm1          \n\t" /* A2+B2  a2+b2 */\
1058
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1059
        "movd %%mm1, 32+" #dst "        \n\t"\
1060
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
1061
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1062
        "movd %%mm6, 48+" #dst "        \n\t"\
1063
        "movd %%mm4, 64+" #dst "        \n\t"\
1064
        "movd %%mm5, 80+" #dst "        \n\t"
1065 0a8d8945 Michael Niedermayer
1066
1067 347be472 John Dalgliesh
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1068
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1069
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1070
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1071
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1072 bb270c08 Diego Biurrun
        "jmp 9f                         \n\t"
1073 37e8dcda Arpi
1074 4454dc1b John Dalgliesh
        "#" ASMALIGN(4)                      \
1075 bb270c08 Diego Biurrun
        "5:                             \n\t"
1076 0a8d8945 Michael Niedermayer
#undef IDCT
1077 347be472 John Dalgliesh
/*
 * IDCT() variant expanded for the code path that follows label 5 above.
 *
 * Only src0 and src4 are read (src1 and src5 are ignored), so no odd terms
 * are computed: the outputs are the even terms A0..A3 alone.
 *
 * Each expansion processes two quadwords per input (src and 8+src), packs
 * the two halves together and stores full 64-bit results with movq.  This is
 * why only two of the four invocations below the macro are active.
 *
 * Because there is no odd part, the stores are mirrored:
 *   A0 -> dst      and 112+dst
 *   A1 -> 16+dst   and  96+dst
 *   A2 -> 32+dst   and  80+dst
 *   A3 -> 48+dst   and  64+dst
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
1078 bb270c08 Diego Biurrun
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1079
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
1080
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1081
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1082
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1083
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1084
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
1085
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1086
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
1087
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1088
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1089
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
1090
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
1091
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1092
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
1093
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
1094
        "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
1095
        "movq 8+" #src4 ", %%mm3        \n\t" /* R6     R2      r6      r2 */\
1096
        "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
1097
        "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1098
        "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
1099
        "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1100
        "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 */\
1101
        "pmaddwd %%mm3, %%mm7           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1102
        "pmaddwd 40(%2), %%mm3          \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1103
        "paddd %%mm1, %%mm7             \n\t" /* A0             a0 */\
1104
        "paddd %%mm1, %%mm1             \n\t" /* 2C0            2c0 */\
1105
        "psubd %%mm7, %%mm1             \n\t" /* A3             a3 */\
1106
        "paddd %%mm2, %%mm3             \n\t" /* A1             a1 */\
1107
        "paddd %%mm2, %%mm2             \n\t" /* 2C1            2c1 */\
1108
        "psubd %%mm3, %%mm2             \n\t" /* A2             a2 */\
1109
        "psrad $" #shift ", %%mm4       \n\t"\
1110
        "psrad $" #shift ", %%mm7       \n\t"\
1111
        "psrad $" #shift ", %%mm3       \n\t"\
1112
        "packssdw %%mm7, %%mm4          \n\t" /* A0     a0 */\
1113
        "movq %%mm4, " #dst "           \n\t"\
1114
        "psrad $" #shift ", %%mm0       \n\t"\
1115
        "packssdw %%mm3, %%mm0          \n\t" /* A1     a1 */\
1116
        "movq %%mm0, 16+" #dst "        \n\t"\
1117
        "movq %%mm0, 96+" #dst "        \n\t"\
1118
        "movq %%mm4, 112+" #dst "       \n\t"\
1119
        "psrad $" #shift ", %%mm5       \n\t"\
1120
        "psrad $" #shift ", %%mm6       \n\t"\
1121
        "psrad $" #shift ", %%mm2       \n\t"\
1122
        "packssdw %%mm2, %%mm5          \n\t" /* A2     a2 */\
1123
        "movq %%mm5, 32+" #dst "        \n\t"\
1124
        "psrad $" #shift ", %%mm1       \n\t"\
1125
        "packssdw %%mm1, %%mm6          \n\t" /* A3     a3 */\
1126
        "movq %%mm6, 48+" #dst "        \n\t"\
1127
        "movq %%mm6, 64+" #dst "        \n\t"\
1128
        "movq %%mm5, 80+" #dst "        \n\t"
1129 115329f1 Diego Biurrun
1130 0a8d8945 Michael Niedermayer
1131 347be472 John Dalgliesh
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1132
IDCT(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1133
//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1134
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1135
//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1136 bb270c08 Diego Biurrun
        "jmp 9f                         \n\t"
1137 37e8dcda Arpi
1138
1139 4454dc1b John Dalgliesh
        "#" ASMALIGN(4)                      \
1140 bb270c08 Diego Biurrun
        "1:                             \n\t"
1141 0a8d8945 Michael Niedermayer
#undef IDCT
1142 347be472 John Dalgliesh
/*
 * IDCT() variant expanded for the code path that follows label 1 above.
 *
 * Reads src0, src4 and src1; src5 is accepted but never referenced
 * (presumably known to be zero on this path -- confirm against the branch
 * logic earlier in the asm statement).
 *
 * The even terms A0..A3 are built from src0 (C4 products) and src4 (C2/C6
 * products); the odd terms B0..B3 are src1 multiplied by the constants at
 * 48, 64, 80 and 96(%2).
 *
 * Every A+B / A-B value is shifted right arithmetically by `shift`, packed
 * to signed 16 bit with saturation (packssdw) and stored with movd, i.e.
 * 32 bits per store, to dst + 0, 16, 32, 48, 64, 80, 96 and 112.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
1143 bb270c08 Diego Biurrun
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1144
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
1145
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
1146
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1147
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1148
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1149
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1150
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
1151
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1152
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
1153
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1154
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1155
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
1156
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1157
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
1158
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
1159
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1160
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
1161
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
1162
        "movq 64(%2), %%mm1             \n\t"\
1163
        "pmaddwd %%mm2, %%mm1           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1164
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
1165
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
1166
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
1167
        "psrad $" #shift ", %%mm7       \n\t"\
1168
        "psrad $" #shift ", %%mm4       \n\t"\
1169
        "movq %%mm0, %%mm3              \n\t" /* A1             a1 */\
1170
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
1171
        "psubd %%mm1, %%mm3             \n\t" /* A1-B1          a1-b1 */\
1172
        "psrad $" #shift ", %%mm0       \n\t"\
1173
        "psrad $" #shift ", %%mm3       \n\t"\
1174
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
1175
        "movd %%mm7, " #dst "           \n\t"\
1176
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
1177
        "movd %%mm0, 16+" #dst "        \n\t"\
1178
        "packssdw %%mm3, %%mm3          \n\t" /* A1-B1  a1-b1 */\
1179
        "movd %%mm3, 96+" #dst "        \n\t"\
1180
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
1181
        "movd %%mm4, 112+" #dst "       \n\t"\
1182
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
1183
        "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1184
        "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1185
        "movq %%mm5, %%mm3              \n\t" /* A2             a2 */\
1186
        "paddd %%mm4, %%mm3             \n\t" /* A2+B2          a2+b2 */\
1187
        "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
1188
        "psrad $" #shift ", %%mm3       \n\t"\
1189
        "psrad $" #shift ", %%mm5       \n\t"\
1190
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
1191
        "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
1192
        "psubd %%mm2, %%mm4             \n\t" /* a3-B3          a3-b3 */\
1193
        "psrad $" #shift ", %%mm6       \n\t"\
1194
        "packssdw %%mm3, %%mm3          \n\t" /* A2+B2  a2+b2 */\
1195
        "movd %%mm3, 32+" #dst "        \n\t"\
1196
        "psrad $" #shift ", %%mm4       \n\t"\
1197
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1198
        "movd %%mm6, 48+" #dst "        \n\t"\
1199
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
1200
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1201
        "movd %%mm4, 64+" #dst "        \n\t"\
1202
        "movd %%mm5, 80+" #dst "        \n\t"
1203 115329f1 Diego Biurrun
1204 0a8d8945 Michael Niedermayer
1205 347be472 John Dalgliesh
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1206
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1207
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1208
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1209
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1210 bb270c08 Diego Biurrun
        "jmp 9f                         \n\t"
1211 37e8dcda Arpi
1212
1213 4454dc1b John Dalgliesh
        "#" ASMALIGN(4)
1214 bb270c08 Diego Biurrun
        "7:                             \n\t"
1215 0a8d8945 Michael Niedermayer
#undef IDCT
1216 347be472 John Dalgliesh
/*
 * IDCT() variant expanded for the code path that follows label 7 above.
 *
 * Only src0 (and 8+src0) is read; src4, src1 and src5 are ignored.  The
 * only arithmetic left is the pair of C4 products, shifted right by `shift`
 * and packed to signed 16 bit with saturation:
 *   mm4 = C4R4+C4R0  -> stored to dst, 48+dst, 64+dst and 112+dst
 *   mm0 = -C4R4+C4R0 -> stored to 16+dst, 32+dst, 80+dst and 96+dst
 *
 * Each expansion stores full 64-bit results with movq, which is why only
 * two of the four invocations below the macro are active.
 *
 * NOTE(review): the load of 32(%2) into mm7 has no consumer inside this
 * macro; it looks like a leftover from the variant with a src4 term --
 * confirm before removing.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
1217 bb270c08 Diego Biurrun
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1218
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1219
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1220
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1221
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1222
        "psrad $" #shift ", %%mm4       \n\t"\
1223
        "psrad $" #shift ", %%mm0       \n\t"\
1224
        "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
1225
        "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
1226
        "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1227
        "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
1228
        "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1229
        "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 (mm7 is not read again in this macro) */\
1230
        "psrad $" #shift ", %%mm1       \n\t"\
1231
        "packssdw %%mm1, %%mm4          \n\t" /* A0     a0 */\
1232
        "movq %%mm4, " #dst "           \n\t"\
1233
        "psrad $" #shift ", %%mm2       \n\t"\
1234
        "packssdw %%mm2, %%mm0          \n\t" /* A1     a1 */\
1235
        "movq %%mm0, 16+" #dst "        \n\t"\
1236
        "movq %%mm0, 96+" #dst "        \n\t"\
1237
        "movq %%mm4, 112+" #dst "       \n\t"\
1238
        "movq %%mm0, 32+" #dst "        \n\t"\
1239
        "movq %%mm4, 48+" #dst "        \n\t"\
1240
        "movq %%mm4, 64+" #dst "        \n\t"\
1241
        "movq %%mm0, 80+" #dst "        \n\t"
1242 0a8d8945 Michael Niedermayer
1243 347be472 John Dalgliesh
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1244
IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1245
//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1246
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1247
//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1248 37e8dcda Arpi
1249
1250
#endif
1251
1252
/*
1253
Input
1254 0a8d8945 Michael Niedermayer
 00 40 04 44 20 60 24 64
1255
 10 30 14 34 50 70 54 74
1256
 01 41 03 43 21 61 23 63
1257 37e8dcda Arpi
 11 31 13 33 51 71 53 73
1258 0a8d8945 Michael Niedermayer
 02 42 06 46 22 62 26 66
1259
 12 32 16 36 52 72 56 76
1260
 05 45 07 47 25 65 27 67
1261
 15 35 17 37 55 75 57 77
1262 115329f1 Diego Biurrun

1263 37e8dcda Arpi
Temp
1264 0a8d8945 Michael Niedermayer
 00 04 10 14 20 24 30 34
1265
 40 44 50 54 60 64 70 74
1266 37e8dcda Arpi
 01 03 11 13 21 23 31 33
1267
 41 43 51 53 61 63 71 73
1268 0a8d8945 Michael Niedermayer
 02 06 12 16 22 26 32 36
1269
 42 46 52 56 62 66 72 76
1270 37e8dcda Arpi
 05 07 15 17 25 27 35 37
1271
 45 47 55 57 65 67 75 77
1272
*/
1273
1274
"9: \n\t"
1275 bb270c08 Diego Biurrun
                :: "r" (block), "r" (temp), "r" (coeffs)
1276
                : "%eax"
1277
        );
1278 37e8dcda Arpi
}
1279
1280 2ad1516a Michael Niedermayer
/**
 * Run the MMX simple IDCT on one coefficient block.
 *
 * Public entry point that forwards straight to the file-local idct();
 * the result is left in @p block itself.
 *
 * @param block coefficients to transform; overwritten with the result
 */
void ff_simple_idct_mmx(int16_t *block)
{
    idct(block);
}
1284
1285
//FIXME merge add/put into the idct
1286
1287 0c1a9eda Zdenek Kabelac
/**
 * Inverse-transform @p block with the MMX simple IDCT, then pass the result
 * to put_pixels_clamped_mmx().
 *
 * @param dest      destination handed through to put_pixels_clamped_mmx()
 * @param line_size stride argument handed through unchanged (presumably the
 *                  distance between destination rows -- confirm against
 *                  put_pixels_clamped_mmx())
 * @param block     coefficient block; transformed in place before the store
 */
void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    /* Step 1: transform the coefficients in place. */
    idct(block);

    /* Step 2: let the shared helper write the transformed block to dest. */
    put_pixels_clamped_mmx(block, dest, line_size);
}
1292 0c1a9eda Zdenek Kabelac
/**
 * Inverse-transform @p block with the MMX simple IDCT, then pass the result
 * to add_pixels_clamped_mmx().
 *
 * @param dest      destination handed through to add_pixels_clamped_mmx()
 * @param line_size stride argument handed through unchanged (presumably the
 *                  distance between destination rows -- confirm against
 *                  add_pixels_clamped_mmx())
 * @param block     coefficient block; transformed in place before the add
 */
void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    /* Step 1: transform the coefficients in place. */
    idct(block);

    /* Step 2: let the shared helper combine the transformed block with dest. */
    add_pixels_clamped_mmx(block, dest, line_size);
}