Statistics
| Branch: | Revision:

ffmpeg / libavcodec / i386 / simple_idct_mmx.c @ c4ff7c53

History | View | Annotate | Download (71 KB)

1 37e8dcda Arpi
/*
2 ff4ec49e Fabrice Bellard
 * Simple IDCT MMX
3
 *
4
 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
5
 *
6 b78e7197 Diego Biurrun
 * This file is part of FFmpeg.
7
 *
8
 * FFmpeg is free software; you can redistribute it and/or
9 ff4ec49e Fabrice Bellard
 * modify it under the terms of the GNU Lesser General Public
10
 * License as published by the Free Software Foundation; either
11 b78e7197 Diego Biurrun
 * version 2.1 of the License, or (at your option) any later version.
12 ff4ec49e Fabrice Bellard
 *
13 b78e7197 Diego Biurrun
 * FFmpeg is distributed in the hope that it will be useful,
14 ff4ec49e Fabrice Bellard
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
 * Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public
19 b78e7197 Diego Biurrun
 * License along with FFmpeg; if not, write to the Free Software
20 5509bffa Diego Biurrun
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ff4ec49e Fabrice Bellard
 */
22 245976da Diego Biurrun
#include "libavcodec/dsputil.h"
23
#include "libavcodec/simple_idct.h"
24 ff4ec49e Fabrice Bellard
25 9e1795dd Michael Niedermayer
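/*
 * Unrounded values of cos(i*M_PI/16)*sqrt(2)*(1<<14) for i = 0..7,
 * from which the C0..C7 constants are derived:
 *   i = 0: 23170.475006
 *   i = 1: 22725.260826
 *   i = 2: 21406.727617
 *   i = 3: 19265.545870
 *   i = 4: 16384.000000
 *   i = 5: 12872.826198
 *   i = 6: 8866.956905
 *   i = 7: 4520.335430
 */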
35 37e8dcda Arpi
/*
 * Fixed-point cosine constants for the IDCT:
 *   Ci = cos(i*M_PI/16)*sqrt(2)*(1<<14)
 * rounded to the nearest integer, except C4, which is taken one below its
 * exact value of 16384 (the "- 0.5" variant selected by the #else below).
 */
#define C0 23170 // cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C1 22725 // cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C2 21407 // cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C3 19266 // cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#if 0
#define C4 16384 // cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#else
#define C4 16383 // cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5
#endif
#define C5 12873 // cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C6 8867  // cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C7 4520  // cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5

/* Right shifts applied to the 32-bit sums of the row and column passes. */
#define ROW_SHIFT 11
#define COL_SHIFT 20 // 6 (presumably 20 minus the 14 coefficient bits; the original note is just "6")
50
51 766324fc Reimar Döffinger
DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
52
DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
53 41338ac0 Michael Niedermayer
54 038f0f9b Reimar Döffinger
/*
 * Constant table read by the inline asm through fixed byte offsets
 * ("N(%2)"; presumably %2 is this table -- the asm operand list is not
 * visible from here, confirm before reordering anything).
 *
 * Every row is one 8-byte quadword of four int16_t.  The first element of a
 * row is the least significant word, so the rows read right-to-left compared
 * with the register-layout comments in the asm macros (e.g. the row at
 * offset 24 is "C4, -C4, C4, -C4" here and "-C4 C4 -C4 C4" there).
 * The byte offset of each row is given in the leading comment; do not insert,
 * remove or reorder rows without updating the asm.
 */
DECLARE_ALIGNED(8, static const int16_t, coeffs[])= {
        /*   0 */ 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,    // rounder for the row pass
        /*   8 */ 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,    // same, plus a 1 in the second word:
                  // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)

        /*  16 */  C4,  C4,  C4,  C4,
        /*  24 */  C4, -C4,  C4, -C4,

        /*  32 */  C2,  C6,  C2,  C6,
        /*  40 */  C6, -C2,  C6, -C2,

        /*  48 */  C1,  C3,  C1,  C3,
        /*  56 */  C5,  C7,  C5,  C7,

        /*  64 */  C3, -C7,  C3, -C7,
        /*  72 */ -C1, -C5, -C1, -C5,

        /*  80 */  C5, -C1,  C5, -C1,
        /*  88 */  C7,  C3,  C7,  C3,

        /*  96 */  C7, -C5,  C7, -C5,
        /* 104 */  C3, -C1,  C3, -C1
};
81
82 ef5b1b5a Juanjo
#if 0
83 0a8d8945 Michael Niedermayer
static void unused_var_killer(){
84 bb270c08 Diego Biurrun
        int a= wm1010 + d40000;
85
        temp[0]=a;
86 0a8d8945 Michael Niedermayer
}
87

88 37e8dcda Arpi
static void inline idctCol (int16_t * col, int16_t *input)
89
{
90
#undef C0
91
#undef C1
92
#undef C2
93
#undef C3
94
#undef C4
95
#undef C5
96
#undef C6
97
#undef C7
98 bb270c08 Diego Biurrun
        int a0, a1, a2, a3, b0, b1, b2, b3;
99
        const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
100
        const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
101
        const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
102
        const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
103
        const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
104
        const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
105
        const int C6 = 8867;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
106
        const int C7 = 4520;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
107 37e8dcda Arpi
/*
108 bb270c08 Diego Biurrun
        if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
109
                col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
110
                        col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
111
                return;
112
        }*/
113 37e8dcda Arpi

114
col[8*0] = input[8*0 + 0];
115
col[8*1] = input[8*2 + 0];
116
col[8*2] = input[8*0 + 1];
117
col[8*3] = input[8*2 + 1];
118
col[8*4] = input[8*4 + 0];
119
col[8*5] = input[8*6 + 0];
120
col[8*6] = input[8*4 + 1];
121
col[8*7] = input[8*6 + 1];
122

123 bb270c08 Diego Biurrun
        a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
124
        a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
125
        a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
126
        a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
127

128
        b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
129
        b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
130
        b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
131
        b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
132

133
        col[8*0] = (a0 + b0) >> COL_SHIFT;
134
        col[8*1] = (a1 + b1) >> COL_SHIFT;
135
        col[8*2] = (a2 + b2) >> COL_SHIFT;
136
        col[8*3] = (a3 + b3) >> COL_SHIFT;
137
        col[8*4] = (a3 - b3) >> COL_SHIFT;
138
        col[8*5] = (a2 - b2) >> COL_SHIFT;
139
        col[8*6] = (a1 - b1) >> COL_SHIFT;
140
        col[8*7] = (a0 - b0) >> COL_SHIFT;
141 37e8dcda Arpi
}
142

143
static void inline idctRow (int16_t * output, int16_t * input)
144
{
145 bb270c08 Diego Biurrun
        int16_t row[8];
146

147
        int a0, a1, a2, a3, b0, b1, b2, b3;
148
        const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
149
        const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
150
        const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
151
        const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
152
        const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
153
        const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
154
        const int C6 = 8867;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
155
        const int C7 = 4520;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
156 37e8dcda Arpi

157
row[0] = input[0];
158
row[2] = input[1];
159
row[4] = input[4];
160
row[6] = input[5];
161
row[1] = input[8];
162
row[3] = input[9];
163
row[5] = input[12];
164
row[7] = input[13];
165

166 bb270c08 Diego Biurrun
        if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
167
                row[0] = row[1] = row[2] = row[3] = row[4] =
168
                        row[5] = row[6] = row[7] = row[0]<<3;
169
        output[0]  = row[0];
170
        output[2]  = row[1];
171
        output[4]  = row[2];
172
        output[6]  = row[3];
173
        output[8]  = row[4];
174
        output[10] = row[5];
175
        output[12] = row[6];
176
        output[14] = row[7];
177
                return;
178
        }
179

180
        a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
181
        a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
182
        a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
183
        a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
184

185
        b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
186
        b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
187
        b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
188
        b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
189

190
        row[0] = (a0 + b0) >> ROW_SHIFT;
191
        row[1] = (a1 + b1) >> ROW_SHIFT;
192
        row[2] = (a2 + b2) >> ROW_SHIFT;
193
        row[3] = (a3 + b3) >> ROW_SHIFT;
194
        row[4] = (a3 - b3) >> ROW_SHIFT;
195
        row[5] = (a2 - b2) >> ROW_SHIFT;
196
        row[6] = (a1 - b1) >> ROW_SHIFT;
197
        row[7] = (a0 - b0) >> ROW_SHIFT;
198

199
        output[0]  = row[0];
200
        output[2]  = row[1];
201
        output[4]  = row[2];
202
        output[6]  = row[3];
203
        output[8]  = row[4];
204
        output[10] = row[5];
205
        output[12] = row[6];
206
        output[14] = row[7];
207 37e8dcda Arpi
}
208
#endif
209
210
static inline void idct(int16_t *block)
211
{
212 27215c6b Reimar Döffinger
        DECLARE_ALIGNED(8, int64_t, align_tmp[16]);
213 bb270c08 Diego Biurrun
        int16_t * const temp= (int16_t*)align_tmp;
214 41338ac0 Michael Niedermayer
215 be449fca Diego Pettenò
        __asm__ volatile(
216 37e8dcda Arpi
#if 0 //Alternative, simpler variant
217 0a8d8945 Michael Niedermayer

218
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
219 bb270c08 Diego Biurrun
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
220
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
221
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
222
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
223
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
224
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
225
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
226
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
227
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
228
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
229
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
230
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
231
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
232
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
233
        #rounder ", %%mm4               \n\t"\
234
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
235
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
236
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
237
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
238
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
239
        #rounder ", %%mm0               \n\t"\
240
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
241
        "paddd %%mm0, %%mm0             \n\t" \
242
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
243
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
244
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
245
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
246
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
247
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
248
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
249
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
250
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
251
        "psrad $" #shift ", %%mm7       \n\t"\
252
        "psrad $" #shift ", %%mm4       \n\t"\
253
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
254
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
255
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
256
        "psrad $" #shift ", %%mm1       \n\t"\
257
        "psrad $" #shift ", %%mm2       \n\t"\
258
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
259
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
260
        "movq %%mm7, " #dst "           \n\t"\
261
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
262
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
263
        "movq %%mm2, 24+" #dst "        \n\t"\
264
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
265
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
266
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
267
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
268
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
269
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
270
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
271
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
272
        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
273
        "psrad $" #shift ", %%mm2       \n\t"\
274
        "psrad $" #shift ", %%mm0       \n\t"\
275
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
276
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
277
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
278
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
279
        "psrad $" #shift ", %%mm6       \n\t"\
280
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
281
        "movq %%mm2, 8+" #dst "         \n\t"\
282
        "psrad $" #shift ", %%mm4       \n\t"\
283
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
284
        "movq %%mm4, 16+" #dst "        \n\t"\
285 0a8d8945 Michael Niedermayer

286 347be472 John Dalgliesh
#define COL_IDCT(src0, src4, src1, src5, dst, shift) \
287 bb270c08 Diego Biurrun
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
288
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
289
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
290
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
291
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
292
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
293
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
294
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
295
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
296
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
297
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
298
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
299
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
300
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
301
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
302
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
303
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
304
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
305
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
306
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
307
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
308
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
309
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
310
        "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
311
        "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
312
        "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
313
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
314
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
315
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
316
        "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
317
        "psrad $" #shift ", %%mm7       \n\t"\
318
        "psrad $" #shift ", %%mm4       \n\t"\
319
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
320
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
321
        "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
322
        "psrad $" #shift ", %%mm0       \n\t"\
323
        "psrad $" #shift ", %%mm2       \n\t"\
324
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
325
        "movd %%mm7, " #dst "           \n\t"\
326
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
327
        "movd %%mm0, 16+" #dst "        \n\t"\
328
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
329
        "movd %%mm2, 96+" #dst "        \n\t"\
330
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
331
        "movd %%mm4, 112+" #dst "       \n\t"\
332
        "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
333
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
334
        "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
335
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
336
        "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
337
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
338
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
339
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
340
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
341
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
342
        "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
343
        "psrad $" #shift ", %%mm2       \n\t"\
344
        "psrad $" #shift ", %%mm5       \n\t"\
345
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
346
        "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
347
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
348
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
349
        "psrad $" #shift ", %%mm6       \n\t"\
350
        "psrad $" #shift ", %%mm4       \n\t"\
351
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
352
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
353
        "movd %%mm2, 32+" #dst "        \n\t"\
354
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
355
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
356
        "movd %%mm6, 48+" #dst "        \n\t"\
357
        "movd %%mm4, 64+" #dst "        \n\t"\
358
        "movd %%mm5, 80+" #dst "        \n\t"\
359 0a8d8945 Michael Niedermayer

360 115329f1 Diego Biurrun

361 0a8d8945 Michael Niedermayer
#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
362 bb270c08 Diego Biurrun
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
363
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
364
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
365
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
366
        "movq "MANGLE(wm1010)", %%mm4   \n\t"\
367
        "pand %%mm0, %%mm4              \n\t"\
368
        "por %%mm1, %%mm4               \n\t"\
369
        "por %%mm2, %%mm4               \n\t"\
370
        "por %%mm3, %%mm4               \n\t"\
371
        "packssdw %%mm4,%%mm4           \n\t"\
372
        "movd %%mm4, %%eax              \n\t"\
373
        "orl %%eax, %%eax               \n\t"\
374
        "jz 1f                          \n\t"\
375
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
376
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
377
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
378
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
379
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
380
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
381
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
382
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
383
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
384
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
385
        #rounder ", %%mm4               \n\t"\
386
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
387
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
388
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
389
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
390
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
391
        #rounder ", %%mm0               \n\t"\
392
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
393
        "paddd %%mm0, %%mm0             \n\t" \
394
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
395
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
396
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
397
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
398
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
399
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
400
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
401
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
402
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
403
        "psrad $" #shift ", %%mm7       \n\t"\
404
        "psrad $" #shift ", %%mm4       \n\t"\
405
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
406
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
407
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
408
        "psrad $" #shift ", %%mm1       \n\t"\
409
        "psrad $" #shift ", %%mm2       \n\t"\
410
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
411
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
412
        "movq %%mm7, " #dst "           \n\t"\
413
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
414
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
415
        "movq %%mm2, 24+" #dst "        \n\t"\
416
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
417
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
418
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
419
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
420
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
421
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
422
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
423
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
424
        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
425
        "psrad $" #shift ", %%mm2       \n\t"\
426
        "psrad $" #shift ", %%mm0       \n\t"\
427
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
428
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
429
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
430
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
431
        "psrad $" #shift ", %%mm6       \n\t"\
432
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
433
        "movq %%mm2, 8+" #dst "         \n\t"\
434
        "psrad $" #shift ", %%mm4       \n\t"\
435
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
436
        "movq %%mm4, 16+" #dst "        \n\t"\
437
        "jmp 2f                         \n\t"\
438
        "1:                             \n\t"\
439
        "pslld $16, %%mm0               \n\t"\
440
        "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
441
        "psrad $13, %%mm0               \n\t"\
442
        "packssdw %%mm0, %%mm0          \n\t"\
443
        "movq %%mm0, " #dst "           \n\t"\
444
        "movq %%mm0, 8+" #dst "         \n\t"\
445
        "movq %%mm0, 16+" #dst "        \n\t"\
446
        "movq %%mm0, 24+" #dst "        \n\t"\
447
        "2:                             \n\t"
448 37e8dcda Arpi

449

450 0a8d8945 Michael Niedermayer
//IDCT(      src0,   src4,   src1,   src5,    dst,    rounder, shift)
451
ROW_IDCT(    (%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
452
/*ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
453
ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
454
ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
455

456
DC_COND_ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
457
DC_COND_ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
458
DC_COND_ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
459

460

461 347be472 John Dalgliesh
//IDCT(      src0,   src4,   src1,    src5,    dst, shift)
462
COL_IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
463
COL_IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
464
COL_IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
465
COL_IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
466 37e8dcda Arpi

467 0a8d8945 Michael Niedermayer
#else
468
469
#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
470 bb270c08 Diego Biurrun
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
471
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
472
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
473
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
474
        "movq "MANGLE(wm1010)", %%mm4   \n\t"\
475
        "pand %%mm0, %%mm4              \n\t"\
476
        "por %%mm1, %%mm4               \n\t"\
477
        "por %%mm2, %%mm4               \n\t"\
478
        "por %%mm3, %%mm4               \n\t"\
479
        "packssdw %%mm4,%%mm4           \n\t"\
480
        "movd %%mm4, %%eax              \n\t"\
481
        "orl %%eax, %%eax               \n\t"\
482
        "jz 1f                          \n\t"\
483
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
484
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
485
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
486
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
487
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
488
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
489
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
490
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
491
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
492
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
493
        #rounder ", %%mm4               \n\t"\
494
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
495
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
496
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
497
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
498
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
499
        #rounder ", %%mm0               \n\t"\
500
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
501
        "paddd %%mm0, %%mm0             \n\t" \
502
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
503
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
504
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
505
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
506
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
507
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
508
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
509
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
510
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
511
        "psrad $" #shift ", %%mm7       \n\t"\
512
        "psrad $" #shift ", %%mm4       \n\t"\
513
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
514
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
515
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
516
        "psrad $" #shift ", %%mm1       \n\t"\
517
        "psrad $" #shift ", %%mm2       \n\t"\
518
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
519
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
520
        "movq %%mm7, " #dst "           \n\t"\
521
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
522
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
523
        "movq %%mm2, 24+" #dst "        \n\t"\
524
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
525
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
526
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
527
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
528
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
529
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
530
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
531
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
532
        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
533
        "psrad $" #shift ", %%mm2       \n\t"\
534
        "psrad $" #shift ", %%mm0       \n\t"\
535
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
536
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
537
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
538
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
539
        "psrad $" #shift ", %%mm6       \n\t"\
540
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
541
        "movq %%mm2, 8+" #dst "         \n\t"\
542
        "psrad $" #shift ", %%mm4       \n\t"\
543
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
544
        "movq %%mm4, 16+" #dst "        \n\t"\
545
        "jmp 2f                         \n\t"\
546
        "1:                             \n\t"\
547
        "pslld $16, %%mm0               \n\t"\
548
        "paddd "MANGLE(d40000)", %%mm0  \n\t"\
549
        "psrad $13, %%mm0               \n\t"\
550
        "packssdw %%mm0, %%mm0          \n\t"\
551
        "movq %%mm0, " #dst "           \n\t"\
552
        "movq %%mm0, 8+" #dst "         \n\t"\
553
        "movq %%mm0, 16+" #dst "        \n\t"\
554
        "movq %%mm0, 24+" #dst "        \n\t"\
555
        "2:                             \n\t"
556 37e8dcda Arpi
557 0a8d8945 Michael Niedermayer
/*
 * Z_COND_IDCT: 1-D IDCT pass with an "all inputs zero" early exit, emitted
 * as inline-asm text.
 *
 *   src0, src4, src1, src5  memory operands, each loaded as one quadword
 *                           into mm0..mm3 (word layout as annotated on the
 *                           four loads below)
 *   dst                     memory operand; four quadwords are stored at
 *                           dst, 8+dst, 16+dst and 24+dst
 *   rounder                 instruction text pasted in front of ", %%mm4"
 *                           and ", %%mm0" (the expansions in this file pass
 *                           "paddd (%2)"), applied before the even-part
 *                           sums A0..A3 are formed
 *   shift                   immediate count of the psrad applied to every
 *                           sum/difference before packing
 *   bt                      local label jumped to when the OR of the four
 *                           source quadwords is zero; nothing is stored to
 *                           dst on that path
 *
 * Coefficient pairs are read from the table addressed by operand %2
 * (offsets 16..104).  The zero test overwrites %eax, and mm0..mm7 are all
 * used as scratch.  Results are packed to 16 bit with signed saturation
 * (packssdw).
 */
#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
558 bb270c08 Diego Biurrun
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
559
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
560
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
561
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
562
        "movq %%mm0, %%mm4              \n\t"\
563
        "por %%mm1, %%mm4               \n\t"\
564
        "por %%mm2, %%mm4               \n\t"\
565
        "por %%mm3, %%mm4               \n\t"\
566
        "packssdw %%mm4,%%mm4           \n\t"\
567
        "movd %%mm4, %%eax              \n\t"\
568
        "orl %%eax, %%eax               \n\t"\
569
        "jz " #bt "                     \n\t"\
570
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
571
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
572
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
573
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
574
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
575
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
576
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
577
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
578
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
579
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
580
        #rounder ", %%mm4               \n\t"\
581
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
582
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
583
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
584
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
585
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
586
        #rounder ", %%mm0               \n\t"\
587
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
588
        "paddd %%mm0, %%mm0             \n\t" /* double mm0 so that mm0-A1 below yields A2 */ \
589
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
590
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
591
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
592
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
593
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
594
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
595
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
596
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
597
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
598
        "psrad $" #shift ", %%mm7       \n\t"\
599
        "psrad $" #shift ", %%mm4       \n\t"\
600
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
601
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
602
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
603
        "psrad $" #shift ", %%mm1       \n\t"\
604
        "psrad $" #shift ", %%mm2       \n\t"\
605
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
606
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
607
        "movq %%mm7, " #dst "           \n\t"\
608
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
609
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
610
        "movq %%mm2, 24+" #dst "        \n\t"\
611
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
612
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
613
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
614
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
615
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
616
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
617
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
618
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
619
        "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
620
        "psrad $" #shift ", %%mm2       \n\t"\
621
        "psrad $" #shift ", %%mm0       \n\t"\
622
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
623
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
624
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
625
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
626
        "psrad $" #shift ", %%mm6       \n\t"\
627
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
628
        "movq %%mm2, 8+" #dst "         \n\t"\
629
        "psrad $" #shift ", %%mm4       \n\t"\
630
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
631
        "movq %%mm4, 16+" #dst "        \n\t"\
632 0a8d8945 Michael Niedermayer
633
/*
 * ROW_IDCT: unconditional 1-D IDCT pass, emitted as inline-asm text.
 * The instruction sequence is the same as Z_COND_IDCT's, minus the zero
 * test and branch.
 *
 *   src0, src4, src1, src5  memory operands, each loaded as one quadword
 *                           into mm0..mm3 (word layout as annotated on the
 *                           four loads below)
 *   dst                     memory operand; four quadwords are stored at
 *                           dst, 8+dst, 16+dst and 24+dst
 *   rounder                 instruction text pasted in front of ", %%mm4"
 *                           and ", %%mm0" before the even-part sums A0..A3
 *                           are formed
 *   shift                   immediate count of the psrad applied to every
 *                           sum/difference before packing
 *
 * Coefficient pairs are read from the table addressed by operand %2
 * (offsets 16..104).  mm0..mm7 are all used as scratch; results are packed
 * to 16 bit with signed saturation (packssdw).
 */
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
634 bb270c08 Diego Biurrun
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
635
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
636
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
637
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
638
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
639
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
640
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
641
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
642
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
643
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
644
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
645
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
646
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
647
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
648
        #rounder ", %%mm4               \n\t"\
649
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
650
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
651
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
652
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
653
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
654
        #rounder ", %%mm0               \n\t"\
655
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
656
        "paddd %%mm0, %%mm0             \n\t" /* double mm0 so that mm0-A1 below yields A2 */ \
657
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
658
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
659
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
660
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
661
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
662
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
663
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
664
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
665
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
666
        "psrad $" #shift ", %%mm7       \n\t"\
667
        "psrad $" #shift ", %%mm4       \n\t"\
668
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
669
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
670
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
671
        "psrad $" #shift ", %%mm1       \n\t"\
672
        "psrad $" #shift ", %%mm2       \n\t"\
673
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
674
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
675
        "movq %%mm7, " #dst "           \n\t"\
676
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
677
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
678
        "movq %%mm2, 24+" #dst "        \n\t"\
679
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
680
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
681
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
682
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
683
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
684
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
685
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
686
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
687
        "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
688
        "psrad $" #shift ", %%mm2       \n\t"\
689
        "psrad $" #shift ", %%mm0       \n\t"\
690
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
691
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
692
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
693
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
694
        "psrad $" #shift ", %%mm6       \n\t"\
695
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
696
        "movq %%mm2, 8+" #dst "         \n\t"\
697
        "psrad $" #shift ", %%mm4       \n\t"\
698
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
699
        "movq %%mm4, 16+" #dst "        \n\t"\
700 0a8d8945 Michael Niedermayer
701
/*
 * First pass.  Each expansion reads four quadwords (32 bytes) from the
 * buffer addressed by operand %0 and stores four quadwords to the same
 * offsets of the buffer addressed by operand %1, using shift 11.
 * The first group goes through DC_COND_IDCT with rounder "paddd 8(%2)";
 * the other three use Z_COND_IDCT with rounder "paddd (%2)", whose extra
 * last argument is the local label taken when that input group is all
 * zero (4f, 2f and 1f respectively).
 */
//IDCT(         src0,   src4,   src1,   src5,    dst,   rounder, shift)
702
DC_COND_IDCT(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
703
Z_COND_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
704
Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
705
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
706
707
#undef IDCT
708 347be472 John Dalgliesh
/*
 * IDCT (variant using all four inputs): second 1-D pass, emitted as
 * inline-asm text.
 *
 *   src0, src4, src1, src5  memory operands, each loaded as one quadword
 *                           into mm0, mm1, mm2/mm0 and mm3
 *   dst                     memory operand; eight 32-bit stores (movd, two
 *                           saturated 16-bit results each) go to
 *                             dst      A0+B0      16+dst   A1+B1
 *                             32+dst   A2+B2      48+dst   A3+B3
 *                             64+dst   A3-B3      80+dst   A2-B2
 *                             96+dst   A1-B1     112+dst   A0-B0
 *   shift                   immediate count of the psrad applied to every
 *                           sum/difference before packing
 *
 * Unlike the first-pass macros there is no rounder argument and no
 * rounding add in the body.  Coefficient pairs come from the table
 * addressed by operand %2 (offsets 16..104); mm0..mm7 are all clobbered.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
709 bb270c08 Diego Biurrun
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
710
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
711
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
712
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
713
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
714
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
715
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
716
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
717
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
718
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
719
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
720
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
721
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
722
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
723
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
724
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
725
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
726
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
727
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
728
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
729
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
730
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
731
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
732
        "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
733
        "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
734
        "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
735
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
736
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
737
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
738
        "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
739
        "psrad $" #shift ", %%mm7       \n\t"\
740
        "psrad $" #shift ", %%mm4       \n\t"\
741
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
742
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
743
        "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
744
        "psrad $" #shift ", %%mm0       \n\t"\
745
        "psrad $" #shift ", %%mm2       \n\t"\
746
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
747
        "movd %%mm7, " #dst "           \n\t"\
748
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
749
        "movd %%mm0, 16+" #dst "        \n\t"\
750
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
751
        "movd %%mm2, 96+" #dst "        \n\t"\
752
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
753
        "movd %%mm4, 112+" #dst "       \n\t"\
754
        "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
755
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
756
        "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
757
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
758
        "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
759
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
760
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
761
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
762
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
763
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
764
        "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
765
        "psrad $" #shift ", %%mm2       \n\t"\
766
        "psrad $" #shift ", %%mm5       \n\t"\
767
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
768
        "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
769
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
770
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
771
        "psrad $" #shift ", %%mm6       \n\t"\
772
        "psrad $" #shift ", %%mm4       \n\t"\
773
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
774
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
775
        "movd %%mm2, 32+" #dst "        \n\t"\
776
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
777
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
778
        "movd %%mm6, 48+" #dst "        \n\t"\
779
        "movd %%mm4, 64+" #dst "        \n\t"\
780
        "movd %%mm5, 80+" #dst "        \n\t"
781 0a8d8945 Michael Niedermayer
782
783 347be472 John Dalgliesh
/*
 * Second pass for the path on which none of the first-pass Z_COND_IDCT
 * tests branched away.  Each expansion reads four quadwords from the buffer
 * at operand %1 and stores 32-bit results into the buffer at operand %0,
 * starting at byte offset 0, 4, 8 and 12 respectively, with shift 20.
 * Control then jumps to local label 9.
 */
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
784
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
785
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
786
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
787
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
788 bb270c08 Diego Biurrun
        "jmp 9f                         \n\t"
789 37e8dcda Arpi
790 4454dc1b John Dalgliesh
/*
 * Label 4: target of the zero test on 32(%0)..56(%0).  The two remaining
 * input groups are still run through Z_COND_IDCT, branching to 6f / 5f when
 * they are all zero.
 * NOTE(review): the "#" string in front of ASMALIGN(4) presumably turns the
 * alignment directive into an assembler comment - confirm against the
 * ASMALIGN definition.
 */
        "#" ASMALIGN(4)                      \
791 bb270c08 Diego Biurrun
        "4:                             \n\t"
792 0a8d8945 Michael Niedermayer
Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
793
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
794 37e8dcda Arpi
795 0a8d8945 Michael Niedermayer
#undef IDCT
796 347be472 John Dalgliesh
/*
 * IDCT (variant that never reads src1): second 1-D pass, emitted as
 * inline-asm text.  The signature is kept identical to the other IDCT
 * variants, but the src1 argument is unused: every term that would
 * multiply R1/R3 is left out, so B0..B3 are built from src5 alone.
 * In this file it is expanded on the path under label 4, i.e. after the
 * zero test on 32(%0)..56(%0) has branched (presumably because the
 * corresponding src1 data is then known to be zero - confirm).
 *
 *   src0, src4, src5  memory operands, each loaded as one quadword
 *   dst               memory operand; eight 32-bit stores (movd, two
 *                     saturated 16-bit results each) go to
 *                       dst      A0+B0      16+dst   A1+B1
 *                       32+dst   A2+B2      48+dst   A3+B3
 *                       64+dst   A3-B3      80+dst   A2-B2
 *                       96+dst   A1-B1     112+dst   A0-B0
 *   shift             immediate count of the psrad applied to every
 *                     sum/difference before packing
 *
 * Coefficient pairs come from the table addressed by operand %2; there is
 * no rounding add in the body.  mm0..mm7 are all clobbered.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
797 bb270c08 Diego Biurrun
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
798
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
799
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
800
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
801
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
802
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
803
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
804
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
805
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
806
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
807
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
808
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
809
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
810
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
811
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
812
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
813
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
814
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
815
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
816
        "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
817
        "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
818
        "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
819
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
820
        "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
821
        "psrad $" #shift ", %%mm1       \n\t"\
822
        "psrad $" #shift ", %%mm4       \n\t"\
823
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
824
        "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
825
        "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
826
        "psrad $" #shift ", %%mm0       \n\t"\
827
        "psrad $" #shift ", %%mm2       \n\t"\
828
        "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
829
        "movd %%mm1, " #dst "           \n\t"\
830
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
831
        "movd %%mm0, 16+" #dst "        \n\t"\
832
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
833
        "movd %%mm2, 96+" #dst "        \n\t"\
834
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
835
        "movd %%mm4, 112+" #dst "       \n\t"\
836
        "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
837
        "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
838
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
839
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
840
        "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
841
        "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
842
        "psrad $" #shift ", %%mm2       \n\t"\
843
        "psrad $" #shift ", %%mm5       \n\t"\
844
        "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
845
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
846
        "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
847
        "psrad $" #shift ", %%mm6       \n\t"\
848
        "psrad $" #shift ", %%mm1       \n\t"\
849
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
850
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
851
        "movd %%mm2, 32+" #dst "        \n\t"\
852
        "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
853
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
854
        "movd %%mm6, 48+" #dst "        \n\t"\
855
        "movd %%mm1, 64+" #dst "        \n\t"\
856
        "movd %%mm5, 80+" #dst "        \n\t"
857 0a8d8945 Michael Niedermayer
858 347be472 John Dalgliesh
/*
 * Second pass for the label-4 path when neither of its Z_COND_IDCT tests
 * branched.  The IDCT variant in effect here ignores its src1 argument, so
 * the 32(%1)..56(%1) operands are passed only to keep the call shape
 * uniform.  Reads from the buffer at operand %1, stores to the buffer at
 * operand %0 at byte offsets 0, 4, 8 and 12, shift 20, then jumps to local
 * label 9.
 */
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
859
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
860
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
861
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
862
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
863 bb270c08 Diego Biurrun
        "jmp 9f                         \n\t"
864 37e8dcda Arpi
865 4454dc1b John Dalgliesh
/*
 * Label 6: target of the zero test on 64(%0)..88(%0) made under label 4.
 * Only the last input group is still tested, branching to 7f when it is
 * all zero.
 */
        "#" ASMALIGN(4)                      \
866 bb270c08 Diego Biurrun
        "6:                             \n\t"
867 0a8d8945 Michael Niedermayer
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
868 37e8dcda Arpi
869 0a8d8945 Michael Niedermayer
#undef IDCT
870 347be472 John Dalgliesh
/*
 * IDCT (variant reading only src0 and src5): second 1-D pass, emitted as
 * inline-asm text.  The signature matches the other IDCT variants, but the
 * src4 and src1 arguments are unused.  The even part therefore reduces to
 * A0 = A3 = C4R4+C4R0 and A1 = A2 = -C4R4+C4R0, and B0..B3 are built from
 * src5 alone.
 * In this file it is expanded on the path under label 6, reached after the
 * zero tests on 32(%0)..56(%0) and 64(%0)..88(%0) both branched.
 *
 *   src0, src5  memory operands, each loaded as one quadword
 *   dst         memory operand; eight 32-bit stores (movd, two saturated
 *               16-bit results each) go to
 *                 dst      A0+B0      16+dst   A1+B1
 *                 32+dst   A2+B2      48+dst   A3+B3
 *                 64+dst   A3-B3      80+dst   A2-B2
 *                 96+dst   A1-B1     112+dst   A0-B0
 *   shift       immediate count of the psrad applied to every
 *               sum/difference before packing
 *
 * Coefficient pairs come from the table addressed by operand %2; there is
 * no rounding add in the body.  mm0..mm7 are all clobbered.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
871 bb270c08 Diego Biurrun
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
872
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
873
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
874
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
875
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
876
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
877
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
878
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
879
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
880
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
881
        "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
882
        "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
883
        "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
884
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
885
        "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
886
        "psrad $" #shift ", %%mm1       \n\t"\
887
        "psrad $" #shift ", %%mm4       \n\t"\
888
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
889
        "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
890
        "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
891
        "psrad $" #shift ", %%mm0       \n\t"\
892
        "psrad $" #shift ", %%mm2       \n\t"\
893
        "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
894
        "movd %%mm1, " #dst "           \n\t"\
895
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
896
        "movd %%mm0, 16+" #dst "        \n\t"\
897
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
898
        "movd %%mm2, 96+" #dst "        \n\t"\
899
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
900
        "movd %%mm4, 112+" #dst "       \n\t"\
901
        "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
902
        "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
903
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
904
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
905
        "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
906
        "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
907
        "psrad $" #shift ", %%mm2       \n\t"\
908
        "psrad $" #shift ", %%mm5       \n\t"\
909
        "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
910
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
911
        "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
912
        "psrad $" #shift ", %%mm6       \n\t"\
913
        "psrad $" #shift ", %%mm1       \n\t"\
914
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
915
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
916
        "movd %%mm2, 32+" #dst "        \n\t"\
917
        "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
918
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
919
        "movd %%mm6, 48+" #dst "        \n\t"\
920
        "movd %%mm1, 64+" #dst "        \n\t"\
921
        "movd %%mm5, 80+" #dst "        \n\t"
922 0a8d8945 Michael Niedermayer
923
924 347be472 John Dalgliesh
/*
 * Second pass for the label-6 path when its Z_COND_IDCT test did not
 * branch.  The IDCT variant in effect here reads only src0 and src5; the
 * src4 and src1 operands are passed only to keep the call shape uniform.
 * Reads from the buffer at operand %1, stores to the buffer at operand %0
 * at byte offsets 0, 4, 8 and 12, shift 20, then jumps to local label 9.
 */
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
925
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
926
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
927
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
928
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
929 bb270c08 Diego Biurrun
        "jmp 9f                         \n\t"
930 37e8dcda Arpi
931 4454dc1b John Dalgliesh
/*
 * Label 2: target of the first-pass zero test on 64(%0)..88(%0) (the
 * 32(%0)..56(%0) test before it did not branch).  Only the last input group
 * is still tested, branching to 3f when it is all zero.
 */
        "#" ASMALIGN(4)                      \
932 bb270c08 Diego Biurrun
        "2:                             \n\t"
933 0a8d8945 Michael Niedermayer
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
934 37e8dcda Arpi
935 0a8d8945 Michael Niedermayer
#undef IDCT
936 347be472 John Dalgliesh
/*
 * IDCT (variant that never reads src4): second 1-D pass, emitted as
 * inline-asm text.  The signature matches the other IDCT variants, but the
 * src4 argument is unused.  The even part therefore reduces to
 * A0 = A3 = C4R4+C4R0 and A1 = A2 = -C4R4+C4R0, while B0..B3 use both src1
 * and src5.
 * In this file it is expanded on the path under label 2, reached after the
 * zero test on 64(%0)..88(%0) branched.
 *
 *   src0, src1, src5  memory operands, each loaded as one quadword (src1
 *                     is loaded twice)
 *   dst               memory operand; eight 32-bit stores (movd, two
 *                     saturated 16-bit results each) go to
 *                       dst      A0+B0      16+dst   A1+B1
 *                       32+dst   A2+B2      48+dst   A3+B3
 *                       64+dst   A3-B3      80+dst   A2-B2
 *                       96+dst   A1-B1     112+dst   A0-B0
 *   shift             immediate count of the psrad applied to every
 *                     sum/difference before packing
 *
 * Coefficient pairs come from the table addressed by operand %2; there is
 * no rounding add in the body.  mm0..mm7 are all clobbered.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
937 bb270c08 Diego Biurrun
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
938
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
939
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
940
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
941
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
942
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
943
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
944
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
945
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
946
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
947
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
948
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
949
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
950
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
951
        "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
952
        "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
953
        "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
954
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
955
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
956
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
957
        "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
958
        "psrad $" #shift ", %%mm7       \n\t"\
959
        "psrad $" #shift ", %%mm4       \n\t"\
960
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
961
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
962
        "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
963
        "psrad $" #shift ", %%mm0       \n\t"\
964
        "psrad $" #shift ", %%mm2       \n\t"\
965
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
966
        "movd %%mm7, " #dst "           \n\t"\
967
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
968
        "movd %%mm0, 16+" #dst "        \n\t"\
969
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
970
        "movd %%mm2, 96+" #dst "        \n\t"\
971
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
972
        "movd %%mm4, 112+" #dst "       \n\t"\
973
        "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
974
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
975
        "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
976
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
977
        "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
978
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
979
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
980
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
981
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
982
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
983
        "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
984
        "psrad $" #shift ", %%mm2       \n\t"\
985
        "psrad $" #shift ", %%mm5       \n\t"\
986
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
987
        "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
988
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
989
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
990
        "psrad $" #shift ", %%mm6       \n\t"\
991
        "psrad $" #shift ", %%mm4       \n\t"\
992
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
993
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
994
        "movd %%mm2, 32+" #dst "        \n\t"\
995
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
996
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
997
        "movd %%mm6, 48+" #dst "        \n\t"\
998
        "movd %%mm4, 64+" #dst "        \n\t"\
999
        "movd %%mm5, 80+" #dst "        \n\t"
1000 0a8d8945 Michael Niedermayer
1001 347be472 John Dalgliesh
/*
 * Second pass for the label-2 path when its Z_COND_IDCT test did not
 * branch.  The IDCT variant in effect here ignores its src4 argument, so
 * the 64(%1)..88(%1) operands are passed only to keep the call shape
 * uniform.  Reads from the buffer at operand %1, stores to the buffer at
 * operand %0 at byte offsets 0, 4, 8 and 12, shift 20, then jumps to local
 * label 9.
 */
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1002
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1003
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1004
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1005
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1006 bb270c08 Diego Biurrun
        "jmp 9f                         \n\t"
1007 37e8dcda Arpi
1008 4454dc1b John Dalgliesh
/*
 * Label 3: target of the zero test on 96(%0)..120(%0) made under label 2.
 */
        "#" ASMALIGN(4)                      \
1009 bb270c08 Diego Biurrun
        "3:                             \n\t"
1010 0a8d8945 Michael Niedermayer
#undef IDCT
1011 347be472 John Dalgliesh
#define IDCT(src0, src4, src1, src5, dst, shift) \
1012 bb270c08 Diego Biurrun
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1013
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
1014
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1015
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1016
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1017
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1018
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1019
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
1020
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1021
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1022
        "movq 64(%2), %%mm3             \n\t"\
1023
        "pmaddwd %%mm2, %%mm3           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1024
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
1025
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
1026
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
1027
        "psrad $" #shift ", %%mm7       \n\t"\
1028
        "psrad $" #shift ", %%mm4       \n\t"\
1029
        "movq %%mm0, %%mm1              \n\t" /* A1             a1 */\
1030
        "paddd %%mm3, %%mm0             \n\t" /* A1+B1          a1+b1 */\
1031
        "psubd %%mm3, %%mm1             \n\t" /* A1-B1          a1-b1 */\
1032
        "psrad $" #shift ", %%mm0       \n\t"\
1033
        "psrad $" #shift ", %%mm1       \n\t"\
1034
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
1035
        "movd %%mm7, " #dst "           \n\t"\
1036
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
1037
        "movd %%mm0, 16+" #dst "        \n\t"\
1038
        "packssdw %%mm1, %%mm1          \n\t" /* A1-B1  a1-b1 */\
1039
        "movd %%mm1, 96+" #dst "        \n\t"\
1040
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
1041
        "movd %%mm4, 112+" #dst "       \n\t"\
1042
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
1043
        "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1044
        "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1045
        "movq %%mm5, %%mm1              \n\t" /* A2             a2 */\
1046
        "paddd %%mm4, %%mm1             \n\t" /* A2+B2          a2+b2 */\
1047
        "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
1048
        "psrad $" #shift ", %%mm1       \n\t"\
1049
        "psrad $" #shift ", %%mm5       \n\t"\
1050
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
1051
        "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
1052
        "psubd %%mm2, %%mm4             \n\t" /* a3-B3          a3-b3 */\
1053
        "psrad $" #shift ", %%mm6       \n\t"\
1054
        "psrad $" #shift ", %%mm4       \n\t"\
1055
        "packssdw %%mm1, %%mm1          \n\t" /* A2+B2  a2+b2 */\
1056
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1057
        "movd %%mm1, 32+" #dst "        \n\t"\
1058
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
1059
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1060
        "movd %%mm6, 48+" #dst "        \n\t"\
1061
        "movd %%mm4, 64+" #dst "        \n\t"\
1062
        "movd %%mm5, 80+" #dst "        \n\t"
1063 0a8d8945 Michael Niedermayer
1064
1065 347be472 John Dalgliesh
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1066
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1067
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1068
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1069
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1070 bb270c08 Diego Biurrun
        "jmp 9f                         \n\t"
1071 37e8dcda Arpi
1072 4454dc1b John Dalgliesh
        "#" ASMALIGN(4)                      \
1073 bb270c08 Diego Biurrun
        "5:                             \n\t"
1074 0a8d8945 Michael Niedermayer
#undef IDCT
1075 347be472 John Dalgliesh
#define IDCT(src0, src4, src1, src5, dst, shift) \
1076 bb270c08 Diego Biurrun
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1077
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
1078
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1079
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1080
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1081
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1082
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
1083
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1084
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
1085
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1086
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1087
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
1088
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
1089
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1090
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
1091
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
1092
        "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
1093
        "movq 8+" #src4 ", %%mm3        \n\t" /* R6     R2      r6      r2 */\
1094
        "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
1095
        "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1096
        "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
1097
        "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1098
        "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 */\
1099
        "pmaddwd %%mm3, %%mm7           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1100
        "pmaddwd 40(%2), %%mm3          \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1101
        "paddd %%mm1, %%mm7             \n\t" /* A0             a0 */\
1102
        "paddd %%mm1, %%mm1             \n\t" /* 2C0            2c0 */\
1103
        "psubd %%mm7, %%mm1             \n\t" /* A3             a3 */\
1104
        "paddd %%mm2, %%mm3             \n\t" /* A1             a1 */\
1105
        "paddd %%mm2, %%mm2             \n\t" /* 2C1            2c1 */\
1106
        "psubd %%mm3, %%mm2             \n\t" /* A2             a2 */\
1107
        "psrad $" #shift ", %%mm4       \n\t"\
1108
        "psrad $" #shift ", %%mm7       \n\t"\
1109
        "psrad $" #shift ", %%mm3       \n\t"\
1110
        "packssdw %%mm7, %%mm4          \n\t" /* A0     a0 */\
1111
        "movq %%mm4, " #dst "           \n\t"\
1112
        "psrad $" #shift ", %%mm0       \n\t"\
1113
        "packssdw %%mm3, %%mm0          \n\t" /* A1     a1 */\
1114
        "movq %%mm0, 16+" #dst "        \n\t"\
1115
        "movq %%mm0, 96+" #dst "        \n\t"\
1116
        "movq %%mm4, 112+" #dst "       \n\t"\
1117
        "psrad $" #shift ", %%mm5       \n\t"\
1118
        "psrad $" #shift ", %%mm6       \n\t"\
1119
        "psrad $" #shift ", %%mm2       \n\t"\
1120
        "packssdw %%mm2, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1121
        "movq %%mm5, 32+" #dst "        \n\t"\
1122
        "psrad $" #shift ", %%mm1       \n\t"\
1123
        "packssdw %%mm1, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1124
        "movq %%mm6, 48+" #dst "        \n\t"\
1125
        "movq %%mm6, 64+" #dst "        \n\t"\
1126
        "movq %%mm5, 80+" #dst "        \n\t"
1127 115329f1 Diego Biurrun
1128 0a8d8945 Michael Niedermayer
1129 347be472 John Dalgliesh
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1130
IDCT(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1131
//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1132
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1133
//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1134 bb270c08 Diego Biurrun
        "jmp 9f                         \n\t"
1135 37e8dcda Arpi
1136
1137 4454dc1b John Dalgliesh
        "#" ASMALIGN(4)                      \
1138 bb270c08 Diego Biurrun
        "1:                             \n\t"
1139 0a8d8945 Michael Niedermayer
#undef IDCT
1140 347be472 John Dalgliesh
#define IDCT(src0, src4, src1, src5, dst, shift) \
1141 bb270c08 Diego Biurrun
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1142
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
1143
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
1144
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1145
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1146
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1147
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1148
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
1149
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1150
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
1151
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1152
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1153
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
1154
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1155
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
1156
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
1157
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1158
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
1159
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
1160
        "movq 64(%2), %%mm1             \n\t"\
1161
        "pmaddwd %%mm2, %%mm1           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1162
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
1163
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
1164
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
1165
        "psrad $" #shift ", %%mm7       \n\t"\
1166
        "psrad $" #shift ", %%mm4       \n\t"\
1167
        "movq %%mm0, %%mm3              \n\t" /* A1             a1 */\
1168
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
1169
        "psubd %%mm1, %%mm3             \n\t" /* A1-B1          a1-b1 */\
1170
        "psrad $" #shift ", %%mm0       \n\t"\
1171
        "psrad $" #shift ", %%mm3       \n\t"\
1172
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
1173
        "movd %%mm7, " #dst "           \n\t"\
1174
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
1175
        "movd %%mm0, 16+" #dst "        \n\t"\
1176
        "packssdw %%mm3, %%mm3          \n\t" /* A1-B1  a1-b1 */\
1177
        "movd %%mm3, 96+" #dst "        \n\t"\
1178
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
1179
        "movd %%mm4, 112+" #dst "       \n\t"\
1180
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
1181
        "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1182
        "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1183
        "movq %%mm5, %%mm3              \n\t" /* A2             a2 */\
1184
        "paddd %%mm4, %%mm3             \n\t" /* A2+B2          a2+b2 */\
1185
        "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
1186
        "psrad $" #shift ", %%mm3       \n\t"\
1187
        "psrad $" #shift ", %%mm5       \n\t"\
1188
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
1189
        "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
1190
        "psubd %%mm2, %%mm4             \n\t" /* a3-B3          a3-b3 */\
1191
        "psrad $" #shift ", %%mm6       \n\t"\
1192
        "packssdw %%mm3, %%mm3          \n\t" /* A2+B2  a2+b2 */\
1193
        "movd %%mm3, 32+" #dst "        \n\t"\
1194
        "psrad $" #shift ", %%mm4       \n\t"\
1195
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1196
        "movd %%mm6, 48+" #dst "        \n\t"\
1197
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
1198
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1199
        "movd %%mm4, 64+" #dst "        \n\t"\
1200
        "movd %%mm5, 80+" #dst "        \n\t"
1201 115329f1 Diego Biurrun
1202 0a8d8945 Michael Niedermayer
1203 347be472 John Dalgliesh
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1204
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1205
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1206
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1207
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1208 bb270c08 Diego Biurrun
        "jmp 9f                         \n\t"
1209 37e8dcda Arpi
1210
1211 4454dc1b John Dalgliesh
        "#" ASMALIGN(4)
1212 bb270c08 Diego Biurrun
        "7:                             \n\t"
1213 0a8d8945 Michael Niedermayer
#undef IDCT
1214 347be472 John Dalgliesh
#define IDCT(src0, src4, src1, src5, dst, shift) \
1215 bb270c08 Diego Biurrun
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1216
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1217
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1218
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1219
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1220
        "psrad $" #shift ", %%mm4       \n\t"\
1221
        "psrad $" #shift ", %%mm0       \n\t"\
1222
        "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
1223
        "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
1224
        "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1225
        "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
1226
        "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1227
        "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 */\
1228
        "psrad $" #shift ", %%mm1       \n\t"\
1229
        "packssdw %%mm1, %%mm4          \n\t" /* A0     a0 */\
1230
        "movq %%mm4, " #dst "           \n\t"\
1231
        "psrad $" #shift ", %%mm2       \n\t"\
1232
        "packssdw %%mm2, %%mm0          \n\t" /* A1     a1 */\
1233
        "movq %%mm0, 16+" #dst "        \n\t"\
1234
        "movq %%mm0, 96+" #dst "        \n\t"\
1235
        "movq %%mm4, 112+" #dst "       \n\t"\
1236
        "movq %%mm0, 32+" #dst "        \n\t"\
1237
        "movq %%mm4, 48+" #dst "        \n\t"\
1238
        "movq %%mm4, 64+" #dst "        \n\t"\
1239
        "movq %%mm0, 80+" #dst "        \n\t"
1240 0a8d8945 Michael Niedermayer
1241 347be472 John Dalgliesh
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1242
IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1243
//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1244
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1245
//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1246 37e8dcda Arpi
1247
1248
#endif
1249
1250
/*
1251
Input
1252 0a8d8945 Michael Niedermayer
 00 40 04 44 20 60 24 64
1253
 10 30 14 34 50 70 54 74
1254
 01 41 03 43 21 61 23 63
1255 37e8dcda Arpi
 11 31 13 33 51 71 53 73
1256 0a8d8945 Michael Niedermayer
 02 42 06 46 22 62 26 66
1257
 12 32 16 36 52 72 56 76
1258
 05 45 07 47 25 65 27 67
1259
 15 35 17 37 55 75 57 77
1260 115329f1 Diego Biurrun

1261 37e8dcda Arpi
Temp
1262 0a8d8945 Michael Niedermayer
 00 04 10 14 20 24 30 34
1263
 40 44 50 54 60 64 70 74
1264 37e8dcda Arpi
 01 03 11 13 21 23 31 33
1265
 41 43 51 53 61 63 71 73
1266 0a8d8945 Michael Niedermayer
 02 06 12 16 22 26 32 36
1267
 42 46 52 56 62 66 72 76
1268 37e8dcda Arpi
 05 07 15 17 25 27 35 37
1269
 45 47 55 57 65 67 75 77
1270
*/
1271
1272
"9: \n\t"
1273 bb270c08 Diego Biurrun
                :: "r" (block), "r" (temp), "r" (coeffs)
1274
                : "%eax"
1275
        );
1276 37e8dcda Arpi
}
1277
1278 2ad1516a Michael Niedermayer
/**
 * Public entry point for the MMX simple IDCT.
 *
 * Simply forwards to idct(), defined earlier in this file. That routine's
 * asm writes its final output through the block pointer, so the
 * coefficients are transformed in place.
 *
 * @param block coefficients to transform; overwritten with the result
 *              (presumably one 8x8 block -- confirm against idct())
 */
void ff_simple_idct_mmx(int16_t *block)
{
    /* In-place transform; nothing else to do for the plain variant. */
    idct(block);
}
1282
1283
//FIXME merge add/put into the idct
1284
1285 0c1a9eda Zdenek Kabelac
/**
 * Inverse-transform a coefficient block, then hand the result to
 * put_pixels_clamped_mmx().
 *
 * @param dest      destination buffer, passed through to
 *                  put_pixels_clamped_mmx()
 * @param line_size passed through unchanged; presumably the byte stride
 *                  between rows of dest (TODO confirm against dsputil)
 * @param block     coefficients; overwritten in place by idct()
 */
void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    /* Step 1: in-place IDCT of the coefficients. */
    idct(block);

    /* Step 2: store the transformed samples to the destination. */
    put_pixels_clamped_mmx(block, dest, line_size);
}
1290 0c1a9eda Zdenek Kabelac
/**
 * Inverse-transform a coefficient block, then hand the result to
 * add_pixels_clamped_mmx().
 *
 * @param dest      destination buffer, passed through to
 *                  add_pixels_clamped_mmx(); presumably the transformed
 *                  samples are added onto its existing contents
 *                  (TODO confirm against dsputil)
 * @param line_size passed through unchanged; presumably the byte stride
 *                  between rows of dest (TODO confirm against dsputil)
 * @param block     coefficients; overwritten in place by idct()
 */
void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    /* Step 1: in-place IDCT of the coefficients. */
    idct(block);

    /* Step 2: combine the transformed samples with the destination. */
    add_pixels_clamped_mmx(block, dest, line_size);
}