ffmpeg / libavcodec / i386 / simple_idct_mmx.c @ b550bfaa
History | View | Annotate | Download (71.1 KB)
1 |
/*
|
---|---|
2 |
* Simple IDCT MMX
|
3 |
*
|
4 |
* Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
|
5 |
*
|
6 |
* This file is part of FFmpeg.
|
7 |
*
|
8 |
* FFmpeg is free software; you can redistribute it and/or
|
9 |
* modify it under the terms of the GNU Lesser General Public
|
10 |
* License as published by the Free Software Foundation; either
|
11 |
* version 2.1 of the License, or (at your option) any later version.
|
12 |
*
|
13 |
* FFmpeg is distributed in the hope that it will be useful,
|
14 |
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
15 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
16 |
* Lesser General Public License for more details.
|
17 |
*
|
18 |
* You should have received a copy of the GNU Lesser General Public
|
19 |
* License along with FFmpeg; if not, write to the Free Software
|
20 |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
21 |
*/
|
22 |
#include "dsputil.h" |
23 |
#include "simple_idct.h" |
24 |
|
25 |
/*
|
26 |
23170.475006
|
27 |
22725.260826
|
28 |
21406.727617
|
29 |
19265.545870
|
30 |
16384.000000
|
31 |
12872.826198
|
32 |
8866.956905
|
33 |
4520.335430
|
34 |
*/
|
35 |
/*
 * 14-bit fixed-point cosine constants:
 *     Ci = cos(i*M_PI/16) * sqrt(2) * (1<<14)
 * All are rounded to nearest ("+ 0.5") except C4, whose exact value is
 * 16384 and which is deliberately taken one lower ("- 0.5"); the 16384
 * variant is kept below but disabled.
 */
#define C0 23170 //cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C1 22725 //cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C2 21407 //cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C3 19266 //cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#if 0
#define C4 16384 //cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#else
#define C4 16383 //cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5
#endif
#define C5 12873 //cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C6 8867 //cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C7 4520 //cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5

/*
 * Right-shift amounts applied after the row pass and the column pass.
 * ROW_SHIFT also sizes the rounding constants stored at the start of
 * coeffs[]. The inline-asm macro invocations in idct() pass these
 * shifts as literal numbers, so they must be kept in sync by hand.
 */
#define ROW_SHIFT 11
#define COL_SHIFT 20 // 6
50 |
|
51 |
static const uint64_t attribute_used __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL; |
52 |
static const uint64_t attribute_used __attribute__((aligned(8))) d40000= 0x0000000000040000ULL; |
53 |
|
54 |
static const int16_t __attribute__((aligned(8))) coeffs[]= { |
55 |
1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, |
56 |
// 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
|
57 |
// 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
|
58 |
1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, |
59 |
// the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
|
60 |
// 0, 0, 0, 0,
|
61 |
// 0, 0, 0, 0,
|
62 |
|
63 |
C4, C4, C4, C4, |
64 |
C4, -C4, C4, -C4, |
65 |
|
66 |
C2, C6, C2, C6, |
67 |
C6, -C2, C6, -C2, |
68 |
|
69 |
C1, C3, C1, C3, |
70 |
C5, C7, C5, C7, |
71 |
|
72 |
C3, -C7, C3, -C7, |
73 |
-C1, -C5, -C1, -C5, |
74 |
|
75 |
C5, -C1, C5, -C1, |
76 |
C7, C3, C7, C3, |
77 |
|
78 |
C7, -C5, C7, -C5, |
79 |
C3, -C1, C3, -C1 |
80 |
}; |
81 |
|
82 |
#if 0
|
83 |
static void unused_var_killer(){
|
84 |
int a= wm1010 + d40000;
|
85 |
temp[0]=a;
|
86 |
}
|
87 |
|
88 |
static void inline idctCol (int16_t * col, int16_t *input)
|
89 |
{
|
90 |
#undef C0
|
91 |
#undef C1
|
92 |
#undef C2
|
93 |
#undef C3
|
94 |
#undef C4
|
95 |
#undef C5
|
96 |
#undef C6
|
97 |
#undef C7
|
98 |
int a0, a1, a2, a3, b0, b1, b2, b3;
|
99 |
const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
100 |
const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
101 |
const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
102 |
const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
103 |
const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
104 |
const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
105 |
const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
106 |
const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
107 |
/*
|
108 |
if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
|
109 |
col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
|
110 |
col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
|
111 |
return;
|
112 |
}*/
|
113 |
|
114 |
col[8*0] = input[8*0 + 0];
|
115 |
col[8*1] = input[8*2 + 0];
|
116 |
col[8*2] = input[8*0 + 1];
|
117 |
col[8*3] = input[8*2 + 1];
|
118 |
col[8*4] = input[8*4 + 0];
|
119 |
col[8*5] = input[8*6 + 0];
|
120 |
col[8*6] = input[8*4 + 1];
|
121 |
col[8*7] = input[8*6 + 1];
|
122 |
|
123 |
a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
|
124 |
a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
|
125 |
a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
|
126 |
a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
|
127 |
|
128 |
b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
|
129 |
b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
|
130 |
b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
|
131 |
b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
|
132 |
|
133 |
col[8*0] = (a0 + b0) >> COL_SHIFT;
|
134 |
col[8*1] = (a1 + b1) >> COL_SHIFT;
|
135 |
col[8*2] = (a2 + b2) >> COL_SHIFT;
|
136 |
col[8*3] = (a3 + b3) >> COL_SHIFT;
|
137 |
col[8*4] = (a3 - b3) >> COL_SHIFT;
|
138 |
col[8*5] = (a2 - b2) >> COL_SHIFT;
|
139 |
col[8*6] = (a1 - b1) >> COL_SHIFT;
|
140 |
col[8*7] = (a0 - b0) >> COL_SHIFT;
|
141 |
}
|
142 |
|
143 |
static void inline idctRow (int16_t * output, int16_t * input)
|
144 |
{
|
145 |
int16_t row[8];
|
146 |
|
147 |
int a0, a1, a2, a3, b0, b1, b2, b3;
|
148 |
const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
149 |
const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
150 |
const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
151 |
const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
152 |
const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
153 |
const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
154 |
const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
155 |
const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
156 |
|
157 |
row[0] = input[0];
|
158 |
row[2] = input[1];
|
159 |
row[4] = input[4];
|
160 |
row[6] = input[5];
|
161 |
row[1] = input[8];
|
162 |
row[3] = input[9];
|
163 |
row[5] = input[12];
|
164 |
row[7] = input[13];
|
165 |
|
166 |
if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
|
167 |
row[0] = row[1] = row[2] = row[3] = row[4] =
|
168 |
row[5] = row[6] = row[7] = row[0]<<3;
|
169 |
output[0] = row[0];
|
170 |
output[2] = row[1];
|
171 |
output[4] = row[2];
|
172 |
output[6] = row[3];
|
173 |
output[8] = row[4];
|
174 |
output[10] = row[5];
|
175 |
output[12] = row[6];
|
176 |
output[14] = row[7];
|
177 |
return;
|
178 |
}
|
179 |
|
180 |
a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
|
181 |
a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
|
182 |
a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
|
183 |
a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
|
184 |
|
185 |
b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
|
186 |
b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
|
187 |
b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
|
188 |
b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
|
189 |
|
190 |
row[0] = (a0 + b0) >> ROW_SHIFT;
|
191 |
row[1] = (a1 + b1) >> ROW_SHIFT;
|
192 |
row[2] = (a2 + b2) >> ROW_SHIFT;
|
193 |
row[3] = (a3 + b3) >> ROW_SHIFT;
|
194 |
row[4] = (a3 - b3) >> ROW_SHIFT;
|
195 |
row[5] = (a2 - b2) >> ROW_SHIFT;
|
196 |
row[6] = (a1 - b1) >> ROW_SHIFT;
|
197 |
row[7] = (a0 - b0) >> ROW_SHIFT;
|
198 |
|
199 |
output[0] = row[0];
|
200 |
output[2] = row[1];
|
201 |
output[4] = row[2];
|
202 |
output[6] = row[3];
|
203 |
output[8] = row[4];
|
204 |
output[10] = row[5];
|
205 |
output[12] = row[6];
|
206 |
output[14] = row[7];
|
207 |
}
|
208 |
#endif
|
209 |
|
210 |
static inline void idct(int16_t *block) |
211 |
{ |
212 |
int64_t __attribute__((aligned(8))) align_tmp[16]; |
213 |
int16_t * const temp= (int16_t*)align_tmp;
|
214 |
|
215 |
asm volatile( |
216 |
#if 0 //Alternative, simpler variant
|
217 |
|
218 |
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
219 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
|
220 |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
|
221 |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
|
222 |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
|
223 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
|
224 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
225 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
|
226 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
227 |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
|
228 |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
|
229 |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
|
230 |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
|
231 |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
|
232 |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
|
233 |
#rounder ", %%mm4 \n\t"\
|
234 |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
235 |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
|
236 |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
|
237 |
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
|
238 |
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
|
239 |
#rounder ", %%mm0 \n\t"\
|
240 |
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
|
241 |
"paddd %%mm0, %%mm0 \n\t" \
|
242 |
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
|
243 |
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
|
244 |
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
|
245 |
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
|
246 |
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
|
247 |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
|
248 |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
|
249 |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
250 |
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
|
251 |
"psrad $" #shift ", %%mm7 \n\t"\
|
252 |
"psrad $" #shift ", %%mm4 \n\t"\
|
253 |
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
|
254 |
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
|
255 |
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
|
256 |
"psrad $" #shift ", %%mm1 \n\t"\
|
257 |
"psrad $" #shift ", %%mm2 \n\t"\
|
258 |
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
|
259 |
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
|
260 |
"movq %%mm7, " #dst " \n\t"\
|
261 |
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
|
262 |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
|
263 |
"movq %%mm2, 24+" #dst " \n\t"\
|
264 |
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
|
265 |
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
|
266 |
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
|
267 |
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
|
268 |
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
|
269 |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
|
270 |
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
|
271 |
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
|
272 |
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
|
273 |
"psrad $" #shift ", %%mm2 \n\t"\
|
274 |
"psrad $" #shift ", %%mm0 \n\t"\
|
275 |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
|
276 |
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
|
277 |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
278 |
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
|
279 |
"psrad $" #shift ", %%mm6 \n\t"\
|
280 |
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
|
281 |
"movq %%mm2, 8+" #dst " \n\t"\
|
282 |
"psrad $" #shift ", %%mm4 \n\t"\
|
283 |
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
|
284 |
"movq %%mm4, 16+" #dst " \n\t"\
|
285 |
|
286 |
#define COL_IDCT(src0, src4, src1, src5, dst, shift) \
|
287 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
|
288 |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
|
289 |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
|
290 |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
|
291 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
|
292 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
293 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
|
294 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
295 |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
|
296 |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
|
297 |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
|
298 |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
|
299 |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
300 |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
|
301 |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
|
302 |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
|
303 |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
|
304 |
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
305 |
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
|
306 |
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
|
307 |
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
|
308 |
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
|
309 |
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
|
310 |
"paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
|
311 |
"movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
|
312 |
"pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
|
313 |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
|
314 |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
|
315 |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
316 |
"paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
|
317 |
"psrad $" #shift ", %%mm7 \n\t"\
|
318 |
"psrad $" #shift ", %%mm4 \n\t"\
|
319 |
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
|
320 |
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
|
321 |
"psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
|
322 |
"psrad $" #shift ", %%mm0 \n\t"\
|
323 |
"psrad $" #shift ", %%mm2 \n\t"\
|
324 |
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
|
325 |
"movd %%mm7, " #dst " \n\t"\
|
326 |
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
|
327 |
"movd %%mm0, 16+" #dst " \n\t"\
|
328 |
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
|
329 |
"movd %%mm2, 96+" #dst " \n\t"\
|
330 |
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
331 |
"movd %%mm4, 112+" #dst " \n\t"\
|
332 |
"movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
|
333 |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
|
334 |
"pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
|
335 |
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
|
336 |
"pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
|
337 |
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
|
338 |
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
|
339 |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
|
340 |
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
|
341 |
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
|
342 |
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
|
343 |
"psrad $" #shift ", %%mm2 \n\t"\
|
344 |
"psrad $" #shift ", %%mm5 \n\t"\
|
345 |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
|
346 |
"paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
|
347 |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
348 |
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
|
349 |
"psrad $" #shift ", %%mm6 \n\t"\
|
350 |
"psrad $" #shift ", %%mm4 \n\t"\
|
351 |
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
|
352 |
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
353 |
"movd %%mm2, 32+" #dst " \n\t"\
|
354 |
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
|
355 |
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
|
356 |
"movd %%mm6, 48+" #dst " \n\t"\
|
357 |
"movd %%mm4, 64+" #dst " \n\t"\
|
358 |
"movd %%mm5, 80+" #dst " \n\t"\
|
359 |
|
360 |
|
361 |
#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
362 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
|
363 |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
|
364 |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
|
365 |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
|
366 |
"movq "MANGLE(wm1010)", %%mm4 \n\t"\
|
367 |
"pand %%mm0, %%mm4 \n\t"\
|
368 |
"por %%mm1, %%mm4 \n\t"\
|
369 |
"por %%mm2, %%mm4 \n\t"\
|
370 |
"por %%mm3, %%mm4 \n\t"\
|
371 |
"packssdw %%mm4,%%mm4 \n\t"\
|
372 |
"movd %%mm4, %%eax \n\t"\
|
373 |
"orl %%eax, %%eax \n\t"\
|
374 |
"jz 1f \n\t"\
|
375 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
|
376 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
377 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
|
378 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
379 |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
|
380 |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
|
381 |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
|
382 |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
|
383 |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
|
384 |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
|
385 |
#rounder ", %%mm4 \n\t"\
|
386 |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
387 |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
|
388 |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
|
389 |
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
|
390 |
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
|
391 |
#rounder ", %%mm0 \n\t"\
|
392 |
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
|
393 |
"paddd %%mm0, %%mm0 \n\t" \
|
394 |
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
|
395 |
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
|
396 |
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
|
397 |
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
|
398 |
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
|
399 |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
|
400 |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
|
401 |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
402 |
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
|
403 |
"psrad $" #shift ", %%mm7 \n\t"\
|
404 |
"psrad $" #shift ", %%mm4 \n\t"\
|
405 |
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
|
406 |
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
|
407 |
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
|
408 |
"psrad $" #shift ", %%mm1 \n\t"\
|
409 |
"psrad $" #shift ", %%mm2 \n\t"\
|
410 |
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
|
411 |
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
|
412 |
"movq %%mm7, " #dst " \n\t"\
|
413 |
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
|
414 |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
|
415 |
"movq %%mm2, 24+" #dst " \n\t"\
|
416 |
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
|
417 |
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
|
418 |
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
|
419 |
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
|
420 |
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
|
421 |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
|
422 |
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
|
423 |
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
|
424 |
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
|
425 |
"psrad $" #shift ", %%mm2 \n\t"\
|
426 |
"psrad $" #shift ", %%mm0 \n\t"\
|
427 |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
|
428 |
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
|
429 |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
430 |
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
|
431 |
"psrad $" #shift ", %%mm6 \n\t"\
|
432 |
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
|
433 |
"movq %%mm2, 8+" #dst " \n\t"\
|
434 |
"psrad $" #shift ", %%mm4 \n\t"\
|
435 |
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
|
436 |
"movq %%mm4, 16+" #dst " \n\t"\
|
437 |
"jmp 2f \n\t"\
|
438 |
"1: \n\t"\
|
439 |
"pslld $16, %%mm0 \n\t"\
|
440 |
"#paddd "MANGLE(d40000)", %%mm0 \n\t"\
|
441 |
"psrad $13, %%mm0 \n\t"\
|
442 |
"packssdw %%mm0, %%mm0 \n\t"\
|
443 |
"movq %%mm0, " #dst " \n\t"\
|
444 |
"movq %%mm0, 8+" #dst " \n\t"\
|
445 |
"movq %%mm0, 16+" #dst " \n\t"\
|
446 |
"movq %%mm0, 24+" #dst " \n\t"\
|
447 |
"2: \n\t"
|
448 |
|
449 |
|
450 |
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
|
451 |
ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
|
452 |
/*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
|
453 |
ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
|
454 |
ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
|
455 |
|
456 |
DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
|
457 |
DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
|
458 |
DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
|
459 |
|
460 |
|
461 |
//IDCT( src0, src4, src1, src5, dst, shift)
|
462 |
COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
|
463 |
COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
|
464 |
COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
|
465 |
COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
|
466 |
|
467 |
#else
|
468 |
|
469 |
#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
470 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
471 |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
472 |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
473 |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
474 |
"movq "MANGLE(wm1010)", %%mm4 \n\t"\ |
475 |
"pand %%mm0, %%mm4 \n\t"\
|
476 |
"por %%mm1, %%mm4 \n\t"\
|
477 |
"por %%mm2, %%mm4 \n\t"\
|
478 |
"por %%mm3, %%mm4 \n\t"\
|
479 |
"packssdw %%mm4,%%mm4 \n\t"\
|
480 |
"movd %%mm4, %%eax \n\t"\
|
481 |
"orl %%eax, %%eax \n\t"\
|
482 |
"jz 1f \n\t"\
|
483 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
484 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
485 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
486 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
487 |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
488 |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
489 |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
490 |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
491 |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
492 |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
493 |
#rounder ", %%mm4 \n\t"\ |
494 |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
495 |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
496 |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
497 |
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ |
498 |
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
499 |
#rounder ", %%mm0 \n\t"\ |
500 |
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ |
501 |
"paddd %%mm0, %%mm0 \n\t" \
|
502 |
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ |
503 |
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
504 |
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ |
505 |
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ |
506 |
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
507 |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
508 |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
509 |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
510 |
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ |
511 |
"psrad $" #shift ", %%mm7 \n\t"\ |
512 |
"psrad $" #shift ", %%mm4 \n\t"\ |
513 |
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ |
514 |
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ |
515 |
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
516 |
"psrad $" #shift ", %%mm1 \n\t"\ |
517 |
"psrad $" #shift ", %%mm2 \n\t"\ |
518 |
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ |
519 |
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ |
520 |
"movq %%mm7, " #dst " \n\t"\ |
521 |
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ |
522 |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
523 |
"movq %%mm2, 24+" #dst " \n\t"\ |
524 |
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
525 |
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ |
526 |
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
527 |
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
528 |
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ |
529 |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
530 |
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ |
531 |
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
532 |
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ |
533 |
"psrad $" #shift ", %%mm2 \n\t"\ |
534 |
"psrad $" #shift ", %%mm0 \n\t"\ |
535 |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
536 |
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ |
537 |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
538 |
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
539 |
"psrad $" #shift ", %%mm6 \n\t"\ |
540 |
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ |
541 |
"movq %%mm2, 8+" #dst " \n\t"\ |
542 |
"psrad $" #shift ", %%mm4 \n\t"\ |
543 |
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ |
544 |
"movq %%mm4, 16+" #dst " \n\t"\ |
545 |
"jmp 2f \n\t"\
|
546 |
"1: \n\t"\
|
547 |
"pslld $16, %%mm0 \n\t"\
|
548 |
"paddd "MANGLE(d40000)", %%mm0 \n\t"\ |
549 |
"psrad $13, %%mm0 \n\t"\
|
550 |
"packssdw %%mm0, %%mm0 \n\t"\
|
551 |
"movq %%mm0, " #dst " \n\t"\ |
552 |
"movq %%mm0, 8+" #dst " \n\t"\ |
553 |
"movq %%mm0, 16+" #dst " \n\t"\ |
554 |
"movq %%mm0, 24+" #dst " \n\t"\ |
555 |
"2: \n\t"
|
556 |
|
557 |
#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
|
558 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
559 |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
560 |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
561 |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
562 |
"movq %%mm0, %%mm4 \n\t"\
|
563 |
"por %%mm1, %%mm4 \n\t"\
|
564 |
"por %%mm2, %%mm4 \n\t"\
|
565 |
"por %%mm3, %%mm4 \n\t"\
|
566 |
"packssdw %%mm4,%%mm4 \n\t"\
|
567 |
"movd %%mm4, %%eax \n\t"\
|
568 |
"orl %%eax, %%eax \n\t"\
|
569 |
"jz " #bt " \n\t"\ |
570 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
571 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
572 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
573 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
574 |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
575 |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
576 |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
577 |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
578 |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
579 |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
580 |
#rounder ", %%mm4 \n\t"\ |
581 |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
582 |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
583 |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
584 |
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ |
585 |
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
586 |
#rounder ", %%mm0 \n\t"\ |
587 |
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ |
588 |
"paddd %%mm0, %%mm0 \n\t" \
|
589 |
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ |
590 |
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
591 |
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ |
592 |
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ |
593 |
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
594 |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
595 |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
596 |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
597 |
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ |
598 |
"psrad $" #shift ", %%mm7 \n\t"\ |
599 |
"psrad $" #shift ", %%mm4 \n\t"\ |
600 |
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ |
601 |
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ |
602 |
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
603 |
"psrad $" #shift ", %%mm1 \n\t"\ |
604 |
"psrad $" #shift ", %%mm2 \n\t"\ |
605 |
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ |
606 |
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ |
607 |
"movq %%mm7, " #dst " \n\t"\ |
608 |
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ |
609 |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
610 |
"movq %%mm2, 24+" #dst " \n\t"\ |
611 |
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
612 |
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ |
613 |
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
614 |
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
615 |
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ |
616 |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
617 |
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ |
618 |
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
619 |
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ |
620 |
"psrad $" #shift ", %%mm2 \n\t"\ |
621 |
"psrad $" #shift ", %%mm0 \n\t"\ |
622 |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
623 |
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ |
624 |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
625 |
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
626 |
"psrad $" #shift ", %%mm6 \n\t"\ |
627 |
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ |
628 |
"movq %%mm2, 8+" #dst " \n\t"\ |
629 |
"psrad $" #shift ", %%mm4 \n\t"\ |
630 |
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ |
631 |
"movq %%mm4, 16+" #dst " \n\t"\ |
632 |
|
633 |
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
634 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
635 |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
636 |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
637 |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
638 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
639 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
640 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
641 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
642 |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
643 |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
644 |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
645 |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
646 |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
647 |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
648 |
#rounder ", %%mm4 \n\t"\ |
649 |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
650 |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
651 |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
652 |
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ |
653 |
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
654 |
#rounder ", %%mm0 \n\t"\ |
655 |
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ |
656 |
"paddd %%mm0, %%mm0 \n\t" \
|
657 |
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ |
658 |
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
659 |
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ |
660 |
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ |
661 |
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
662 |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
663 |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
664 |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
665 |
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ |
666 |
"psrad $" #shift ", %%mm7 \n\t"\ |
667 |
"psrad $" #shift ", %%mm4 \n\t"\ |
668 |
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ |
669 |
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ |
670 |
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
671 |
"psrad $" #shift ", %%mm1 \n\t"\ |
672 |
"psrad $" #shift ", %%mm2 \n\t"\ |
673 |
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ |
674 |
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ |
675 |
"movq %%mm7, " #dst " \n\t"\ |
676 |
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ |
677 |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
678 |
"movq %%mm2, 24+" #dst " \n\t"\ |
679 |
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
680 |
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ |
681 |
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
682 |
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
683 |
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ |
684 |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
685 |
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ |
686 |
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
687 |
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ |
688 |
"psrad $" #shift ", %%mm2 \n\t"\ |
689 |
"psrad $" #shift ", %%mm0 \n\t"\ |
690 |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
691 |
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ |
692 |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
693 |
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
694 |
"psrad $" #shift ", %%mm6 \n\t"\ |
695 |
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ |
696 |
"movq %%mm2, 8+" #dst " \n\t"\ |
697 |
"psrad $" #shift ", %%mm4 \n\t"\ |
698 |
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ |
699 |
"movq %%mm4, 16+" #dst " \n\t"\ |
700 |
|
701 |
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
|
702 |
DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) |
703 |
Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f) |
704 |
Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f) |
705 |
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f) |
706 |
|
707 |
#undef IDCT
|
708 |
#define IDCT(src0, src4, src1, src5, dst, shift) \
|
709 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
710 |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
711 |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
712 |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
713 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
714 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
715 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
716 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
717 |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
718 |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
719 |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
720 |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
721 |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
722 |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
723 |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
724 |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
725 |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
726 |
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
727 |
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ |
728 |
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ |
729 |
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ |
730 |
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
731 |
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
732 |
"paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ |
733 |
"movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ |
734 |
"pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
735 |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
736 |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
737 |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
738 |
"paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ |
739 |
"psrad $" #shift ", %%mm7 \n\t"\ |
740 |
"psrad $" #shift ", %%mm4 \n\t"\ |
741 |
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ |
742 |
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
743 |
"psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
744 |
"psrad $" #shift ", %%mm0 \n\t"\ |
745 |
"psrad $" #shift ", %%mm2 \n\t"\ |
746 |
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
747 |
"movd %%mm7, " #dst " \n\t"\ |
748 |
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
749 |
"movd %%mm0, 16+" #dst " \n\t"\ |
750 |
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
751 |
"movd %%mm2, 96+" #dst " \n\t"\ |
752 |
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
753 |
"movd %%mm4, 112+" #dst " \n\t"\ |
754 |
"movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ |
755 |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
756 |
"pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
757 |
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ |
758 |
"pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
759 |
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
760 |
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ |
761 |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
762 |
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ |
763 |
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
764 |
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ |
765 |
"psrad $" #shift ", %%mm2 \n\t"\ |
766 |
"psrad $" #shift ", %%mm5 \n\t"\ |
767 |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
768 |
"paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ |
769 |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
770 |
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
771 |
"psrad $" #shift ", %%mm6 \n\t"\ |
772 |
"psrad $" #shift ", %%mm4 \n\t"\ |
773 |
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
774 |
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
775 |
"movd %%mm2, 32+" #dst " \n\t"\ |
776 |
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ |
777 |
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
778 |
"movd %%mm6, 48+" #dst " \n\t"\ |
779 |
"movd %%mm4, 64+" #dst " \n\t"\ |
780 |
"movd %%mm5, 80+" #dst " \n\t" |
781 |
|
782 |
|
783 |
//IDCT( src0, src4, src1, src5, dst, shift)
|
784 |
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
785 |
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
786 |
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
787 |
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
788 |
"jmp 9f \n\t"
|
789 |
|
790 |
"#" ASMALIGN(4) \ |
791 |
"4: \n\t"
|
792 |
Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f) |
793 |
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f) |
794 |
|
795 |
#undef IDCT
|
796 |
#define IDCT(src0, src4, src1, src5, dst, shift) \
|
797 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
798 |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
799 |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
800 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
801 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
802 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
803 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
804 |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
805 |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
806 |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
807 |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
808 |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
809 |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
810 |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
811 |
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
812 |
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ |
813 |
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ |
814 |
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ |
815 |
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
816 |
"movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ |
817 |
"pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
818 |
"paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ |
819 |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
820 |
"psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
821 |
"psrad $" #shift ", %%mm1 \n\t"\ |
822 |
"psrad $" #shift ", %%mm4 \n\t"\ |
823 |
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ |
824 |
"paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
825 |
"psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
826 |
"psrad $" #shift ", %%mm0 \n\t"\ |
827 |
"psrad $" #shift ", %%mm2 \n\t"\ |
828 |
"packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ |
829 |
"movd %%mm1, " #dst " \n\t"\ |
830 |
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
831 |
"movd %%mm0, 16+" #dst " \n\t"\ |
832 |
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
833 |
"movd %%mm2, 96+" #dst " \n\t"\ |
834 |
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
835 |
"movd %%mm4, 112+" #dst " \n\t"\ |
836 |
"movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ |
837 |
"pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
838 |
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ |
839 |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
840 |
"paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
841 |
"psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\ |
842 |
"psrad $" #shift ", %%mm2 \n\t"\ |
843 |
"psrad $" #shift ", %%mm5 \n\t"\ |
844 |
"movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ |
845 |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
846 |
"psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\ |
847 |
"psrad $" #shift ", %%mm6 \n\t"\ |
848 |
"psrad $" #shift ", %%mm1 \n\t"\ |
849 |
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
850 |
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
851 |
"movd %%mm2, 32+" #dst " \n\t"\ |
852 |
"packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ |
853 |
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
854 |
"movd %%mm6, 48+" #dst " \n\t"\ |
855 |
"movd %%mm1, 64+" #dst " \n\t"\ |
856 |
"movd %%mm5, 80+" #dst " \n\t" |
857 |
|
858 |
//IDCT( src0, src4, src1, src5, dst, shift)
|
859 |
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
860 |
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
861 |
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
862 |
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
863 |
"jmp 9f \n\t"
|
864 |
|
865 |
"#" ASMALIGN(4) \ |
866 |
"6: \n\t"
|
867 |
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f) |
868 |
|
869 |
#undef IDCT
|
870 |
#define IDCT(src0, src4, src1, src5, dst, shift) \
|
871 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
872 |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
873 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
874 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
875 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
876 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
877 |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
878 |
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
879 |
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ |
880 |
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
881 |
"movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ |
882 |
"pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
883 |
"paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ |
884 |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
885 |
"psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
886 |
"psrad $" #shift ", %%mm1 \n\t"\ |
887 |
"psrad $" #shift ", %%mm4 \n\t"\ |
888 |
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ |
889 |
"paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
890 |
"psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
891 |
"psrad $" #shift ", %%mm0 \n\t"\ |
892 |
"psrad $" #shift ", %%mm2 \n\t"\ |
893 |
"packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ |
894 |
"movd %%mm1, " #dst " \n\t"\ |
895 |
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
896 |
"movd %%mm0, 16+" #dst " \n\t"\ |
897 |
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
898 |
"movd %%mm2, 96+" #dst " \n\t"\ |
899 |
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
900 |
"movd %%mm4, 112+" #dst " \n\t"\ |
901 |
"movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ |
902 |
"pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
903 |
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ |
904 |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
905 |
"paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
906 |
"psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\ |
907 |
"psrad $" #shift ", %%mm2 \n\t"\ |
908 |
"psrad $" #shift ", %%mm5 \n\t"\ |
909 |
"movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ |
910 |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
911 |
"psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\ |
912 |
"psrad $" #shift ", %%mm6 \n\t"\ |
913 |
"psrad $" #shift ", %%mm1 \n\t"\ |
914 |
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
915 |
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
916 |
"movd %%mm2, 32+" #dst " \n\t"\ |
917 |
"packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ |
918 |
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
919 |
"movd %%mm6, 48+" #dst " \n\t"\ |
920 |
"movd %%mm1, 64+" #dst " \n\t"\ |
921 |
"movd %%mm5, 80+" #dst " \n\t" |
922 |
|
923 |
|
924 |
//IDCT( src0, src4, src1, src5, dst, shift)
|
925 |
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
926 |
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
927 |
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
928 |
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
929 |
"jmp 9f \n\t"
|
930 |
|
931 |
"#" ASMALIGN(4) \ |
932 |
"2: \n\t"
|
933 |
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f) |
934 |
|
935 |
#undef IDCT
|
936 |
#define IDCT(src0, src4, src1, src5, dst, shift) \
|
937 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
938 |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
939 |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
940 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
941 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
942 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
943 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
944 |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
945 |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
946 |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
947 |
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
948 |
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ |
949 |
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
950 |
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
951 |
"paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ |
952 |
"movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ |
953 |
"pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
954 |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
955 |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
956 |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
957 |
"paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ |
958 |
"psrad $" #shift ", %%mm7 \n\t"\ |
959 |
"psrad $" #shift ", %%mm4 \n\t"\ |
960 |
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ |
961 |
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
962 |
"psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
963 |
"psrad $" #shift ", %%mm0 \n\t"\ |
964 |
"psrad $" #shift ", %%mm2 \n\t"\ |
965 |
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
966 |
"movd %%mm7, " #dst " \n\t"\ |
967 |
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
968 |
"movd %%mm0, 16+" #dst " \n\t"\ |
969 |
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
970 |
"movd %%mm2, 96+" #dst " \n\t"\ |
971 |
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
972 |
"movd %%mm4, 112+" #dst " \n\t"\ |
973 |
"movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ |
974 |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
975 |
"pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
976 |
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ |
977 |
"pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
978 |
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
979 |
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ |
980 |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
981 |
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ |
982 |
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
983 |
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ |
984 |
"psrad $" #shift ", %%mm2 \n\t"\ |
985 |
"psrad $" #shift ", %%mm5 \n\t"\ |
986 |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
987 |
"paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ |
988 |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
989 |
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
990 |
"psrad $" #shift ", %%mm6 \n\t"\ |
991 |
"psrad $" #shift ", %%mm4 \n\t"\ |
992 |
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
993 |
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
994 |
"movd %%mm2, 32+" #dst " \n\t"\ |
995 |
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ |
996 |
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
997 |
"movd %%mm6, 48+" #dst " \n\t"\ |
998 |
"movd %%mm4, 64+" #dst " \n\t"\ |
999 |
"movd %%mm5, 80+" #dst " \n\t" |
1000 |
|
1001 |
//IDCT( src0, src4, src1, src5, dst, shift)
|
1002 |
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
1003 |
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
1004 |
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
1005 |
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
1006 |
"jmp 9f \n\t"
|
1007 |
|
1008 |
"#" ASMALIGN(4) \ |
1009 |
"3: \n\t"
|
1010 |
#undef IDCT
|
1011 |
#define IDCT(src0, src4, src1, src5, dst, shift) \
|
1012 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
1013 |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
1014 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
1015 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
1016 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
1017 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
1018 |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
1019 |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
1020 |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
1021 |
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
1022 |
"movq 64(%2), %%mm3 \n\t"\
|
1023 |
"pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
1024 |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
1025 |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
1026 |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
1027 |
"psrad $" #shift ", %%mm7 \n\t"\ |
1028 |
"psrad $" #shift ", %%mm4 \n\t"\ |
1029 |
"movq %%mm0, %%mm1 \n\t" /* A1 a1 */\ |
1030 |
"paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
1031 |
"psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\ |
1032 |
"psrad $" #shift ", %%mm0 \n\t"\ |
1033 |
"psrad $" #shift ", %%mm1 \n\t"\ |
1034 |
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
1035 |
"movd %%mm7, " #dst " \n\t"\ |
1036 |
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
1037 |
"movd %%mm0, 16+" #dst " \n\t"\ |
1038 |
"packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\ |
1039 |
"movd %%mm1, 96+" #dst " \n\t"\ |
1040 |
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
1041 |
"movd %%mm4, 112+" #dst " \n\t"\ |
1042 |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
1043 |
"pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
1044 |
"pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
1045 |
"movq %%mm5, %%mm1 \n\t" /* A2 a2 */\ |
1046 |
"paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\ |
1047 |
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ |
1048 |
"psrad $" #shift ", %%mm1 \n\t"\ |
1049 |
"psrad $" #shift ", %%mm5 \n\t"\ |
1050 |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
1051 |
"paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
1052 |
"psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
1053 |
"psrad $" #shift ", %%mm6 \n\t"\ |
1054 |
"psrad $" #shift ", %%mm4 \n\t"\ |
1055 |
"packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\ |
1056 |
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
1057 |
"movd %%mm1, 32+" #dst " \n\t"\ |
1058 |
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ |
1059 |
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
1060 |
"movd %%mm6, 48+" #dst " \n\t"\ |
1061 |
"movd %%mm4, 64+" #dst " \n\t"\ |
1062 |
"movd %%mm5, 80+" #dst " \n\t" |
1063 |
|
1064 |
|
1065 |
//IDCT( src0, src4, src1, src5, dst, shift)
|
1066 |
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
1067 |
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
1068 |
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
1069 |
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
1070 |
"jmp 9f \n\t"
|
1071 |
|
1072 |
"#" ASMALIGN(4) \ |
1073 |
"5: \n\t"
|
1074 |
#undef IDCT
|
1075 |
#define IDCT(src0, src4, src1, src5, dst, shift) \
|
1076 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
1077 |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
1078 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
1079 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
1080 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
1081 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
1082 |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
1083 |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
1084 |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
1085 |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
1086 |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
1087 |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
1088 |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
1089 |
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
1090 |
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ |
1091 |
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ |
1092 |
"movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ |
1093 |
"movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\ |
1094 |
"movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ |
1095 |
"pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
1096 |
"movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ |
1097 |
"pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
1098 |
"movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ |
1099 |
"pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
1100 |
"pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
1101 |
"paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\ |
1102 |
"paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\ |
1103 |
"psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\ |
1104 |
"paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\ |
1105 |
"paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\ |
1106 |
"psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\ |
1107 |
"psrad $" #shift ", %%mm4 \n\t"\ |
1108 |
"psrad $" #shift ", %%mm7 \n\t"\ |
1109 |
"psrad $" #shift ", %%mm3 \n\t"\ |
1110 |
"packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\ |
1111 |
"movq %%mm4, " #dst " \n\t"\ |
1112 |
"psrad $" #shift ", %%mm0 \n\t"\ |
1113 |
"packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\ |
1114 |
"movq %%mm0, 16+" #dst " \n\t"\ |
1115 |
"movq %%mm0, 96+" #dst " \n\t"\ |
1116 |
"movq %%mm4, 112+" #dst " \n\t"\ |
1117 |
"psrad $" #shift ", %%mm5 \n\t"\ |
1118 |
"psrad $" #shift ", %%mm6 \n\t"\ |
1119 |
"psrad $" #shift ", %%mm2 \n\t"\ |
1120 |
"packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
1121 |
"movq %%mm5, 32+" #dst " \n\t"\ |
1122 |
"psrad $" #shift ", %%mm1 \n\t"\ |
1123 |
"packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
1124 |
"movq %%mm6, 48+" #dst " \n\t"\ |
1125 |
"movq %%mm6, 64+" #dst " \n\t"\ |
1126 |
"movq %%mm5, 80+" #dst " \n\t" |
1127 |
|
1128 |
|
1129 |
//IDCT( src0, src4, src1, src5, dst, shift)
|
1130 |
IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
1131 |
//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
|
1132 |
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
1133 |
//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
|
1134 |
"jmp 9f \n\t"
|
1135 |
|
1136 |
|
1137 |
"#" ASMALIGN(4) \ |
1138 |
"1: \n\t"
|
1139 |
#undef IDCT
|
1140 |
#define IDCT(src0, src4, src1, src5, dst, shift) \
|
1141 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
1142 |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
1143 |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
1144 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
1145 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
1146 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
1147 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
1148 |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
1149 |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
1150 |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
1151 |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
1152 |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
1153 |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
1154 |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
1155 |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
1156 |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
1157 |
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
1158 |
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ |
1159 |
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ |
1160 |
"movq 64(%2), %%mm1 \n\t"\
|
1161 |
"pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
1162 |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
1163 |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
1164 |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
1165 |
"psrad $" #shift ", %%mm7 \n\t"\ |
1166 |
"psrad $" #shift ", %%mm4 \n\t"\ |
1167 |
"movq %%mm0, %%mm3 \n\t" /* A1 a1 */\ |
1168 |
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
1169 |
"psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\ |
1170 |
"psrad $" #shift ", %%mm0 \n\t"\ |
1171 |
"psrad $" #shift ", %%mm3 \n\t"\ |
1172 |
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
1173 |
"movd %%mm7, " #dst " \n\t"\ |
1174 |
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
1175 |
"movd %%mm0, 16+" #dst " \n\t"\ |
1176 |
"packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\ |
1177 |
"movd %%mm3, 96+" #dst " \n\t"\ |
1178 |
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
1179 |
"movd %%mm4, 112+" #dst " \n\t"\ |
1180 |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
1181 |
"pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
1182 |
"pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
1183 |
"movq %%mm5, %%mm3 \n\t" /* A2 a2 */\ |
1184 |
"paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\ |
1185 |
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ |
1186 |
"psrad $" #shift ", %%mm3 \n\t"\ |
1187 |
"psrad $" #shift ", %%mm5 \n\t"\ |
1188 |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
1189 |
"paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
1190 |
"psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
1191 |
"psrad $" #shift ", %%mm6 \n\t"\ |
1192 |
"packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\ |
1193 |
"movd %%mm3, 32+" #dst " \n\t"\ |
1194 |
"psrad $" #shift ", %%mm4 \n\t"\ |
1195 |
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
1196 |
"movd %%mm6, 48+" #dst " \n\t"\ |
1197 |
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ |
1198 |
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
1199 |
"movd %%mm4, 64+" #dst " \n\t"\ |
1200 |
"movd %%mm5, 80+" #dst " \n\t" |
1201 |
|
1202 |
|
1203 |
//IDCT( src0, src4, src1, src5, dst, shift)
|
1204 |
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
1205 |
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
1206 |
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
1207 |
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
1208 |
"jmp 9f \n\t"
|
1209 |
|
1210 |
|
1211 |
"#" ASMALIGN(4) |
1212 |
"7: \n\t"
|
1213 |
#undef IDCT
|
1214 |
#define IDCT(src0, src4, src1, src5, dst, shift) \
|
1215 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
1216 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
1217 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
1218 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
1219 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
1220 |
"psrad $" #shift ", %%mm4 \n\t"\ |
1221 |
"psrad $" #shift ", %%mm0 \n\t"\ |
1222 |
"movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ |
1223 |
"movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ |
1224 |
"pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
1225 |
"movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ |
1226 |
"pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
1227 |
"movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ |
1228 |
"psrad $" #shift ", %%mm1 \n\t"\ |
1229 |
"packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\ |
1230 |
"movq %%mm4, " #dst " \n\t"\ |
1231 |
"psrad $" #shift ", %%mm2 \n\t"\ |
1232 |
"packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\ |
1233 |
"movq %%mm0, 16+" #dst " \n\t"\ |
1234 |
"movq %%mm0, 96+" #dst " \n\t"\ |
1235 |
"movq %%mm4, 112+" #dst " \n\t"\ |
1236 |
"movq %%mm0, 32+" #dst " \n\t"\ |
1237 |
"movq %%mm4, 48+" #dst " \n\t"\ |
1238 |
"movq %%mm4, 64+" #dst " \n\t"\ |
1239 |
"movq %%mm0, 80+" #dst " \n\t" |
1240 |
|
1241 |
//IDCT( src0, src4, src1, src5, dst, shift)
|
1242 |
IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
1243 |
//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
|
1244 |
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
1245 |
//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
|
1246 |
|
1247 |
|
1248 |
#endif
|
1249 |
|
1250 |
/*
|
1251 |
Input
|
1252 |
00 40 04 44 20 60 24 64
|
1253 |
10 30 14 34 50 70 54 74
|
1254 |
01 41 03 43 21 61 23 63
|
1255 |
11 31 13 33 51 71 53 73
|
1256 |
02 42 06 46 22 62 26 66
|
1257 |
12 32 16 36 52 72 56 76
|
1258 |
05 45 07 47 25 65 27 67
|
1259 |
15 35 17 37 55 75 57 77
|
1260 |
|
1261 |
Temp
|
1262 |
00 04 10 14 20 24 30 34
|
1263 |
40 44 50 54 60 64 70 74
|
1264 |
01 03 11 13 21 23 31 33
|
1265 |
41 43 51 53 61 63 71 73
|
1266 |
02 06 12 16 22 26 32 36
|
1267 |
42 46 52 56 62 66 72 76
|
1268 |
05 07 15 17 25 27 35 37
|
1269 |
45 47 55 57 65 67 75 77
|
1270 |
*/
|
1271 |
|
1272 |
"9: \n\t"
|
1273 |
:: "r" (block), "r" (temp), "r" (coeffs) |
1274 |
: "%eax"
|
1275 |
); |
1276 |
} |
1277 |
|
1278 |
/**
 * Public entry point for the MMX simple IDCT.
 *
 * Forwards to the file-local idct(), which transforms the coefficients
 * and writes the result back into the same buffer.
 *
 * @param block coefficient buffer, transformed in place
 */
void ff_simple_idct_mmx(int16_t *block)
{
    idct(block);
}
1282 |
|
1283 |
//FIXME merge add/put into the idct
|
1284 |
|
1285 |
/**
 * IDCT followed by a "put" of the result.
 *
 * Transforms block in place with idct(), then passes the transformed
 * coefficients to put_pixels_clamped_mmx() together with dest and line_size.
 * NOTE(review): presumably the helper clamps each value to 8 bits and
 * overwrites dest using line_size as the row stride -- confirm in dsputil.
 *
 * @param dest      destination pixel buffer
 * @param line_size forwarded unchanged to put_pixels_clamped_mmx()
 * @param block     coefficient buffer, transformed in place
 */
void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    idct(block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
1290 |
/**
 * IDCT followed by an "add" of the result.
 *
 * Transforms block in place with idct(), then passes the transformed
 * coefficients to add_pixels_clamped_mmx() together with dest and line_size.
 * NOTE(review): presumably the helper adds each value to the existing
 * pixels in dest and clamps to 8 bits, using line_size as the row stride --
 * confirm in dsputil.
 *
 * @param dest      destination pixel buffer
 * @param line_size forwarded unchanged to add_pixels_clamped_mmx()
 * @param block     coefficient buffer, transformed in place
 */
void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    idct(block);
    add_pixels_clamped_mmx(block, dest, line_size);
}