ffmpeg / libavcodec / i386 / simple_idct_mmx.c @ 5509bffa
/*
 * Simple IDCT MMX
 *
 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "../dsputil.h"
#include "../simple_idct.h"

/*
23170.475006
22725.260826
21406.727617
19265.545870
16384.000000
12872.826198
8866.956905
4520.335430
*/
#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#if 0
#define C4 16384 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#else
#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
#endif
#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5

#define ROW_SHIFT 11
#define COL_SHIFT 20 // 6

static const uint64_t attribute_used __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL;
static const uint64_t attribute_used __attribute__((aligned(8))) d40000= 0x0000000000040000ULL;

static const int16_t __attribute__((aligned(8))) coeffs[]= {
        1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
//        1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
//        0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
        1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
        // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
//        0, 0, 0, 0,
//        0, 0, 0, 0,

 C4,  C4,  C4,  C4,
 C4, -C4,  C4, -C4,

 C2,  C6,  C2,  C6,
 C6, -C2,  C6, -C2,

 C1,  C3,  C1,  C3,
 C5,  C7,  C5,  C7,

 C3, -C7,  C3, -C7,
-C1, -C5, -C1, -C5,

 C5, -C1,  C5, -C1,
 C7,  C3,  C7,  C3,

 C7, -C5,  C7, -C5,
 C3, -C1,  C3, -C1
};

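/*
 * Editor's note (added; a sketch, not part of the original build): the C0..C7
 * values above are sqrt(2)*cos(i*M_PI/16) scaled by 1<<14, as the comments on
 * each #define say; the active branch rounds C4 down to 16383 rather than
 * 16384.  The "1" in the second rounder row of coeffs[] sits in the upper
 * halfword of its first 32-bit lane, i.e. it adds
 * 1<<16 == ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT to the row rounder, which is
 * apparently how the column-pass rounding gets folded into the row pass (the
 * column IDCTs below run with a /nop rounder).  A hypothetical helper to
 * regenerate the table could look like this:
 */
#if 0
#include <math.h>
#include <stdio.h>
static void print_idct_coeffs(void)
{
    int i;
    for (i = 0; i < 8; i++)                       /* C0..C7 as defined above */
        printf("#define C%d %d\n", i,
               (int)(cos(i * M_PI / 16.0) * sqrt(2.0) * (1 << 14) + 0.5));
}
#endif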
#if 0
static void unused_var_killer(){
        int a= wm1010 + d40000;
        temp[0]=a;
}

static void inline idctCol (int16_t * col, int16_t *input)
{
#undef C0
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
        int a0, a1, a2, a3, b0, b1, b2, b3;
        const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
        const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
        const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
        const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
        const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
        const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
        const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
        const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
/*
        if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
                col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
                        col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
                return;
        }*/

        col[8*0] = input[8*0 + 0];
        col[8*1] = input[8*2 + 0];
        col[8*2] = input[8*0 + 1];
        col[8*3] = input[8*2 + 1];
        col[8*4] = input[8*4 + 0];
        col[8*5] = input[8*6 + 0];
        col[8*6] = input[8*4 + 1];
        col[8*7] = input[8*6 + 1];

        a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
        a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
        a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
        a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));

        b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
        b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
        b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
        b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];

        col[8*0] = (a0 + b0) >> COL_SHIFT;
        col[8*1] = (a1 + b1) >> COL_SHIFT;
        col[8*2] = (a2 + b2) >> COL_SHIFT;
        col[8*3] = (a3 + b3) >> COL_SHIFT;
        col[8*4] = (a3 - b3) >> COL_SHIFT;
        col[8*5] = (a2 - b2) >> COL_SHIFT;
        col[8*6] = (a1 - b1) >> COL_SHIFT;
        col[8*7] = (a0 - b0) >> COL_SHIFT;
}

static void inline idctRow (int16_t * output, int16_t * input)
{
        int16_t row[8];

        int a0, a1, a2, a3, b0, b1, b2, b3;
        const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
        const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
        const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
        const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
        const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
        const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
        const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
        const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5

        row[0] = input[0];
        row[2] = input[1];
        row[4] = input[4];
        row[6] = input[5];
        row[1] = input[8];
        row[3] = input[9];
        row[5] = input[12];
        row[7] = input[13];

        if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
                row[0] = row[1] = row[2] = row[3] = row[4] =
                        row[5] = row[6] = row[7] = row[0]<<3;
                output[0]  = row[0];
                output[2]  = row[1];
                output[4]  = row[2];
                output[6]  = row[3];
                output[8]  = row[4];
                output[10] = row[5];
                output[12] = row[6];
                output[14] = row[7];
                return;
        }

        a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
        a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
        a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
        a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));

        b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
        b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
        b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
        b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];

        row[0] = (a0 + b0) >> ROW_SHIFT;
        row[1] = (a1 + b1) >> ROW_SHIFT;
        row[2] = (a2 + b2) >> ROW_SHIFT;
        row[3] = (a3 + b3) >> ROW_SHIFT;
        row[4] = (a3 - b3) >> ROW_SHIFT;
        row[5] = (a2 - b2) >> ROW_SHIFT;
        row[6] = (a1 - b1) >> ROW_SHIFT;
        row[7] = (a0 - b0) >> ROW_SHIFT;

        output[0]  = row[0];
        output[2]  = row[1];
        output[4]  = row[2];
        output[6]  = row[3];
        output[8]  = row[4];
        output[10] = row[5];
        output[12] = row[6];
        output[14] = row[7];
}
#endif

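/*
 * Overview (editor's summary, inferred from the code below): idct() performs
 * the 8x8 inverse DCT in two passes over 16-bit coefficients laid out in the
 * permuted order documented by the "Input"/"Temp" tables near the end of
 * this file.  The row pass multiplies by the C0..C7 table with pmaddwd,
 * rounds, shifts by ROW_SHIFT (11) and stores into the on-stack temp buffer;
 * the column pass repeats the butterfly with COL_SHIFT (20) and writes back
 * into block.  DC_COND_IDCT uses the wm1010 mask to detect a near-empty
 * first group and then takes a cheap shift-and-replicate path instead of the
 * full butterfly, while Z_COND_IDCT jumps to one of the specialised column
 * variants at labels 1..7 when an entire row group is zero, so sparse blocks
 * skip most of the arithmetic.
 */
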
static inline void idct(int16_t *block)
{
        int64_t __attribute__((aligned(8))) align_tmp[16];
        int16_t * const temp= (int16_t*)align_tmp;

        asm volatile(
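                /* Operand map for the asm statement below (from the
                 * constraint list just before the closing parenthesis):
                 * %0 = block, %1 = temp (row-pass scratch buffer),
                 * %2 = coeffs; %%eax is clobbered by the zero-group tests. */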
214 |
#if 0 //Alternative, simpler variant
|
215 |
|
216 |
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
217 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
|
218 |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
|
219 |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
|
220 |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
|
221 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
|
222 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
223 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
|
224 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
225 |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
|
226 |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
|
227 |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
|
228 |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
|
229 |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
|
230 |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
|
231 |
#rounder ", %%mm4 \n\t"\
|
232 |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
233 |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
|
234 |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
|
235 |
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
|
236 |
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
|
237 |
#rounder ", %%mm0 \n\t"\
|
238 |
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
|
239 |
"paddd %%mm0, %%mm0 \n\t" \
|
240 |
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
|
241 |
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
|
242 |
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
|
243 |
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
|
244 |
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
|
245 |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
|
246 |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
|
247 |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
248 |
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
|
249 |
"psrad $" #shift ", %%mm7 \n\t"\
|
250 |
"psrad $" #shift ", %%mm4 \n\t"\
|
251 |
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
|
252 |
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
|
253 |
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
|
254 |
"psrad $" #shift ", %%mm1 \n\t"\
|
255 |
"psrad $" #shift ", %%mm2 \n\t"\
|
256 |
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
|
257 |
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
|
258 |
"movq %%mm7, " #dst " \n\t"\
|
259 |
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
|
260 |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
|
261 |
"movq %%mm2, 24+" #dst " \n\t"\
|
262 |
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
|
263 |
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
|
264 |
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
|
265 |
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
|
266 |
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
|
267 |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
|
268 |
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
|
269 |
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
|
270 |
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
|
271 |
"psrad $" #shift ", %%mm2 \n\t"\
|
272 |
"psrad $" #shift ", %%mm0 \n\t"\
|
273 |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
|
274 |
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
|
275 |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
276 |
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
|
277 |
"psrad $" #shift ", %%mm6 \n\t"\
|
278 |
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
|
279 |
"movq %%mm2, 8+" #dst " \n\t"\
|
280 |
"psrad $" #shift ", %%mm4 \n\t"\
|
281 |
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
|
282 |
"movq %%mm4, 16+" #dst " \n\t"\
|
283 |
|
284 |
#define COL_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
285 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
|
286 |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
|
287 |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
|
288 |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
|
289 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
|
290 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
291 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
|
292 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
293 |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
|
294 |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
|
295 |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
|
296 |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
|
297 |
#rounder ", %%mm4 \n\t"\
|
298 |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
299 |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
|
300 |
#rounder ", %%mm0 \n\t"\
|
301 |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
|
302 |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
|
303 |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
|
304 |
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
305 |
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
|
306 |
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
|
307 |
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
|
308 |
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
|
309 |
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
|
310 |
"paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
|
311 |
"movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
|
312 |
"pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
|
313 |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
|
314 |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
|
315 |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
316 |
"paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
|
317 |
"psrad $" #shift ", %%mm7 \n\t"\
|
318 |
"psrad $" #shift ", %%mm4 \n\t"\
|
319 |
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
|
320 |
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
|
321 |
"psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
|
322 |
"psrad $" #shift ", %%mm0 \n\t"\
|
323 |
"psrad $" #shift ", %%mm2 \n\t"\
|
324 |
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
|
325 |
"movd %%mm7, " #dst " \n\t"\
|
326 |
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
|
327 |
"movd %%mm0, 16+" #dst " \n\t"\
|
328 |
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
|
329 |
"movd %%mm2, 96+" #dst " \n\t"\
|
330 |
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
331 |
"movd %%mm4, 112+" #dst " \n\t"\
|
332 |
"movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
|
333 |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
|
334 |
"pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
|
335 |
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
|
336 |
"pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
|
337 |
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
|
338 |
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
|
339 |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
|
340 |
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
|
341 |
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
|
342 |
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
|
343 |
"psrad $" #shift ", %%mm2 \n\t"\
|
344 |
"psrad $" #shift ", %%mm5 \n\t"\
|
345 |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
|
346 |
"paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
|
347 |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
348 |
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
|
349 |
"psrad $" #shift ", %%mm6 \n\t"\
|
350 |
"psrad $" #shift ", %%mm4 \n\t"\
|
351 |
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
|
352 |
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
353 |
"movd %%mm2, 32+" #dst " \n\t"\
|
354 |
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
|
355 |
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
|
356 |
"movd %%mm6, 48+" #dst " \n\t"\
|
357 |
"movd %%mm4, 64+" #dst " \n\t"\
|
358 |
"movd %%mm5, 80+" #dst " \n\t"\
|
359 |
|
360 |
|
361 |
#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
362 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
|
363 |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
|
364 |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
|
365 |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
|
366 |
"movq "MANGLE(wm1010)", %%mm4 \n\t"\
|
367 |
"pand %%mm0, %%mm4 \n\t"\
|
368 |
"por %%mm1, %%mm4 \n\t"\
|
369 |
"por %%mm2, %%mm4 \n\t"\
|
370 |
"por %%mm3, %%mm4 \n\t"\
|
371 |
"packssdw %%mm4,%%mm4 \n\t"\
|
372 |
"movd %%mm4, %%eax \n\t"\
|
373 |
"orl %%eax, %%eax \n\t"\
|
374 |
"jz 1f \n\t"\
|
375 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
|
376 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
377 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
|
378 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
379 |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
|
380 |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
|
381 |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
|
382 |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
|
383 |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
|
384 |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
|
385 |
#rounder ", %%mm4 \n\t"\
|
386 |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
387 |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
|
388 |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
|
389 |
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
|
390 |
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
|
391 |
#rounder ", %%mm0 \n\t"\
|
392 |
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
|
393 |
"paddd %%mm0, %%mm0 \n\t" \
|
394 |
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
|
395 |
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
|
396 |
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
|
397 |
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
|
398 |
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
|
399 |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
|
400 |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
|
401 |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
402 |
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
|
403 |
"psrad $" #shift ", %%mm7 \n\t"\
|
404 |
"psrad $" #shift ", %%mm4 \n\t"\
|
405 |
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
|
406 |
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
|
407 |
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
|
408 |
"psrad $" #shift ", %%mm1 \n\t"\
|
409 |
"psrad $" #shift ", %%mm2 \n\t"\
|
410 |
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
|
411 |
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
|
412 |
"movq %%mm7, " #dst " \n\t"\
|
413 |
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
|
414 |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
|
415 |
"movq %%mm2, 24+" #dst " \n\t"\
|
416 |
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
|
417 |
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
|
418 |
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
|
419 |
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
|
420 |
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
|
421 |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
|
422 |
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
|
423 |
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
|
424 |
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
|
425 |
"psrad $" #shift ", %%mm2 \n\t"\
|
426 |
"psrad $" #shift ", %%mm0 \n\t"\
|
427 |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
|
428 |
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
|
429 |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
430 |
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
|
431 |
"psrad $" #shift ", %%mm6 \n\t"\
|
432 |
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
|
433 |
"movq %%mm2, 8+" #dst " \n\t"\
|
434 |
"psrad $" #shift ", %%mm4 \n\t"\
|
435 |
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
|
436 |
"movq %%mm4, 16+" #dst " \n\t"\
|
437 |
"jmp 2f \n\t"\
|
438 |
"1: \n\t"\
|
439 |
"pslld $16, %%mm0 \n\t"\
|
440 |
"#paddd "MANGLE(d40000)", %%mm0 \n\t"\
|
441 |
"psrad $13, %%mm0 \n\t"\
|
442 |
"packssdw %%mm0, %%mm0 \n\t"\
|
443 |
"movq %%mm0, " #dst " \n\t"\
|
444 |
"movq %%mm0, 8+" #dst " \n\t"\
|
445 |
"movq %%mm0, 16+" #dst " \n\t"\
|
446 |
"movq %%mm0, 24+" #dst " \n\t"\
|
447 |
"2: \n\t"
|
448 |
|
449 |
|
450 |
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
|
451 |
ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
|
452 |
/*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
|
453 |
ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
|
454 |
ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
|
455 |
|
456 |
DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
|
457 |
DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
|
458 |
DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
|
459 |
|
460 |
|
461 |
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
|
462 |
COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
|
463 |
COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
|
464 |
COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
|
465 |
COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
|
466 |
|
467 |
#else
|
468 |
|
469 |
#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
470 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
471 |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
472 |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
473 |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
474 |
"movq "MANGLE(wm1010)", %%mm4 \n\t"\ |
475 |
"pand %%mm0, %%mm4 \n\t"\
|
476 |
"por %%mm1, %%mm4 \n\t"\
|
477 |
"por %%mm2, %%mm4 \n\t"\
|
478 |
"por %%mm3, %%mm4 \n\t"\
|
479 |
"packssdw %%mm4,%%mm4 \n\t"\
|
480 |
"movd %%mm4, %%eax \n\t"\
|
481 |
"orl %%eax, %%eax \n\t"\
|
482 |
"jz 1f \n\t"\
|
483 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
484 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
485 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
486 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
487 |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
488 |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
489 |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
490 |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
491 |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
492 |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
493 |
#rounder ", %%mm4 \n\t"\ |
494 |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
495 |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
496 |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
497 |
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ |
498 |
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
499 |
#rounder ", %%mm0 \n\t"\ |
500 |
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ |
501 |
"paddd %%mm0, %%mm0 \n\t" \
|
502 |
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ |
503 |
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
504 |
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ |
505 |
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ |
506 |
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
507 |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
508 |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
509 |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
510 |
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ |
511 |
"psrad $" #shift ", %%mm7 \n\t"\ |
512 |
"psrad $" #shift ", %%mm4 \n\t"\ |
513 |
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ |
514 |
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ |
515 |
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
516 |
"psrad $" #shift ", %%mm1 \n\t"\ |
517 |
"psrad $" #shift ", %%mm2 \n\t"\ |
518 |
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ |
519 |
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ |
520 |
"movq %%mm7, " #dst " \n\t"\ |
521 |
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ |
522 |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
523 |
"movq %%mm2, 24+" #dst " \n\t"\ |
524 |
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
525 |
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ |
526 |
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
527 |
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
528 |
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ |
529 |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
530 |
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ |
531 |
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
532 |
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ |
533 |
"psrad $" #shift ", %%mm2 \n\t"\ |
534 |
"psrad $" #shift ", %%mm0 \n\t"\ |
535 |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
536 |
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ |
537 |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
538 |
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
539 |
"psrad $" #shift ", %%mm6 \n\t"\ |
540 |
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ |
541 |
"movq %%mm2, 8+" #dst " \n\t"\ |
542 |
"psrad $" #shift ", %%mm4 \n\t"\ |
543 |
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ |
544 |
"movq %%mm4, 16+" #dst " \n\t"\ |
545 |
"jmp 2f \n\t"\
|
546 |
"1: \n\t"\
|
547 |
"pslld $16, %%mm0 \n\t"\
|
548 |
"paddd "MANGLE(d40000)", %%mm0 \n\t"\ |
549 |
"psrad $13, %%mm0 \n\t"\
|
550 |
"packssdw %%mm0, %%mm0 \n\t"\
|
551 |
"movq %%mm0, " #dst " \n\t"\ |
552 |
"movq %%mm0, 8+" #dst " \n\t"\ |
553 |
"movq %%mm0, 16+" #dst " \n\t"\ |
554 |
"movq %%mm0, 24+" #dst " \n\t"\ |
555 |
"2: \n\t"
|
556 |
|
557 |
#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
|
558 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
559 |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
560 |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
561 |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
562 |
"movq %%mm0, %%mm4 \n\t"\
|
563 |
"por %%mm1, %%mm4 \n\t"\
|
564 |
"por %%mm2, %%mm4 \n\t"\
|
565 |
"por %%mm3, %%mm4 \n\t"\
|
566 |
"packssdw %%mm4,%%mm4 \n\t"\
|
567 |
"movd %%mm4, %%eax \n\t"\
|
568 |
"orl %%eax, %%eax \n\t"\
|
569 |
"jz " #bt " \n\t"\ |
570 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
571 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
572 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
573 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
574 |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
575 |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
576 |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
577 |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
578 |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
579 |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
580 |
#rounder ", %%mm4 \n\t"\ |
581 |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
582 |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
583 |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
584 |
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ |
585 |
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
586 |
#rounder ", %%mm0 \n\t"\ |
587 |
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ |
588 |
"paddd %%mm0, %%mm0 \n\t" \
|
589 |
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ |
590 |
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
591 |
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ |
592 |
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ |
593 |
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
594 |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
595 |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
596 |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
597 |
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ |
598 |
"psrad $" #shift ", %%mm7 \n\t"\ |
599 |
"psrad $" #shift ", %%mm4 \n\t"\ |
600 |
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ |
601 |
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ |
602 |
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
603 |
"psrad $" #shift ", %%mm1 \n\t"\ |
604 |
"psrad $" #shift ", %%mm2 \n\t"\ |
605 |
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ |
606 |
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ |
607 |
"movq %%mm7, " #dst " \n\t"\ |
608 |
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ |
609 |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
610 |
"movq %%mm2, 24+" #dst " \n\t"\ |
611 |
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
612 |
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ |
613 |
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
614 |
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
615 |
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ |
616 |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
617 |
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ |
618 |
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
619 |
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ |
620 |
"psrad $" #shift ", %%mm2 \n\t"\ |
621 |
"psrad $" #shift ", %%mm0 \n\t"\ |
622 |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
623 |
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ |
624 |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
625 |
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
626 |
"psrad $" #shift ", %%mm6 \n\t"\ |
627 |
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ |
628 |
"movq %%mm2, 8+" #dst " \n\t"\ |
629 |
"psrad $" #shift ", %%mm4 \n\t"\ |
630 |
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ |
631 |
"movq %%mm4, 16+" #dst " \n\t"\ |
632 |
|
633 |
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
634 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
635 |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
636 |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
637 |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
638 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
639 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
640 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
641 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
642 |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
643 |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
644 |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
645 |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
646 |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
647 |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
648 |
#rounder ", %%mm4 \n\t"\ |
649 |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
650 |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
651 |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
652 |
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ |
653 |
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
654 |
#rounder ", %%mm0 \n\t"\ |
655 |
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ |
656 |
"paddd %%mm0, %%mm0 \n\t" \
|
657 |
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ |
658 |
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
659 |
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ |
660 |
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ |
661 |
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
662 |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
663 |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
664 |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
665 |
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ |
666 |
"psrad $" #shift ", %%mm7 \n\t"\ |
667 |
"psrad $" #shift ", %%mm4 \n\t"\ |
668 |
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ |
669 |
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ |
670 |
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
671 |
"psrad $" #shift ", %%mm1 \n\t"\ |
672 |
"psrad $" #shift ", %%mm2 \n\t"\ |
673 |
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ |
674 |
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ |
675 |
"movq %%mm7, " #dst " \n\t"\ |
676 |
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ |
677 |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
678 |
"movq %%mm2, 24+" #dst " \n\t"\ |
679 |
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
680 |
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ |
681 |
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
682 |
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
683 |
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ |
684 |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
685 |
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ |
686 |
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
687 |
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ |
688 |
"psrad $" #shift ", %%mm2 \n\t"\ |
689 |
"psrad $" #shift ", %%mm0 \n\t"\ |
690 |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
691 |
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ |
692 |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
693 |
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
694 |
"psrad $" #shift ", %%mm6 \n\t"\ |
695 |
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ |
696 |
"movq %%mm2, 8+" #dst " \n\t"\ |
697 |
"psrad $" #shift ", %%mm4 \n\t"\ |
698 |
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ |
699 |
"movq %%mm4, 16+" #dst " \n\t"\ |
700 |
|
701 |
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
|
702 |
DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) |
703 |
Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f) |
704 |
Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f) |
705 |
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f) |
706 |
|
707 |
#undef IDCT
|
708 |
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
709 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
710 |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
711 |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
712 |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
713 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
714 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
715 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
716 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
717 |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
718 |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
719 |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
720 |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
721 |
#rounder ", %%mm4 \n\t"\ |
722 |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
723 |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
724 |
#rounder ", %%mm0 \n\t"\ |
725 |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
726 |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
727 |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
728 |
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
729 |
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ |
730 |
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ |
731 |
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ |
732 |
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
733 |
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
734 |
"paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ |
735 |
"movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ |
736 |
"pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
737 |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
738 |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
739 |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
740 |
"paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ |
741 |
"psrad $" #shift ", %%mm7 \n\t"\ |
742 |
"psrad $" #shift ", %%mm4 \n\t"\ |
743 |
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ |
744 |
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
745 |
"psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
746 |
"psrad $" #shift ", %%mm0 \n\t"\ |
747 |
"psrad $" #shift ", %%mm2 \n\t"\ |
748 |
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
749 |
"movd %%mm7, " #dst " \n\t"\ |
750 |
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
751 |
"movd %%mm0, 16+" #dst " \n\t"\ |
752 |
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
753 |
"movd %%mm2, 96+" #dst " \n\t"\ |
754 |
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
755 |
"movd %%mm4, 112+" #dst " \n\t"\ |
756 |
"movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ |
757 |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
758 |
"pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
759 |
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ |
760 |
"pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
761 |
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
762 |
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ |
763 |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
764 |
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ |
765 |
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
766 |
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ |
767 |
"psrad $" #shift ", %%mm2 \n\t"\ |
768 |
"psrad $" #shift ", %%mm5 \n\t"\ |
769 |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
770 |
"paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ |
771 |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
772 |
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
773 |
"psrad $" #shift ", %%mm6 \n\t"\ |
774 |
"psrad $" #shift ", %%mm4 \n\t"\ |
775 |
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
776 |
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
777 |
"movd %%mm2, 32+" #dst " \n\t"\ |
778 |
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ |
779 |
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
780 |
"movd %%mm6, 48+" #dst " \n\t"\ |
781 |
"movd %%mm4, 64+" #dst " \n\t"\ |
782 |
"movd %%mm5, 80+" #dst " \n\t" |
783 |
|
784 |
|
785 |
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
|
786 |
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) |
787 |
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) |
788 |
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) |
789 |
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) |
790 |
"jmp 9f \n\t"
|
791 |
|
792 |
"#.balign 16 \n\t"\
|
793 |
"4: \n\t"
|
794 |
Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f) |
795 |
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f) |
796 |
|
797 |
#undef IDCT
|
798 |
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
799 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
800 |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
801 |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
802 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
803 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
804 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
805 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
806 |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
807 |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
808 |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
809 |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
810 |
#rounder ", %%mm4 \n\t"\ |
811 |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
812 |
#rounder ", %%mm0 \n\t"\ |
813 |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
814 |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
815 |
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
816 |
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ |
817 |
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ |
818 |
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ |
819 |
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
820 |
"movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ |
821 |
"pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
822 |
"paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ |
823 |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
824 |
"psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
825 |
"psrad $" #shift ", %%mm1 \n\t"\ |
826 |
"psrad $" #shift ", %%mm4 \n\t"\ |
827 |
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ |
828 |
"paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
829 |
"psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
830 |
"psrad $" #shift ", %%mm0 \n\t"\ |
831 |
"psrad $" #shift ", %%mm2 \n\t"\ |
832 |
"packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ |
833 |
"movd %%mm1, " #dst " \n\t"\ |
834 |
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
835 |
"movd %%mm0, 16+" #dst " \n\t"\ |
836 |
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
837 |
"movd %%mm2, 96+" #dst " \n\t"\ |
838 |
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
839 |
"movd %%mm4, 112+" #dst " \n\t"\ |
840 |
"movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ |
841 |
"pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
842 |
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ |
843 |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
844 |
"paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
845 |
"psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\ |
846 |
"psrad $" #shift ", %%mm2 \n\t"\ |
847 |
"psrad $" #shift ", %%mm5 \n\t"\ |
848 |
"movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ |
849 |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
850 |
"psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\ |
851 |
"psrad $" #shift ", %%mm6 \n\t"\ |
852 |
"psrad $" #shift ", %%mm1 \n\t"\ |
853 |
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
854 |
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
855 |
"movd %%mm2, 32+" #dst " \n\t"\ |
856 |
"packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ |
857 |
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
858 |
"movd %%mm6, 48+" #dst " \n\t"\ |
859 |
"movd %%mm1, 64+" #dst " \n\t"\ |
860 |
"movd %%mm5, 80+" #dst " \n\t" |
861 |
|
862 |
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
|
863 |
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) |
864 |
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) |
865 |
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) |
866 |
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) |
867 |
"jmp 9f \n\t"
|
868 |
|
869 |
"#.balign 16 \n\t"\
|
870 |
"6: \n\t"
|
871 |
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f) |
872 |
|
873 |
#undef IDCT
|
874 |
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
875 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
876 |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
877 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
878 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
879 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
880 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
881 |
#rounder ", %%mm4 \n\t"\ |
882 |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
883 |
#rounder ", %%mm0 \n\t"\ |
884 |
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
885 |
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ |
886 |
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
887 |
"movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ |
888 |
"pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
889 |
"paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ |
890 |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
891 |
"psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
892 |
"psrad $" #shift ", %%mm1 \n\t"\ |
893 |
"psrad $" #shift ", %%mm4 \n\t"\ |
894 |
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ |
895 |
"paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
896 |
"psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
897 |
"psrad $" #shift ", %%mm0 \n\t"\ |
898 |
"psrad $" #shift ", %%mm2 \n\t"\ |
899 |
"packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ |
900 |
"movd %%mm1, " #dst " \n\t"\ |
901 |
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
902 |
"movd %%mm0, 16+" #dst " \n\t"\ |
903 |
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
904 |
"movd %%mm2, 96+" #dst " \n\t"\ |
905 |
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
906 |
"movd %%mm4, 112+" #dst " \n\t"\ |
907 |
"movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ |
908 |
"pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
909 |
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ |
910 |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
911 |
"paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
912 |
"psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\ |
913 |
"psrad $" #shift ", %%mm2 \n\t"\ |
914 |
"psrad $" #shift ", %%mm5 \n\t"\ |
915 |
"movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ |
916 |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
917 |
"psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\ |
918 |
"psrad $" #shift ", %%mm6 \n\t"\ |
919 |
"psrad $" #shift ", %%mm1 \n\t"\ |
920 |
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
921 |
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
922 |
"movd %%mm2, 32+" #dst " \n\t"\ |
923 |
"packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ |
924 |
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
925 |
"movd %%mm6, 48+" #dst " \n\t"\ |
926 |
"movd %%mm1, 64+" #dst " \n\t"\ |
927 |
"movd %%mm5, 80+" #dst " \n\t" |
928 |
|
929 |
|
930 |
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
|
931 |
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) |
932 |
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) |
933 |
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) |
934 |
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) |
935 |
"jmp 9f \n\t"
|
936 |
|
937 |
"#.balign 16 \n\t"\
|
938 |
"2: \n\t"
|
939 |
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f) |
940 |
|
941 |
#undef IDCT
|
942 |
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
943 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
944 |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
945 |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
946 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
947 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
948 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
949 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
950 |
#rounder ", %%mm4 \n\t"\ |
951 |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
952 |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
953 |
#rounder ", %%mm0 \n\t"\ |
954 |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
955 |
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
956 |
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ |
957 |
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
958 |
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
959 |
"paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ |
960 |
"movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ |
961 |
"pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
962 |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
963 |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
964 |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
965 |
"paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ |
966 |
"psrad $" #shift ", %%mm7 \n\t"\ |
967 |
"psrad $" #shift ", %%mm4 \n\t"\ |
968 |
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ |
969 |
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
970 |
"psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
971 |
"psrad $" #shift ", %%mm0 \n\t"\ |
972 |
"psrad $" #shift ", %%mm2 \n\t"\ |
973 |
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
974 |
"movd %%mm7, " #dst " \n\t"\ |
975 |
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
976 |
"movd %%mm0, 16+" #dst " \n\t"\ |
977 |
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
978 |
"movd %%mm2, 96+" #dst " \n\t"\ |
979 |
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
980 |
"movd %%mm4, 112+" #dst " \n\t"\ |
981 |
"movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ |
982 |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
983 |
"pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
984 |
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ |
985 |
"pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
986 |
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
987 |
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ |
988 |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
989 |
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ |
990 |
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
991 |
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ |
992 |
"psrad $" #shift ", %%mm2 \n\t"\ |
993 |
"psrad $" #shift ", %%mm5 \n\t"\ |
994 |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
995 |
"paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ |
996 |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
997 |
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
998 |
"psrad $" #shift ", %%mm6 \n\t"\ |
999 |
"psrad $" #shift ", %%mm4 \n\t"\ |
1000 |
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
1001 |
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
1002 |
"movd %%mm2, 32+" #dst " \n\t"\ |
1003 |
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ |
1004 |
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
1005 |
"movd %%mm6, 48+" #dst " \n\t"\ |
1006 |
"movd %%mm4, 64+" #dst " \n\t"\ |
1007 |
"movd %%mm5, 80+" #dst " \n\t" |
1008 |
|
1009 |
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
|
1010 |
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) |
1011 |
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) |
1012 |
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) |
1013 |
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) |
1014 |
"jmp 9f \n\t"
|
1015 |
|
1016 |
"#.balign 16 \n\t"\
|
1017 |
"3: \n\t"
|
1018 |
#undef IDCT
|
1019 |
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
1020 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
1021 |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
1022 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
1023 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
1024 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
1025 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
1026 |
#rounder ", %%mm4 \n\t"\ |
1027 |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
1028 |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
1029 |
#rounder ", %%mm0 \n\t"\ |
1030 |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
1031 |
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
1032 |
"movq 64(%2), %%mm3 \n\t"\
|
1033 |
"pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
1034 |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
1035 |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
1036 |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
1037 |
"psrad $" #shift ", %%mm7 \n\t"\ |
1038 |
"psrad $" #shift ", %%mm4 \n\t"\ |
1039 |
"movq %%mm0, %%mm1 \n\t" /* A1 a1 */\ |
1040 |
"paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
1041 |
"psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\ |
1042 |
"psrad $" #shift ", %%mm0 \n\t"\ |
1043 |
"psrad $" #shift ", %%mm1 \n\t"\ |
1044 |
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
1045 |
"movd %%mm7, " #dst " \n\t"\ |
1046 |
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
1047 |
"movd %%mm0, 16+" #dst " \n\t"\ |
1048 |
"packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\ |
1049 |
"movd %%mm1, 96+" #dst " \n\t"\ |
1050 |
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
1051 |
"movd %%mm4, 112+" #dst " \n\t"\ |
1052 |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
1053 |
"pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
1054 |
"pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
1055 |
"movq %%mm5, %%mm1 \n\t" /* A2 a2 */\ |
1056 |
"paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\ |
1057 |
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ |
1058 |
"psrad $" #shift ", %%mm1 \n\t"\ |
1059 |
"psrad $" #shift ", %%mm5 \n\t"\ |
1060 |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
1061 |
"paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
1062 |
"psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
1063 |
"psrad $" #shift ", %%mm6 \n\t"\ |
1064 |
"psrad $" #shift ", %%mm4 \n\t"\ |
1065 |
"packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\ |
1066 |
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
1067 |
"movd %%mm1, 32+" #dst " \n\t"\ |
1068 |
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ |
1069 |
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
1070 |
"movd %%mm6, 48+" #dst " \n\t"\ |
1071 |
"movd %%mm4, 64+" #dst " \n\t"\ |
1072 |
"movd %%mm5, 80+" #dst " \n\t" |
1073 |
|
1074 |
|
1075 |
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
|
1076 |
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) |
1077 |
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) |
1078 |
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) |
1079 |
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) |
1080 |
"jmp 9f \n\t"
|
1081 |
|
1082 |
"#.balign 16 \n\t"\
|
1083 |
"5: \n\t"
|
1084 |
#undef IDCT
|
1085 |
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
1086 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
1087 |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
1088 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
1089 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
1090 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
1091 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
1092 |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
1093 |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
1094 |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
1095 |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
1096 |
#rounder ", %%mm4 \n\t"\ |
1097 |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
1098 |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
1099 |
#rounder ", %%mm0 \n\t"\ |
1100 |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
1101 |
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
1102 |
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ |
1103 |
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ |
1104 |
"movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ |
1105 |
"movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\ |
1106 |
"movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ |
1107 |
"pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
1108 |
"movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ |
1109 |
"pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
1110 |
"movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ |
1111 |
"pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
1112 |
"pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
1113 |
#rounder ", %%mm1 \n\t"\ |
1114 |
"paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\ |
1115 |
"paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\ |
1116 |
#rounder ", %%mm2 \n\t"\ |
1117 |
"psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\ |
1118 |
"paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\ |
1119 |
"paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\ |
1120 |
"psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\ |
1121 |
"psrad $" #shift ", %%mm4 \n\t"\ |
1122 |
"psrad $" #shift ", %%mm7 \n\t"\ |
1123 |
"psrad $" #shift ", %%mm3 \n\t"\ |
1124 |
"packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\ |
1125 |
"movq %%mm4, " #dst " \n\t"\ |
1126 |
"psrad $" #shift ", %%mm0 \n\t"\ |
1127 |
"packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\ |
1128 |
"movq %%mm0, 16+" #dst " \n\t"\ |
1129 |
"movq %%mm0, 96+" #dst " \n\t"\ |
1130 |
"movq %%mm4, 112+" #dst " \n\t"\ |
1131 |
"psrad $" #shift ", %%mm5 \n\t"\ |
1132 |
"psrad $" #shift ", %%mm6 \n\t"\ |
1133 |
"psrad $" #shift ", %%mm2 \n\t"\ |
1134 |
"packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
1135 |
"movq %%mm5, 32+" #dst " \n\t"\ |
1136 |
"psrad $" #shift ", %%mm1 \n\t"\ |
1137 |
"packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
1138 |
"movq %%mm6, 48+" #dst " \n\t"\ |
1139 |
"movq %%mm6, 64+" #dst " \n\t"\ |
1140 |
"movq %%mm5, 80+" #dst " \n\t" |
1141 |
|
1142 |
|
1143 |
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
|
1144 |
IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) |
1145 |
//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
|
1146 |
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) |
1147 |
//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
|
1148 |
"jmp 9f \n\t"
|
1149 |
|
1150 |
|
1151 |
"#.balign 16 \n\t"\
|
1152 |
"1: \n\t"
|
1153 |
#undef IDCT
|
1154 |
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
1155 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
1156 |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
1157 |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
1158 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
1159 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
1160 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
1161 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
1162 |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
1163 |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
1164 |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
1165 |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
1166 |
#rounder ", %%mm4 \n\t"\ |
1167 |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
1168 |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
1169 |
#rounder ", %%mm0 \n\t"\ |
1170 |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
1171 |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
1172 |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
1173 |
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
1174 |
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ |
1175 |
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ |
1176 |
"movq 64(%2), %%mm1 \n\t"\
|
1177 |
"pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
1178 |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
1179 |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
1180 |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
1181 |
"psrad $" #shift ", %%mm7 \n\t"\ |
1182 |
"psrad $" #shift ", %%mm4 \n\t"\ |
1183 |
"movq %%mm0, %%mm3 \n\t" /* A1 a1 */\ |
1184 |
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
1185 |
"psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\ |
1186 |
"psrad $" #shift ", %%mm0 \n\t"\ |
1187 |
"psrad $" #shift ", %%mm3 \n\t"\ |
1188 |
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
1189 |
"movd %%mm7, " #dst " \n\t"\ |
1190 |
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
1191 |
"movd %%mm0, 16+" #dst " \n\t"\ |
1192 |
"packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\ |
1193 |
"movd %%mm3, 96+" #dst " \n\t"\ |
1194 |
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
1195 |
"movd %%mm4, 112+" #dst " \n\t"\ |
1196 |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
1197 |
"pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
1198 |
"pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
1199 |
"movq %%mm5, %%mm3 \n\t" /* A2 a2 */\ |
1200 |
"paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\ |
1201 |
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ |
1202 |
"psrad $" #shift ", %%mm3 \n\t"\ |
1203 |
"psrad $" #shift ", %%mm5 \n\t"\ |
1204 |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
1205 |
"paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
1206 |
"psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
1207 |
"psrad $" #shift ", %%mm6 \n\t"\ |
1208 |
"packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\ |
1209 |
"movd %%mm3, 32+" #dst " \n\t"\ |
1210 |
"psrad $" #shift ", %%mm4 \n\t"\ |
1211 |
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
1212 |
"movd %%mm6, 48+" #dst " \n\t"\ |
1213 |
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ |
1214 |
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
1215 |
"movd %%mm4, 64+" #dst " \n\t"\ |
1216 |
"movd %%mm5, 80+" #dst " \n\t" |
1217 |
|
1218 |
|
1219 |
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
|
1220 |
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) |
1221 |
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) |
1222 |
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) |
1223 |
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) |
1224 |
"jmp 9f \n\t"
|
1225 |
|
1226 |
|
1227 |
"#.balign 16 \n\t"
|
1228 |
"7: \n\t"
|
1229 |
#undef IDCT
|
1230 |
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
1231 |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
1232 |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
1233 |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
1234 |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
1235 |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
1236 |
#rounder ", %%mm4 \n\t"\ |
1237 |
#rounder ", %%mm0 \n\t"\ |
1238 |
"psrad $" #shift ", %%mm4 \n\t"\ |
1239 |
"psrad $" #shift ", %%mm0 \n\t"\ |
1240 |
"movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ |
1241 |
"movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ |
1242 |
"pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
1243 |
"movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ |
1244 |
"pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
1245 |
"movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ |
1246 |
#rounder ", %%mm1 \n\t"\ |
1247 |
#rounder ", %%mm2 \n\t"\ |
1248 |
"psrad $" #shift ", %%mm1 \n\t"\ |
1249 |
"packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\ |
1250 |
"movq %%mm4, " #dst " \n\t"\ |
1251 |
"psrad $" #shift ", %%mm2 \n\t"\ |
1252 |
"packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\ |
1253 |
"movq %%mm0, 16+" #dst " \n\t"\ |
1254 |
"movq %%mm0, 96+" #dst " \n\t"\ |
1255 |
"movq %%mm4, 112+" #dst " \n\t"\ |
1256 |
"movq %%mm0, 32+" #dst " \n\t"\ |
1257 |
"movq %%mm4, 48+" #dst " \n\t"\ |
1258 |
"movq %%mm4, 64+" #dst " \n\t"\ |
1259 |
"movq %%mm0, 80+" #dst " \n\t" |
1260 |
|
1261 |
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
|
1262 |
IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) |
1263 |
//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
|
1264 |
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) |
1265 |
//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
|
1266 |
|
1267 |
|
1268 |
#endif
|

/*
Input
 00 40 04 44 20 60 24 64
 10 30 14 34 50 70 54 74
 01 41 03 43 21 61 23 63
 11 31 13 33 51 71 53 73
 02 42 06 46 22 62 26 66
 12 32 16 36 52 72 56 76
 05 45 07 47 25 65 27 67
 15 35 17 37 55 75 57 77

Temp
 00 04 10 14 20 24 30 34
 40 44 50 54 60 64 70 74
 01 03 11 13 21 23 31 33
 41 43 51 53 61 63 71 73
 02 06 12 16 22 26 32 36
 42 46 52 56 62 66 72 76
 05 07 15 17 25 27 35 37
 45 47 55 57 65 67 75 77
*/

                "9: \n\t"
                :: "r" (block), "r" (temp), "r" (coeffs)
                : "%eax"
        );
}

void ff_simple_idct_mmx(int16_t *block)
{
        idct(block);
}

//FIXME merge add/put into the idct

void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
        idct(block);
        put_pixels_clamped_mmx(block, dest, line_size);
}
void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
        idct(block);
        add_pixels_clamped_mmx(block, dest, line_size);
}
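
/*
 * Usage sketch (editor's addition, not part of the original file): how a
 * caller might reconstruct a single 8x8 block with the "put" variant.  The
 * names `frame` and `stride` are illustrative only; the block is a 64-entry
 * DCTELEM array (ideally 8-byte aligned) with its coefficients laid out as
 * documented by the "Input" table above, and `line_size` is the destination
 * row pitch in bytes.
 */
#if 0
static void example_reconstruct_block(uint8_t *frame, int stride)
{
    DCTELEM __attribute__((aligned(8))) block[64] = {0};

    block[0] = 8 * 16;  /* DC-only block: decodes to a roughly flat 8x8 patch */
    ff_simple_idct_put_mmx(frame, stride, block);  /* writes 8 rows of 8 pixels */
}
#endif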