ffmpeg / libavcodec / i386 / mpegvideo_mmx.c @ 5509bffa
/*
 * The simplest mpeg encoder (well, it was the simplest!)
 * Copyright (c) 2000,2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * Optimized for ia32 cpus by Nick Kurshev <nickols_k@mail.ru>
 * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
 */

#include "../dsputil.h"
#include "../mpegvideo.h"
#include "../avcodec.h"
#include "mmx.h"

extern uint8_t zigzag_direct_noperm[64];
extern uint16_t inv_zigzag_direct16[64];

static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL;
static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;

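/* H.263 / MPEG-4 style intra dequantization.  A rough scalar equivalent of
 * the asm loop below (the DC coefficient is rescaled separately, outside
 * the loop):
 *
 *     for (i = 0; i <= nCoeffs; i++) {
 *         int level = block[i];
 *         if (level)
 *             block[i] = level < 0 ? level * qmul - qadd
 *                                  : level * qmul + qadd;
 *     }
 *
 * The asm walks the block with a negative byte offset so that a single
 * add/jng pair both advances the pointer and terminates the loop. */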
static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
                                          DCTELEM *block, int n, int qscale)
{
    long level, qmul, qadd, nCoeffs;

    qmul = qscale << 1;

    assert(s->block_last_index[n] >= 0 || s->h263_aic);

    if (!s->h263_aic) {
        if (n < 4)
            level = block[0] * s->y_dc_scale;
        else
            level = block[0] * s->c_dc_scale;
        qadd = (qscale - 1) | 1;
    } else {
        qadd = 0;
        level = block[0];
    }
    if (s->ac_pred)
        nCoeffs = 63;
    else
        nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
//printf("%d %d  ", qmul, qadd);
    asm volatile(
        "movd %1, %%mm6                 \n\t" //qmul
        "packssdw %%mm6, %%mm6          \n\t"
        "packssdw %%mm6, %%mm6          \n\t"
        "movd %2, %%mm5                 \n\t" //qadd
        "pxor %%mm7, %%mm7              \n\t"
        "packssdw %%mm5, %%mm5          \n\t"
        "packssdw %%mm5, %%mm5          \n\t"
        "psubw %%mm5, %%mm7             \n\t"
        "pxor %%mm4, %%mm4              \n\t"
        ".balign 16                     \n\t"
        "1:                             \n\t"
        "movq (%0, %3), %%mm0           \n\t"
        "movq 8(%0, %3), %%mm1          \n\t"

        "pmullw %%mm6, %%mm0            \n\t"
        "pmullw %%mm6, %%mm1            \n\t"

        "movq (%0, %3), %%mm2           \n\t"
        "movq 8(%0, %3), %%mm3          \n\t"

        "pcmpgtw %%mm4, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
        "pcmpgtw %%mm4, %%mm3           \n\t" // block[i] < 0 ? -1 : 0

        "pxor %%mm2, %%mm0              \n\t"
        "pxor %%mm3, %%mm1              \n\t"

        "paddw %%mm7, %%mm0             \n\t"
        "paddw %%mm7, %%mm1             \n\t"

        "pxor %%mm0, %%mm2              \n\t"
        "pxor %%mm1, %%mm3              \n\t"

        "pcmpeqw %%mm7, %%mm0           \n\t" // block[i] == 0 ? -1 : 0
        "pcmpeqw %%mm7, %%mm1           \n\t" // block[i] == 0 ? -1 : 0

        "pandn %%mm2, %%mm0             \n\t"
        "pandn %%mm3, %%mm1             \n\t"

        "movq %%mm0, (%0, %3)           \n\t"
        "movq %%mm1, 8(%0, %3)          \n\t"

        "add $16, %3                    \n\t"
        "jng 1b                         \n\t"
        ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs))
        : "memory"
    );
    block[0] = level;
}

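/* Same dequantization loop as the intra version above, but with no special
 * handling of the DC coefficient: every nonzero coefficient goes through
 * the level*qmul +/- qadd path. */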
static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
                                          DCTELEM *block, int n, int qscale)
{
    long qmul, qadd, nCoeffs;

    qmul = qscale << 1;
    qadd = (qscale - 1) | 1;

    assert(s->block_last_index[n] >= 0 || s->h263_aic);

    nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
//printf("%d %d  ", qmul, qadd);
    asm volatile(
        "movd %1, %%mm6                 \n\t" //qmul
        "packssdw %%mm6, %%mm6          \n\t"
        "packssdw %%mm6, %%mm6          \n\t"
        "movd %2, %%mm5                 \n\t" //qadd
        "pxor %%mm7, %%mm7              \n\t"
        "packssdw %%mm5, %%mm5          \n\t"
        "packssdw %%mm5, %%mm5          \n\t"
        "psubw %%mm5, %%mm7             \n\t"
        "pxor %%mm4, %%mm4              \n\t"
        ".balign 16                     \n\t"
        "1:                             \n\t"
        "movq (%0, %3), %%mm0           \n\t"
        "movq 8(%0, %3), %%mm1          \n\t"

        "pmullw %%mm6, %%mm0            \n\t"
        "pmullw %%mm6, %%mm1            \n\t"

        "movq (%0, %3), %%mm2           \n\t"
        "movq 8(%0, %3), %%mm3          \n\t"

        "pcmpgtw %%mm4, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
        "pcmpgtw %%mm4, %%mm3           \n\t" // block[i] < 0 ? -1 : 0

        "pxor %%mm2, %%mm0              \n\t"
        "pxor %%mm3, %%mm1              \n\t"

        "paddw %%mm7, %%mm0             \n\t"
        "paddw %%mm7, %%mm1             \n\t"

        "pxor %%mm0, %%mm2              \n\t"
        "pxor %%mm1, %%mm3              \n\t"

        "pcmpeqw %%mm7, %%mm0           \n\t" // block[i] == 0 ? -1 : 0
        "pcmpeqw %%mm7, %%mm1           \n\t" // block[i] == 0 ? -1 : 0

        "pandn %%mm2, %%mm0             \n\t"
        "pandn %%mm3, %%mm1             \n\t"

        "movq %%mm0, (%0, %3)           \n\t"
        "movq %%mm1, 8(%0, %3)          \n\t"

        "add $16, %3                    \n\t"
        "jng 1b                         \n\t"
        ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs))
        : "memory"
    );
}

/*
  NK:
  Note: looking at PARANOID:
  "enable all paranoid tests for rounding, overflows, etc..."

#ifdef PARANOID
                if (level < -2048 || level > 2047)
                    fprintf(stderr, "unquant error %d %d\n", i, level);
#endif
  We can assume that the result of the two multiplications can't be greater
  than 0xFFFF, i.e. fits in 16 bits, so we only use the PMULLW instruction
  here and can avoid a full-width multiplication.
  =====================================================
  Full formula for the multiplication of two integers
  which are represented as high:low word pairs:
  input:  value1 = high1:low1
          value2 = high2:low2
  output: value3 = value1*value2
          value3 = high3:low3 (on overflow: modulo 2^32 wrap-around)
  This means that for 0x123456 * 0x123456 the correct result is 0x14b66cb0ce4,
  but this algorithm only computes 0x66cb0ce4;
  this is limited by the 16-bit size of the operands.
  ---------------------------------
  tlow1 = high1*low2
  tlow2 = high2*low1
  tlow1 = tlow1 + tlow2
  high3:low3 = low1*low2
  high3 += tlow1
*/
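/* MPEG-1 intra dequantization.  For nonzero coefficients this is roughly
 *
 *     level    = (abs(block[i]) * quant_matrix[i] * qscale) >> 3;
 *     level    = (level - 1) | 1;                // force the result odd
 *     block[i] = sign(block[i]) * level;
 *
 * The "psubw %%mm7 / por %%mm7" pair (mm7 holds 1 in every word) performs
 * the oddification, zero coefficients are kept at zero by the
 * pcmpeqw/pandn mask, and the DC coefficient is rescaled outside the asm. */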
static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
                                           DCTELEM *block, int n, int qscale)
{
    long nCoeffs;
    const uint16_t *quant_matrix;
    int block0;

    assert(s->block_last_index[n] >= 0);

    nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]] + 1;

    if (n < 4)
        block0 = block[0] * s->y_dc_scale;
    else
        block0 = block[0] * s->c_dc_scale;
    /* XXX: only mpeg1 */
    quant_matrix = s->intra_matrix;
    asm volatile(
        "pcmpeqw %%mm7, %%mm7           \n\t"
        "psrlw $15, %%mm7               \n\t"
        "movd %2, %%mm6                 \n\t"
        "packssdw %%mm6, %%mm6          \n\t"
        "packssdw %%mm6, %%mm6          \n\t"
        "mov %3, %%"REG_a"              \n\t"
        ".balign 16                     \n\t"
        "1:                             \n\t"
        "movq (%0, %%"REG_a"), %%mm0    \n\t"
        "movq 8(%0, %%"REG_a"), %%mm1   \n\t"
        "movq (%1, %%"REG_a"), %%mm4    \n\t"
        "movq 8(%1, %%"REG_a"), %%mm5   \n\t"
        "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
        "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
        "pxor %%mm2, %%mm2              \n\t"
        "pxor %%mm3, %%mm3              \n\t"
        "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
        "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
        "pxor %%mm2, %%mm0              \n\t"
        "pxor %%mm3, %%mm1              \n\t"
        "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
        "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
        "pmullw %%mm4, %%mm0            \n\t" // abs(block[i])*q
        "pmullw %%mm5, %%mm1            \n\t" // abs(block[i])*q
        "pxor %%mm4, %%mm4              \n\t"
        "pxor %%mm5, %%mm5              \n\t" // FIXME slow
        "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
        "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
        "psraw $3, %%mm0                \n\t"
        "psraw $3, %%mm1                \n\t"
        "psubw %%mm7, %%mm0             \n\t"
        "psubw %%mm7, %%mm1             \n\t"
        "por %%mm7, %%mm0               \n\t"
        "por %%mm7, %%mm1               \n\t"
        "pxor %%mm2, %%mm0              \n\t"
        "pxor %%mm3, %%mm1              \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "pandn %%mm0, %%mm4             \n\t"
        "pandn %%mm1, %%mm5             \n\t"
        "movq %%mm4, (%0, %%"REG_a")    \n\t"
        "movq %%mm5, 8(%0, %%"REG_a")   \n\t"

        "add $16, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
        : "%"REG_a, "memory"
    );
    block[0] = block0;
}

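/* MPEG-1 inter dequantization, roughly
 *
 *     level    = ((abs(block[i]) * 2 + 1) * quant_matrix[i] * qscale) >> 4;
 *     level    = (level - 1) | 1;                // force the result odd
 *     block[i] = sign(block[i]) * level;
 *
 * for nonzero coefficients; zeros stay zero via the pcmpeqw/pandn mask. */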
static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
                                           DCTELEM *block, int n, int qscale)
{
    long nCoeffs;
    const uint16_t *quant_matrix;

    assert(s->block_last_index[n] >= 0);

    nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]] + 1;

    quant_matrix = s->inter_matrix;
    asm volatile(
        "pcmpeqw %%mm7, %%mm7           \n\t"
        "psrlw $15, %%mm7               \n\t"
        "movd %2, %%mm6                 \n\t"
        "packssdw %%mm6, %%mm6          \n\t"
        "packssdw %%mm6, %%mm6          \n\t"
        "mov %3, %%"REG_a"              \n\t"
        ".balign 16                     \n\t"
        "1:                             \n\t"
        "movq (%0, %%"REG_a"), %%mm0    \n\t"
        "movq 8(%0, %%"REG_a"), %%mm1   \n\t"
        "movq (%1, %%"REG_a"), %%mm4    \n\t"
        "movq 8(%1, %%"REG_a"), %%mm5   \n\t"
        "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
        "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
        "pxor %%mm2, %%mm2              \n\t"
        "pxor %%mm3, %%mm3              \n\t"
        "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
        "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
        "pxor %%mm2, %%mm0              \n\t"
        "pxor %%mm3, %%mm1              \n\t"
        "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
        "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
        "paddw %%mm0, %%mm0             \n\t" // abs(block[i])*2
        "paddw %%mm1, %%mm1             \n\t" // abs(block[i])*2
        "paddw %%mm7, %%mm0             \n\t" // abs(block[i])*2 + 1
        "paddw %%mm7, %%mm1             \n\t" // abs(block[i])*2 + 1
        "pmullw %%mm4, %%mm0            \n\t" // (abs(block[i])*2 + 1)*q
        "pmullw %%mm5, %%mm1            \n\t" // (abs(block[i])*2 + 1)*q
        "pxor %%mm4, %%mm4              \n\t"
        "pxor %%mm5, %%mm5              \n\t" // FIXME slow
        "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
        "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
        "psraw $4, %%mm0                \n\t"
        "psraw $4, %%mm1                \n\t"
        "psubw %%mm7, %%mm0             \n\t"
        "psubw %%mm7, %%mm1             \n\t"
        "por %%mm7, %%mm0               \n\t"
        "por %%mm7, %%mm1               \n\t"
        "pxor %%mm2, %%mm0              \n\t"
        "pxor %%mm3, %%mm1              \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "pandn %%mm0, %%mm4             \n\t"
        "pandn %%mm1, %%mm5             \n\t"
        "movq %%mm4, (%0, %%"REG_a")    \n\t"
        "movq %%mm5, 8(%0, %%"REG_a")   \n\t"

        "add $16, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
        : "%"REG_a, "memory"
    );
}

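/* MPEG-2 intra dequantization: like the MPEG-1 intra version above but
 * without the oddification step, i.e. roughly
 *     block[i] = sign(block[i]) * ((abs(block[i]) * quant_matrix[i] * qscale) >> 3)
 * for nonzero coefficients, with the DC coefficient rescaled outside the asm. */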
static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
                                           DCTELEM *block, int n, int qscale)
{
    long nCoeffs;
    const uint16_t *quant_matrix;
    int block0;

    assert(s->block_last_index[n] >= 0);

    if (s->alternate_scan) nCoeffs = 63; //FIXME
    else nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]];

    if (n < 4)
        block0 = block[0] * s->y_dc_scale;
    else
        block0 = block[0] * s->c_dc_scale;
    quant_matrix = s->intra_matrix;
    asm volatile(
        "pcmpeqw %%mm7, %%mm7           \n\t"
        "psrlw $15, %%mm7               \n\t"
        "movd %2, %%mm6                 \n\t"
        "packssdw %%mm6, %%mm6          \n\t"
        "packssdw %%mm6, %%mm6          \n\t"
        "mov %3, %%"REG_a"              \n\t"
        ".balign 16                     \n\t"
        "1:                             \n\t"
        "movq (%0, %%"REG_a"), %%mm0    \n\t"
        "movq 8(%0, %%"REG_a"), %%mm1   \n\t"
        "movq (%1, %%"REG_a"), %%mm4    \n\t"
        "movq 8(%1, %%"REG_a"), %%mm5   \n\t"
        "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
        "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
        "pxor %%mm2, %%mm2              \n\t"
        "pxor %%mm3, %%mm3              \n\t"
        "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
        "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
        "pxor %%mm2, %%mm0              \n\t"
        "pxor %%mm3, %%mm1              \n\t"
        "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
        "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
        "pmullw %%mm4, %%mm0            \n\t" // abs(block[i])*q
        "pmullw %%mm5, %%mm1            \n\t" // abs(block[i])*q
        "pxor %%mm4, %%mm4              \n\t"
        "pxor %%mm5, %%mm5              \n\t" // FIXME slow
        "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
        "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
        "psraw $3, %%mm0                \n\t"
        "psraw $3, %%mm1                \n\t"
        "pxor %%mm2, %%mm0              \n\t"
        "pxor %%mm3, %%mm1              \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "pandn %%mm0, %%mm4             \n\t"
        "pandn %%mm1, %%mm5             \n\t"
        "movq %%mm4, (%0, %%"REG_a")    \n\t"
        "movq %%mm5, 8(%0, %%"REG_a")   \n\t"

        "add $16, %%"REG_a"             \n\t"
        "jng 1b                         \n\t"
        ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
        : "%"REG_a, "memory"
    );
    block[0] = block0;
    //Note, we don't do mismatch control for intra as errors cannot accumulate
}

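/* MPEG-2 inter dequantization, roughly
 *     block[i] = sign(block[i]) * (((abs(block[i]) * 2 + 1) * quant_matrix[i] * qscale) >> 4)
 * Here the "+1" is applied after the multiply by adding q itself
 * (paddw %%mm4/%%mm5).  mm7 additionally accumulates an XOR of all written
 * coefficients so that the tail of the asm can flip the LSB of the last
 * coefficient when needed (MPEG-2 mismatch control, which keeps the sum of
 * the coefficients odd). */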
static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
                                           DCTELEM *block, int n, int qscale)
{
    long nCoeffs;
    const uint16_t *quant_matrix;

    assert(s->block_last_index[n] >= 0);

    if (s->alternate_scan) nCoeffs = 63; //FIXME
    else nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]];

    quant_matrix = s->inter_matrix;
    asm volatile(
        "pcmpeqw %%mm7, %%mm7           \n\t"
        "psrlq $48, %%mm7               \n\t"
        "movd %2, %%mm6                 \n\t"
        "packssdw %%mm6, %%mm6          \n\t"
        "packssdw %%mm6, %%mm6          \n\t"
        "mov %3, %%"REG_a"              \n\t"
        ".balign 16                     \n\t"
        "1:                             \n\t"
        "movq (%0, %%"REG_a"), %%mm0    \n\t"
        "movq 8(%0, %%"REG_a"), %%mm1   \n\t"
        "movq (%1, %%"REG_a"), %%mm4    \n\t"
        "movq 8(%1, %%"REG_a"), %%mm5   \n\t"
        "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
        "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
        "pxor %%mm2, %%mm2              \n\t"
        "pxor %%mm3, %%mm3              \n\t"
        "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
        "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
        "pxor %%mm2, %%mm0              \n\t"
        "pxor %%mm3, %%mm1              \n\t"
        "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
        "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
        "paddw %%mm0, %%mm0             \n\t" // abs(block[i])*2
        "paddw %%mm1, %%mm1             \n\t" // abs(block[i])*2
        "pmullw %%mm4, %%mm0            \n\t" // abs(block[i])*2*q
        "pmullw %%mm5, %%mm1            \n\t" // abs(block[i])*2*q
        "paddw %%mm4, %%mm0             \n\t" // (abs(block[i])*2 + 1)*q
        "paddw %%mm5, %%mm1             \n\t" // (abs(block[i])*2 + 1)*q
        "pxor %%mm4, %%mm4              \n\t"
        "pxor %%mm5, %%mm5              \n\t" // FIXME slow
        "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
        "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
        "psrlw $4, %%mm0                \n\t"
        "psrlw $4, %%mm1                \n\t"
        "pxor %%mm2, %%mm0              \n\t"
        "pxor %%mm3, %%mm1              \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "pandn %%mm0, %%mm4             \n\t"
        "pandn %%mm1, %%mm5             \n\t"
        "pxor %%mm4, %%mm7              \n\t"
        "pxor %%mm5, %%mm7              \n\t"
        "movq %%mm4, (%0, %%"REG_a")    \n\t"
        "movq %%mm5, 8(%0, %%"REG_a")   \n\t"

        "add $16, %%"REG_a"             \n\t"
        "jng 1b                         \n\t"
        "movd 124(%0, %3), %%mm0        \n\t"
        "movq %%mm7, %%mm6              \n\t"
        "psrlq $32, %%mm7               \n\t"
        "pxor %%mm6, %%mm7              \n\t"
        "movq %%mm7, %%mm6              \n\t"
        "psrlq $16, %%mm7               \n\t"
        "pxor %%mm6, %%mm7              \n\t"
        "pslld $31, %%mm7               \n\t"
        "psrlq $15, %%mm7               \n\t"
        "pxor %%mm7, %%mm0              \n\t"
        "movd %%mm0, 124(%0, %3)        \n\t"

        ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs)
        : "%"REG_a, "memory"
    );
}

/* draw the edges of width 'w' of an image of size width, height
   this mmx version can only handle w==8 || w==16 */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    /* left and right */
    ptr = buf;
    if (w == 8)
    {
        asm volatile(
            "1:                             \n\t"
            "movd (%0), %%mm0               \n\t"
            "punpcklbw %%mm0, %%mm0         \n\t"
            "punpcklwd %%mm0, %%mm0         \n\t"
            "punpckldq %%mm0, %%mm0         \n\t"
            "movq %%mm0, -8(%0)             \n\t"
            "movq -8(%0, %2), %%mm1         \n\t"
            "punpckhbw %%mm1, %%mm1         \n\t"
            "punpckhwd %%mm1, %%mm1         \n\t"
            "punpckhdq %%mm1, %%mm1         \n\t"
            "movq %%mm1, (%0, %2)           \n\t"
            "add %1, %0                     \n\t"
            "cmp %3, %0                     \n\t"
            " jb 1b                         \n\t"
            : "+r" (ptr)
            : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
        );
    }
    else
    {
        asm volatile(
            "1:                             \n\t"
            "movd (%0), %%mm0               \n\t"
            "punpcklbw %%mm0, %%mm0         \n\t"
            "punpcklwd %%mm0, %%mm0         \n\t"
            "punpckldq %%mm0, %%mm0         \n\t"
            "movq %%mm0, -8(%0)             \n\t"
            "movq %%mm0, -16(%0)            \n\t"
            "movq -8(%0, %2), %%mm1         \n\t"
            "punpckhbw %%mm1, %%mm1         \n\t"
            "punpckhwd %%mm1, %%mm1         \n\t"
            "punpckhdq %%mm1, %%mm1         \n\t"
            "movq %%mm1, (%0, %2)           \n\t"
            "movq %%mm1, 8(%0, %2)          \n\t"
            "add %1, %0                     \n\t"
            "cmp %3, %0                     \n\t"
            " jb 1b                         \n\t"
            : "+r" (ptr)
            : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
        );
    }

    for (i = 0; i < w; i += 4) {
        /* top and bottom (and hopefully also the corners) */
        ptr = buf - (i + 1) * wrap - w;
        asm volatile(
            "1:                             \n\t"
            "movq (%1, %0), %%mm0           \n\t"
            "movq %%mm0, (%0)               \n\t"
            "movq %%mm0, (%0, %2)           \n\t"
            "movq %%mm0, (%0, %2, 2)        \n\t"
            "movq %%mm0, (%0, %3)           \n\t"
            "add $8, %0                     \n\t"
            "cmp %4, %0                     \n\t"
            " jb 1b                         \n\t"
            : "+r" (ptr)
            : "r" ((long)buf - (long)ptr - w), "r" ((long)-wrap), "r" ((long)-wrap*3), "r" (ptr+width+2*w)
        );
        ptr = last_line + (i + 1) * wrap - w;
        asm volatile(
            "1:                             \n\t"
            "movq (%1, %0), %%mm0           \n\t"
            "movq %%mm0, (%0)               \n\t"
            "movq %%mm0, (%0, %2)           \n\t"
            "movq %%mm0, (%0, %2, 2)        \n\t"
            "movq %%mm0, (%0, %3)           \n\t"
            "add $8, %0                     \n\t"
            "cmp %4, %0                     \n\t"
            " jb 1b                         \n\t"
            : "+r" (ptr)
            : "r" ((long)last_line - (long)ptr - w), "r" ((long)wrap), "r" ((long)wrap*3), "r" (ptr+width+2*w)
        );
    }
}

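/* DCT-domain noise reduction: for every coefficient, abs(block[i]) is
 * accumulated into dct_error_sum[] and the coefficient itself is shrunk
 * toward zero by dct_offset[i] (psubusw saturates, so small coefficients
 * become exactly zero). */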
static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){
    const int intra= s->mb_intra;
    int *sum= s->dct_error_sum[intra];
    uint16_t *offset= s->dct_offset[intra];

    s->dct_count[intra]++;

    asm volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "1:                             \n\t"
        "pxor %%mm0, %%mm0              \n\t"
        "pxor %%mm1, %%mm1              \n\t"
        "movq (%0), %%mm2               \n\t"
        "movq 8(%0), %%mm3              \n\t"
        "pcmpgtw %%mm2, %%mm0           \n\t"
        "pcmpgtw %%mm3, %%mm1           \n\t"
        "pxor %%mm0, %%mm2              \n\t"
        "pxor %%mm1, %%mm3              \n\t"
        "psubw %%mm0, %%mm2             \n\t"
        "psubw %%mm1, %%mm3             \n\t"
        "movq %%mm2, %%mm4              \n\t"
        "movq %%mm3, %%mm5              \n\t"
        "psubusw (%2), %%mm2            \n\t"
        "psubusw 8(%2), %%mm3           \n\t"
        "pxor %%mm0, %%mm2              \n\t"
        "pxor %%mm1, %%mm3              \n\t"
        "psubw %%mm0, %%mm2             \n\t"
        "psubw %%mm1, %%mm3             \n\t"
        "movq %%mm2, (%0)               \n\t"
        "movq %%mm3, 8(%0)              \n\t"
        "movq %%mm4, %%mm2              \n\t"
        "movq %%mm5, %%mm3              \n\t"
        "punpcklwd %%mm7, %%mm4         \n\t"
        "punpckhwd %%mm7, %%mm2         \n\t"
        "punpcklwd %%mm7, %%mm5         \n\t"
        "punpckhwd %%mm7, %%mm3         \n\t"
        "paddd (%1), %%mm4              \n\t"
        "paddd 8(%1), %%mm2             \n\t"
        "paddd 16(%1), %%mm5            \n\t"
        "paddd 24(%1), %%mm3            \n\t"
        "movq %%mm4, (%1)               \n\t"
        "movq %%mm2, 8(%1)              \n\t"
        "movq %%mm5, 16(%1)             \n\t"
        "movq %%mm3, 24(%1)             \n\t"
        "add $16, %0                    \n\t"
        "add $32, %1                    \n\t"
        "add $16, %2                    \n\t"
        "cmp %3, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (block), "+r" (sum), "+r" (offset)
        : "r"(block+64)
    );
}

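/* SSE2 version of the above: the same logic, but processing 16
 * coefficients per iteration in the xmm registers. */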
static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){
    const int intra= s->mb_intra;
    int *sum= s->dct_error_sum[intra];
    uint16_t *offset= s->dct_offset[intra];

    s->dct_count[intra]++;

    asm volatile(
        "pxor %%xmm7, %%xmm7            \n\t"
        "1:                             \n\t"
        "pxor %%xmm0, %%xmm0            \n\t"
        "pxor %%xmm1, %%xmm1            \n\t"
        "movdqa (%0), %%xmm2            \n\t"
        "movdqa 16(%0), %%xmm3          \n\t"
        "pcmpgtw %%xmm2, %%xmm0         \n\t"
        "pcmpgtw %%xmm3, %%xmm1         \n\t"
        "pxor %%xmm0, %%xmm2            \n\t"
        "pxor %%xmm1, %%xmm3            \n\t"
        "psubw %%xmm0, %%xmm2           \n\t"
        "psubw %%xmm1, %%xmm3           \n\t"
        "movdqa %%xmm2, %%xmm4          \n\t"
        "movdqa %%xmm3, %%xmm5          \n\t"
        "psubusw (%2), %%xmm2           \n\t"
        "psubusw 16(%2), %%xmm3         \n\t"
        "pxor %%xmm0, %%xmm2            \n\t"
        "pxor %%xmm1, %%xmm3            \n\t"
        "psubw %%xmm0, %%xmm2           \n\t"
        "psubw %%xmm1, %%xmm3           \n\t"
        "movdqa %%xmm2, (%0)            \n\t"
        "movdqa %%xmm3, 16(%0)          \n\t"
        "movdqa %%xmm4, %%xmm6          \n\t"
        "movdqa %%xmm5, %%xmm0          \n\t"
        "punpcklwd %%xmm7, %%xmm4       \n\t"
        "punpckhwd %%xmm7, %%xmm6       \n\t"
        "punpcklwd %%xmm7, %%xmm5       \n\t"
        "punpckhwd %%xmm7, %%xmm0       \n\t"
        "paddd (%1), %%xmm4             \n\t"
        "paddd 16(%1), %%xmm6           \n\t"
        "paddd 32(%1), %%xmm5           \n\t"
        "paddd 48(%1), %%xmm0           \n\t"
        "movdqa %%xmm4, (%1)            \n\t"
        "movdqa %%xmm6, 16(%1)          \n\t"
        "movdqa %%xmm5, 32(%1)          \n\t"
        "movdqa %%xmm0, 48(%1)          \n\t"
        "add $32, %0                    \n\t"
        "add $64, %1                    \n\t"
        "add $32, %2                    \n\t"
        "cmp %3, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (block), "+r" (sum), "+r" (offset)
        : "r"(block+64)
    );
}

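/* mpegvideo_mmx_template.c is included three times below; the RENAME
 * macros give each pass its own suffix, producing the MMX, MMX2 and SSE2
 * variants of dct_quantize that MPV_common_init_mmx() selects between. */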
#undef HAVE_MMX2
#define RENAME(a) a ## _MMX
#define RENAMEl(a) a ## _mmx
#include "mpegvideo_mmx_template.c"

#define HAVE_MMX2
#undef RENAME
#undef RENAMEl
#define RENAME(a) a ## _MMX2
#define RENAMEl(a) a ## _mmx2
#include "mpegvideo_mmx_template.c"

#undef RENAME
#undef RENAMEl
#define RENAME(a) a ## _SSE2
#define RENAMEl(a) a ## _sse2
#include "mpegvideo_mmx_template.c"

void MPV_common_init_mmx(MpegEncContext *s)
{
    if (mm_flags & MM_MMX) {
        const int dct_algo = s->avctx->dct_algo;

        s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
        s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
        s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
        s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
        s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
        s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;

        draw_edges = draw_edges_mmx;

        if (mm_flags & MM_SSE2) {
            s->denoise_dct = denoise_dct_sse2;
        } else {
            s->denoise_dct = denoise_dct_mmx;
        }

        if (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX) {
            if (mm_flags & MM_SSE2) {
                s->dct_quantize = dct_quantize_SSE2;
            } else if (mm_flags & MM_MMXEXT) {
                s->dct_quantize = dct_quantize_MMX2;
            } else {
                s->dct_quantize = dct_quantize_MMX;
            }
        }
    }
}