ffmpeg / libavcodec / i386 / mpegvideo_mmx.c @ 182f56cb
History  View  Annotate  Download (31.1 KB)
1 
/*


2 
* The simplest mpeg encoder (well, it was the simplest!)

3 
* Copyright (c) 2000,2001 Fabrice Bellard.

4 
*

5 
* Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru>

6 
* h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>

7 
*

8 
* This file is part of FFmpeg.

9 
*

10 
* FFmpeg is free software; you can redistribute it and/or

11 
* modify it under the terms of the GNU Lesser General Public

12 
* License as published by the Free Software Foundation; either

13 
* version 2.1 of the License, or (at your option) any later version.

14 
*

15 
* FFmpeg is distributed in the hope that it will be useful,

16 
* but WITHOUT ANY WARRANTY; without even the implied warranty of

17 
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

18 
* Lesser General Public License for more details.

19 
*

20 
* You should have received a copy of the GNU Lesser General Public

21 
* License along with FFmpeg; if not, write to the Free Software

22 
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

23 
*/

24  
25 
#include "dsputil.h" 
26 
#include "dsputil_mmx.h" 
27 
#include "mpegvideo.h" 
28 
#include "avcodec.h" 
29 
#include "x86_cpu.h" 
30  
31 
extern uint16_t inv_zigzag_direct16[64]; 
32  
33  
34 
static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, 
35 
DCTELEM *block, int n, int qscale) 
36 
{ 
37 
long level, qmul, qadd, nCoeffs;

38  
39 
qmul = qscale << 1;

40  
41 
assert(s>block_last_index[n]>=0  s>h263_aic);

42  
43 
if (!s>h263_aic) {

44 
if (n < 4) 
45 
level = block[0] * s>y_dc_scale;

46 
else

47 
level = block[0] * s>c_dc_scale;

48 
qadd = (qscale  1)  1; 
49 
}else{

50 
qadd = 0;

51 
level= block[0];

52 
} 
53 
if(s>ac_pred)

54 
nCoeffs=63;

55 
else

56 
nCoeffs= s>inter_scantable.raster_end[ s>block_last_index[n] ]; 
57 
//printf("%d %d ", qmul, qadd);

58 
asm volatile( 
59 
"movd %1, %%mm6 \n\t" //qmul 
60 
"packssdw %%mm6, %%mm6 \n\t"

61 
"packssdw %%mm6, %%mm6 \n\t"

62 
"movd %2, %%mm5 \n\t" //qadd 
63 
"pxor %%mm7, %%mm7 \n\t"

64 
"packssdw %%mm5, %%mm5 \n\t"

65 
"packssdw %%mm5, %%mm5 \n\t"

66 
"psubw %%mm5, %%mm7 \n\t"

67 
"pxor %%mm4, %%mm4 \n\t"

68 
ASMALIGN(4)

69 
"1: \n\t"

70 
"movq (%0, %3), %%mm0 \n\t"

71 
"movq 8(%0, %3), %%mm1 \n\t"

72  
73 
"pmullw %%mm6, %%mm0 \n\t"

74 
"pmullw %%mm6, %%mm1 \n\t"

75  
76 
"movq (%0, %3), %%mm2 \n\t"

77 
"movq 8(%0, %3), %%mm3 \n\t"

78  
79 
"pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? 1 : 0 
80 
"pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? 1 : 0 
81  
82 
"pxor %%mm2, %%mm0 \n\t"

83 
"pxor %%mm3, %%mm1 \n\t"

84  
85 
"paddw %%mm7, %%mm0 \n\t"

86 
"paddw %%mm7, %%mm1 \n\t"

87  
88 
"pxor %%mm0, %%mm2 \n\t"

89 
"pxor %%mm1, %%mm3 \n\t"

90  
91 
"pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? 1 : 0 
92 
"pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? 1 : 0 
93  
94 
"pandn %%mm2, %%mm0 \n\t"

95 
"pandn %%mm3, %%mm1 \n\t"

96  
97 
"movq %%mm0, (%0, %3) \n\t"

98 
"movq %%mm1, 8(%0, %3) \n\t"

99  
100 
"add $16, %3 \n\t"

101 
"jng 1b \n\t"

102 
::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(nCoeffs)) 
103 
: "memory"

104 
); 
105 
block[0]= level;

106 
} 
107  
108  
109 
static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, 
110 
DCTELEM *block, int n, int qscale) 
111 
{ 
112 
long qmul, qadd, nCoeffs;

113  
114 
qmul = qscale << 1;

115 
qadd = (qscale  1)  1; 
116  
117 
assert(s>block_last_index[n]>=0  s>h263_aic);

118  
119 
nCoeffs= s>inter_scantable.raster_end[ s>block_last_index[n] ]; 
120 
//printf("%d %d ", qmul, qadd);

121 
asm volatile( 
122 
"movd %1, %%mm6 \n\t" //qmul 
123 
"packssdw %%mm6, %%mm6 \n\t"

124 
"packssdw %%mm6, %%mm6 \n\t"

125 
"movd %2, %%mm5 \n\t" //qadd 
126 
"pxor %%mm7, %%mm7 \n\t"

127 
"packssdw %%mm5, %%mm5 \n\t"

128 
"packssdw %%mm5, %%mm5 \n\t"

129 
"psubw %%mm5, %%mm7 \n\t"

130 
"pxor %%mm4, %%mm4 \n\t"

131 
ASMALIGN(4)

132 
"1: \n\t"

133 
"movq (%0, %3), %%mm0 \n\t"

134 
"movq 8(%0, %3), %%mm1 \n\t"

135  
136 
"pmullw %%mm6, %%mm0 \n\t"

137 
"pmullw %%mm6, %%mm1 \n\t"

138  
139 
"movq (%0, %3), %%mm2 \n\t"

140 
"movq 8(%0, %3), %%mm3 \n\t"

141  
142 
"pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? 1 : 0 
143 
"pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? 1 : 0 
144  
145 
"pxor %%mm2, %%mm0 \n\t"

146 
"pxor %%mm3, %%mm1 \n\t"

147  
148 
"paddw %%mm7, %%mm0 \n\t"

149 
"paddw %%mm7, %%mm1 \n\t"

150  
151 
"pxor %%mm0, %%mm2 \n\t"

152 
"pxor %%mm1, %%mm3 \n\t"

153  
154 
"pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? 1 : 0 
155 
"pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? 1 : 0 
156  
157 
"pandn %%mm2, %%mm0 \n\t"

158 
"pandn %%mm3, %%mm1 \n\t"

159  
160 
"movq %%mm0, (%0, %3) \n\t"

161 
"movq %%mm1, 8(%0, %3) \n\t"

162  
163 
"add $16, %3 \n\t"

164 
"jng 1b \n\t"

165 
::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(nCoeffs)) 
166 
: "memory"

167 
); 
168 
} 
169  
170  
171 
/*

172 
NK:

173 
Note: looking at PARANOID:

174 
"enable all paranoid tests for rounding, overflows, etc..."

175 

176 
#ifdef PARANOID

177 
if (level < -2048 || level > 2047)

178 
fprintf(stderr, "unquant error %d %d\n", i, level);

179 
#endif

180 
We can suppose that the result of the two multiplications can't be greater than 0xFFFF

181 
i.e. is 16bit, so we use here only PMULLW instruction and can avoid

182 
a complex multiplication.

183 
=====================================================

184 
Full formula for multiplication of 2 integer numbers

185 
which are represent as high:low words:

186 
input: value1 = high1:low1

187 
value2 = high2:low2

188 
output: value3 = value1*value2

189 
value3=high3:low3 (on overflow: modulus 2^32 wraparound)

190 
this mean that for 0x123456 * 0x123456 correct result is 0x766cb0ce4

191 
but this algorithm will compute only 0x66cb0ce4

192 
this limited by 16bit size of operands

193 


194 
tlow1 = high1*low2

195 
tlow2 = high2*low1

196 
tlow1 = tlow1 + tlow2

197 
high3:low3 = low1*low2

198 
high3 += tlow1

199 
*/

200 
static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, 
201 
DCTELEM *block, int n, int qscale) 
202 
{ 
203 
long nCoeffs;

204 
const uint16_t *quant_matrix;

205 
int block0;

206  
207 
assert(s>block_last_index[n]>=0);

208  
209 
nCoeffs= s>intra_scantable.raster_end[ s>block_last_index[n] ]+1;

210  
211 
if (n < 4) 
212 
block0 = block[0] * s>y_dc_scale;

213 
else

214 
block0 = block[0] * s>c_dc_scale;

215 
/* XXX: only mpeg1 */

216 
quant_matrix = s>intra_matrix; 
217 
asm volatile( 
218 
"pcmpeqw %%mm7, %%mm7 \n\t"

219 
"psrlw $15, %%mm7 \n\t"

220 
"movd %2, %%mm6 \n\t"

221 
"packssdw %%mm6, %%mm6 \n\t"

222 
"packssdw %%mm6, %%mm6 \n\t"

223 
"mov %3, %%"REG_a" \n\t" 
224 
ASMALIGN(4)

225 
"1: \n\t"

226 
"movq (%0, %%"REG_a"), %%mm0 \n\t" 
227 
"movq 8(%0, %%"REG_a"), %%mm1 \n\t" 
228 
"movq (%1, %%"REG_a"), %%mm4 \n\t" 
229 
"movq 8(%1, %%"REG_a"), %%mm5 \n\t" 
230 
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 
231 
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 
232 
"pxor %%mm2, %%mm2 \n\t"

233 
"pxor %%mm3, %%mm3 \n\t"

234 
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? 1 : 0 
235 
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? 1 : 0 
236 
"pxor %%mm2, %%mm0 \n\t"

237 
"pxor %%mm3, %%mm1 \n\t"

238 
"psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 
239 
"psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 
240 
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q 
241 
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q 
242 
"pxor %%mm4, %%mm4 \n\t"

243 
"pxor %%mm5, %%mm5 \n\t" // FIXME slow 
244 
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? 1 : 0 
245 
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 1 : 0 
246 
"psraw $3, %%mm0 \n\t"

247 
"psraw $3, %%mm1 \n\t"

248 
"psubw %%mm7, %%mm0 \n\t"

249 
"psubw %%mm7, %%mm1 \n\t"

250 
"por %%mm7, %%mm0 \n\t"

251 
"por %%mm7, %%mm1 \n\t"

252 
"pxor %%mm2, %%mm0 \n\t"

253 
"pxor %%mm3, %%mm1 \n\t"

254 
"psubw %%mm2, %%mm0 \n\t"

255 
"psubw %%mm3, %%mm1 \n\t"

256 
"pandn %%mm0, %%mm4 \n\t"

257 
"pandn %%mm1, %%mm5 \n\t"

258 
"movq %%mm4, (%0, %%"REG_a") \n\t" 
259 
"movq %%mm5, 8(%0, %%"REG_a") \n\t" 
260  
261 
"add $16, %%"REG_a" \n\t" 
262 
"js 1b \n\t"

263 
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (2*nCoeffs) 
264 
: "%"REG_a, "memory" 
265 
); 
266 
block[0]= block0;

267 
} 
268  
269 
static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, 
270 
DCTELEM *block, int n, int qscale) 
271 
{ 
272 
long nCoeffs;

273 
const uint16_t *quant_matrix;

274  
275 
assert(s>block_last_index[n]>=0);

276  
277 
nCoeffs= s>intra_scantable.raster_end[ s>block_last_index[n] ]+1;

278  
279 
quant_matrix = s>inter_matrix; 
280 
asm volatile( 
281 
"pcmpeqw %%mm7, %%mm7 \n\t"

282 
"psrlw $15, %%mm7 \n\t"

283 
"movd %2, %%mm6 \n\t"

284 
"packssdw %%mm6, %%mm6 \n\t"

285 
"packssdw %%mm6, %%mm6 \n\t"

286 
"mov %3, %%"REG_a" \n\t" 
287 
ASMALIGN(4)

288 
"1: \n\t"

289 
"movq (%0, %%"REG_a"), %%mm0 \n\t" 
290 
"movq 8(%0, %%"REG_a"), %%mm1 \n\t" 
291 
"movq (%1, %%"REG_a"), %%mm4 \n\t" 
292 
"movq 8(%1, %%"REG_a"), %%mm5 \n\t" 
293 
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 
294 
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 
295 
"pxor %%mm2, %%mm2 \n\t"

296 
"pxor %%mm3, %%mm3 \n\t"

297 
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? 1 : 0 
298 
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? 1 : 0 
299 
"pxor %%mm2, %%mm0 \n\t"

300 
"pxor %%mm3, %%mm1 \n\t"

301 
"psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 
302 
"psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 
303 
"paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 
304 
"paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 
305 
"paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1 
306 
"paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1 
307 
"pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q 
308 
"pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q 
309 
"pxor %%mm4, %%mm4 \n\t"

310 
"pxor %%mm5, %%mm5 \n\t" // FIXME slow 
311 
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? 1 : 0 
312 
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 1 : 0 
313 
"psraw $4, %%mm0 \n\t"

314 
"psraw $4, %%mm1 \n\t"

315 
"psubw %%mm7, %%mm0 \n\t"

316 
"psubw %%mm7, %%mm1 \n\t"

317 
"por %%mm7, %%mm0 \n\t"

318 
"por %%mm7, %%mm1 \n\t"

319 
"pxor %%mm2, %%mm0 \n\t"

320 
"pxor %%mm3, %%mm1 \n\t"

321 
"psubw %%mm2, %%mm0 \n\t"

322 
"psubw %%mm3, %%mm1 \n\t"

323 
"pandn %%mm0, %%mm4 \n\t"

324 
"pandn %%mm1, %%mm5 \n\t"

325 
"movq %%mm4, (%0, %%"REG_a") \n\t" 
326 
"movq %%mm5, 8(%0, %%"REG_a") \n\t" 
327  
328 
"add $16, %%"REG_a" \n\t" 
329 
"js 1b \n\t"

330 
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (2*nCoeffs) 
331 
: "%"REG_a, "memory" 
332 
); 
333 
} 
334  
335 
static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, 
336 
DCTELEM *block, int n, int qscale) 
337 
{ 
338 
long nCoeffs;

339 
const uint16_t *quant_matrix;

340 
int block0;

341  
342 
assert(s>block_last_index[n]>=0);

343  
344 
if(s>alternate_scan) nCoeffs= 63; //FIXME 
345 
else nCoeffs= s>intra_scantable.raster_end[ s>block_last_index[n] ];

346  
347 
if (n < 4) 
348 
block0 = block[0] * s>y_dc_scale;

349 
else

350 
block0 = block[0] * s>c_dc_scale;

351 
quant_matrix = s>intra_matrix; 
352 
asm volatile( 
353 
"pcmpeqw %%mm7, %%mm7 \n\t"

354 
"psrlw $15, %%mm7 \n\t"

355 
"movd %2, %%mm6 \n\t"

356 
"packssdw %%mm6, %%mm6 \n\t"

357 
"packssdw %%mm6, %%mm6 \n\t"

358 
"mov %3, %%"REG_a" \n\t" 
359 
ASMALIGN(4)

360 
"1: \n\t"

361 
"movq (%0, %%"REG_a"), %%mm0 \n\t" 
362 
"movq 8(%0, %%"REG_a"), %%mm1 \n\t" 
363 
"movq (%1, %%"REG_a"), %%mm4 \n\t" 
364 
"movq 8(%1, %%"REG_a"), %%mm5 \n\t" 
365 
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 
366 
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 
367 
"pxor %%mm2, %%mm2 \n\t"

368 
"pxor %%mm3, %%mm3 \n\t"

369 
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? 1 : 0 
370 
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? 1 : 0 
371 
"pxor %%mm2, %%mm0 \n\t"

372 
"pxor %%mm3, %%mm1 \n\t"

373 
"psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 
374 
"psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 
375 
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q 
376 
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q 
377 
"pxor %%mm4, %%mm4 \n\t"

378 
"pxor %%mm5, %%mm5 \n\t" // FIXME slow 
379 
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? 1 : 0 
380 
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 1 : 0 
381 
"psraw $3, %%mm0 \n\t"

382 
"psraw $3, %%mm1 \n\t"

383 
"pxor %%mm2, %%mm0 \n\t"

384 
"pxor %%mm3, %%mm1 \n\t"

385 
"psubw %%mm2, %%mm0 \n\t"

386 
"psubw %%mm3, %%mm1 \n\t"

387 
"pandn %%mm0, %%mm4 \n\t"

388 
"pandn %%mm1, %%mm5 \n\t"

389 
"movq %%mm4, (%0, %%"REG_a") \n\t" 
390 
"movq %%mm5, 8(%0, %%"REG_a") \n\t" 
391  
392 
"add $16, %%"REG_a" \n\t" 
393 
"jng 1b \n\t"

394 
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (2*nCoeffs) 
395 
: "%"REG_a, "memory" 
396 
); 
397 
block[0]= block0;

398 
//Note, we do not do mismatch control for intra as errors cannot accumulate

399 
} 
400  
401 
static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, 
402 
DCTELEM *block, int n, int qscale) 
403 
{ 
404 
long nCoeffs;

405 
const uint16_t *quant_matrix;

406  
407 
assert(s>block_last_index[n]>=0);

408  
409 
if(s>alternate_scan) nCoeffs= 63; //FIXME 
410 
else nCoeffs= s>intra_scantable.raster_end[ s>block_last_index[n] ];

411  
412 
quant_matrix = s>inter_matrix; 
413 
asm volatile( 
414 
"pcmpeqw %%mm7, %%mm7 \n\t"

415 
"psrlq $48, %%mm7 \n\t"

416 
"movd %2, %%mm6 \n\t"

417 
"packssdw %%mm6, %%mm6 \n\t"

418 
"packssdw %%mm6, %%mm6 \n\t"

419 
"mov %3, %%"REG_a" \n\t" 
420 
ASMALIGN(4)

421 
"1: \n\t"

422 
"movq (%0, %%"REG_a"), %%mm0 \n\t" 
423 
"movq 8(%0, %%"REG_a"), %%mm1 \n\t" 
424 
"movq (%1, %%"REG_a"), %%mm4 \n\t" 
425 
"movq 8(%1, %%"REG_a"), %%mm5 \n\t" 
426 
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 
427 
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 
428 
"pxor %%mm2, %%mm2 \n\t"

429 
"pxor %%mm3, %%mm3 \n\t"

430 
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? 1 : 0 
431 
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? 1 : 0 
432 
"pxor %%mm2, %%mm0 \n\t"

433 
"pxor %%mm3, %%mm1 \n\t"

434 
"psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 
435 
"psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 
436 
"paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 
437 
"paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 
438 
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q 
439 
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q 
440 
"paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q 
441 
"paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q 
442 
"pxor %%mm4, %%mm4 \n\t"

443 
"pxor %%mm5, %%mm5 \n\t" // FIXME slow 
444 
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? 1 : 0 
445 
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 1 : 0 
446 
"psrlw $4, %%mm0 \n\t"

447 
"psrlw $4, %%mm1 \n\t"

448 
"pxor %%mm2, %%mm0 \n\t"

449 
"pxor %%mm3, %%mm1 \n\t"

450 
"psubw %%mm2, %%mm0 \n\t"

451 
"psubw %%mm3, %%mm1 \n\t"

452 
"pandn %%mm0, %%mm4 \n\t"

453 
"pandn %%mm1, %%mm5 \n\t"

454 
"pxor %%mm4, %%mm7 \n\t"

455 
"pxor %%mm5, %%mm7 \n\t"

456 
"movq %%mm4, (%0, %%"REG_a") \n\t" 
457 
"movq %%mm5, 8(%0, %%"REG_a") \n\t" 
458  
459 
"add $16, %%"REG_a" \n\t" 
460 
"jng 1b \n\t"

461 
"movd 124(%0, %3), %%mm0 \n\t"

462 
"movq %%mm7, %%mm6 \n\t"

463 
"psrlq $32, %%mm7 \n\t"

464 
"pxor %%mm6, %%mm7 \n\t"

465 
"movq %%mm7, %%mm6 \n\t"

466 
"psrlq $16, %%mm7 \n\t"

467 
"pxor %%mm6, %%mm7 \n\t"

468 
"pslld $31, %%mm7 \n\t"

469 
"psrlq $15, %%mm7 \n\t"

470 
"pxor %%mm7, %%mm0 \n\t"

471 
"movd %%mm0, 124(%0, %3) \n\t"

472  
473 
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (2*nCoeffs) 
474 
: "%"REG_a, "memory" 
475 
); 
476 
} 
477  
478 
/* draw the edges of width 'w' of an image of size width, height

479 
this mmx version can only handle w==8 || w==16 */

480 
/**
 * Pad an image by replicating its border pixels outwards using MMX.
 *
 * First every row gets its leftmost byte copied into the w bytes before
 * it and its rightmost byte copied into the w bytes after it. Then the
 * (already padded) first row is copied into the w rows above the image
 * and the padded last row into the w rows below it, which also fills
 * the corners.
 *
 * The caller must provide w writable bytes/rows on every side of the
 * image. The row loops advance in 8-byte steps.
 * NOTE(review): presumably width is expected to be a multiple of 8 and
 * at least 8 - confirm against callers.
 *
 * @param buf    pointer to the top-left pixel of the image
 * @param wrap   distance in bytes between the starts of consecutive rows
 * @param width  image width in bytes
 * @param height image height in rows
 * @param w      edge width; 8 takes the 8-byte path, any other value is
 *               treated as 16
 */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    /* left and right */
    ptr = buf;
    if(w==8)
    {
        __asm__ volatile(
                "1:                             \n\t"
                "movd (%0), %%mm0               \n\t"
                "punpcklbw %%mm0, %%mm0         \n\t"
                "punpcklwd %%mm0, %%mm0         \n\t"
                "punpckldq %%mm0, %%mm0         \n\t" // leftmost byte x8
                "movq %%mm0, -8(%0)             \n\t"
                "movq -8(%0, %2), %%mm1         \n\t"
                "punpckhbw %%mm1, %%mm1         \n\t"
                "punpckhwd %%mm1, %%mm1         \n\t"
                "punpckhdq %%mm1, %%mm1         \n\t" // rightmost byte x8
                "movq %%mm1, (%0, %2)           \n\t"
                "add %1, %0                     \n\t"
                "cmp %3, %0                     \n\t"
                " jb 1b                         \n\t"
                : "+r" (ptr)
                : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
                : "memory" /* the asm stores into the image buffer */
        );
    }
    else
    {
        __asm__ volatile(
                "1:                             \n\t"
                "movd (%0), %%mm0               \n\t"
                "punpcklbw %%mm0, %%mm0         \n\t"
                "punpcklwd %%mm0, %%mm0         \n\t"
                "punpckldq %%mm0, %%mm0         \n\t" // leftmost byte x8
                "movq %%mm0, -8(%0)             \n\t"
                "movq %%mm0, -16(%0)            \n\t"
                "movq -8(%0, %2), %%mm1         \n\t"
                "punpckhbw %%mm1, %%mm1         \n\t"
                "punpckhwd %%mm1, %%mm1         \n\t"
                "punpckhdq %%mm1, %%mm1         \n\t" // rightmost byte x8
                "movq %%mm1, (%0, %2)           \n\t"
                "movq %%mm1, 8(%0, %2)          \n\t"
                "add %1, %0                     \n\t"
                "cmp %3, %0                     \n\t"
                " jb 1b                         \n\t"
                : "+r" (ptr)
                : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
                : "memory" /* the asm stores into the image buffer */
        );
    }

    for(i=0;i<w;i+=4) {
        /* top and bottom (and hopefully also the corners) */
        /* four rows above the image per pass: strides are negative */
        ptr= buf - (i + 1) * wrap - w;
        __asm__ volatile(
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t" // source: first row
                "movq %%mm0, (%0)               \n\t"
                "movq %%mm0, (%0, %2)           \n\t"
                "movq %%mm0, (%0, %2, 2)        \n\t"
                "movq %%mm0, (%0, %3)           \n\t"
                "add $8, %0                     \n\t"
                "cmp %4, %0                     \n\t"
                " jb 1b                         \n\t"
                : "+r" (ptr)
                : "r" ((long)buf - (long)ptr - w), "r" ((long)-wrap), "r" ((long)-wrap*3), "r" (ptr+width+2*w)
                : "memory"
        );
        /* four rows below the image per pass: strides are positive */
        ptr= last_line + (i + 1) * wrap - w;
        __asm__ volatile(
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t" // source: last row
                "movq %%mm0, (%0)               \n\t"
                "movq %%mm0, (%0, %2)           \n\t"
                "movq %%mm0, (%0, %2, 2)        \n\t"
                "movq %%mm0, (%0, %3)           \n\t"
                "add $8, %0                     \n\t"
                "cmp %4, %0                     \n\t"
                " jb 1b                         \n\t"
                : "+r" (ptr)
                : "r" ((long)last_line - (long)ptr - w), "r" ((long)wrap), "r" ((long)wrap*3), "r" (ptr+width+2*w)
                : "memory"
        );
    }
}
565  
566 
/**
 * Shrink DCT coefficients towards zero and accumulate their magnitudes,
 * using MMX.
 *
 * For each of the 64 coefficients c of the block:
 *   - sum[i]   += |c|                           (32-bit accumulation)
 *   - block[i]  = sign(c) * max(|c| - offset[i], 0)
 *     (psubusw: unsigned saturating subtract)
 * where sum = s->dct_error_sum[intra], offset = s->dct_offset[intra]
 * and intra = s->mb_intra. s->dct_count[intra] is incremented first.
 * NOTE(review): the byte strides assume DCTELEM is a 16-bit type.
 *
 * @param s     context; reads mb_intra and dct_offset, updates
 *              dct_error_sum and dct_count
 * @param block coefficients, modified in place
 */
static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){
    const int intra= s->mb_intra;
    int *sum= s->dct_error_sum[intra];
    uint16_t *offset= s->dct_offset[intra];

    s->dct_count[intra]++;

    __asm__ volatile(
        "pxor %%mm7, %%mm7                      \n\t"
        "1:                                     \n\t"
        "pxor %%mm0, %%mm0                      \n\t"
        "pxor %%mm1, %%mm1                      \n\t"
        "movq (%0), %%mm2                       \n\t"
        "movq 8(%0), %%mm3                      \n\t"
        "pcmpgtw %%mm2, %%mm0                   \n\t" // sign masks
        "pcmpgtw %%mm3, %%mm1                   \n\t"
        "pxor %%mm0, %%mm2                      \n\t"
        "pxor %%mm1, %%mm3                      \n\t"
        "psubw %%mm0, %%mm2                     \n\t" // abs(block[i])
        "psubw %%mm1, %%mm3                     \n\t"
        "movq %%mm2, %%mm4                      \n\t" // keep abs for the sums
        "movq %%mm3, %%mm5                      \n\t"
        "psubusw (%2), %%mm2                    \n\t" // max(abs - offset, 0)
        "psubusw 8(%2), %%mm3                   \n\t"
        "pxor %%mm0, %%mm2                      \n\t" // restore the sign
        "pxor %%mm1, %%mm3                      \n\t"
        "psubw %%mm0, %%mm2                     \n\t"
        "psubw %%mm1, %%mm3                     \n\t"
        "movq %%mm2, (%0)                       \n\t"
        "movq %%mm3, 8(%0)                      \n\t"
        "movq %%mm4, %%mm2                      \n\t"
        "movq %%mm5, %%mm3                      \n\t"
        "punpcklwd %%mm7, %%mm4                 \n\t" // zero-extend to 32 bit
        "punpckhwd %%mm7, %%mm2                 \n\t"
        "punpcklwd %%mm7, %%mm5                 \n\t"
        "punpckhwd %%mm7, %%mm3                 \n\t"
        "paddd (%1), %%mm4                      \n\t"
        "paddd 8(%1), %%mm2                     \n\t"
        "paddd 16(%1), %%mm5                    \n\t"
        "paddd 24(%1), %%mm3                    \n\t"
        "movq %%mm4, (%1)                       \n\t"
        "movq %%mm2, 8(%1)                      \n\t"
        "movq %%mm5, 16(%1)                     \n\t"
        "movq %%mm3, 24(%1)                     \n\t"
        "add $16, %0                            \n\t"
        "add $32, %1                            \n\t"
        "add $16, %2                            \n\t"
        "cmp %3, %0                             \n\t"
            " jb 1b                             \n\t"
        : "+r" (block), "+r" (sum), "+r" (offset)
        : "r"(block+64)
        : "memory" /* the asm rewrites block[] and sum[] */
    );
}
619  
620 
/**
 * Shrink DCT coefficients towards zero and accumulate their magnitudes,
 * using SSE2 (16 coefficients per iteration).
 *
 * For each of the 64 coefficients c of the block:
 *   - sum[i]   += |c|                           (32-bit accumulation)
 *   - block[i]  = sign(c) * max(|c| - offset[i], 0)
 *     (psubusw: unsigned saturating subtract)
 * where sum = s->dct_error_sum[intra], offset = s->dct_offset[intra]
 * and intra = s->mb_intra. s->dct_count[intra] is incremented first.
 *
 * block, sum and offset are accessed with movdqa / SSE memory operands
 * and therefore must be 16-byte aligned.
 * NOTE(review): the byte strides assume DCTELEM is a 16-bit type.
 *
 * @param s     context; reads mb_intra and dct_offset, updates
 *              dct_error_sum and dct_count
 * @param block coefficients, modified in place
 */
static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){
    const int intra= s->mb_intra;
    int *sum= s->dct_error_sum[intra];
    uint16_t *offset= s->dct_offset[intra];

    s->dct_count[intra]++;

    __asm__ volatile(
        "pxor %%xmm7, %%xmm7                    \n\t"
        "1:                                     \n\t"
        "pxor %%xmm0, %%xmm0                    \n\t"
        "pxor %%xmm1, %%xmm1                    \n\t"
        "movdqa (%0), %%xmm2                    \n\t"
        "movdqa 16(%0), %%xmm3                  \n\t"
        "pcmpgtw %%xmm2, %%xmm0                 \n\t" // sign masks
        "pcmpgtw %%xmm3, %%xmm1                 \n\t"
        "pxor %%xmm0, %%xmm2                    \n\t"
        "pxor %%xmm1, %%xmm3                    \n\t"
        "psubw %%xmm0, %%xmm2                   \n\t" // abs(block[i])
        "psubw %%xmm1, %%xmm3                   \n\t"
        "movdqa %%xmm2, %%xmm4                  \n\t" // keep abs for the sums
        "movdqa %%xmm3, %%xmm5                  \n\t"
        "psubusw (%2), %%xmm2                   \n\t" // max(abs - offset, 0)
        "psubusw 16(%2), %%xmm3                 \n\t"
        "pxor %%xmm0, %%xmm2                    \n\t" // restore the sign
        "pxor %%xmm1, %%xmm3                    \n\t"
        "psubw %%xmm0, %%xmm2                   \n\t"
        "psubw %%xmm1, %%xmm3                   \n\t"
        "movdqa %%xmm2, (%0)                    \n\t"
        "movdqa %%xmm3, 16(%0)                  \n\t"
        "movdqa %%xmm4, %%xmm6                  \n\t"
        "movdqa %%xmm5, %%xmm0                  \n\t"
        "punpcklwd %%xmm7, %%xmm4               \n\t" // zero-extend to 32 bit
        "punpckhwd %%xmm7, %%xmm6               \n\t"
        "punpcklwd %%xmm7, %%xmm5               \n\t"
        "punpckhwd %%xmm7, %%xmm0               \n\t"
        "paddd (%1), %%xmm4                     \n\t"
        "paddd 16(%1), %%xmm6                   \n\t"
        "paddd 32(%1), %%xmm5                   \n\t"
        "paddd 48(%1), %%xmm0                   \n\t"
        "movdqa %%xmm4, (%1)                    \n\t"
        "movdqa %%xmm6, 16(%1)                  \n\t"
        "movdqa %%xmm5, 32(%1)                  \n\t"
        "movdqa %%xmm0, 48(%1)                  \n\t"
        "add $32, %0                            \n\t"
        "add $64, %1                            \n\t"
        "add $32, %2                            \n\t"
        "cmp %3, %0                             \n\t"
            " jb 1b                             \n\t"
        : "+r" (block), "+r" (sum), "+r" (offset)
        : "r"(block+64)
        : "memory" /* the asm rewrites block[] and sum[] */
    );
}
673  
674 
/*
 * Include mpegvideo_mmx_template.c once per instruction-set variant.
 * RENAME()/RENAMEl() paste a per-variant suffix onto whatever identifier
 * the template passes through them; the init function in this file then
 * refers to dct_quantize_MMX, _MMX2, _SSE2 and _SSSE3.
 * The HAVE_* macros are (re)defined before each inclusion; presumably the
 * template uses them to select its code paths - confirm in the template.
 */

/* Remember whether SSSE3 was configured so it can be restored for the
 * last instantiation, then start from a state with no extensions. */
#ifdef HAVE_SSSE3
#define HAVE_SSSE3_BAK
#endif
#undef HAVE_SSSE3

/* plain MMX */
#undef HAVE_SSE2
#undef HAVE_MMX2
#define RENAME(a) a ## _MMX
#define RENAMEl(a) a ## _mmx
#include "mpegvideo_mmx_template.c"

/* MMX2 */
#define HAVE_MMX2
#undef RENAME
#undef RENAMEl
#define RENAME(a) a ## _MMX2
#define RENAMEl(a) a ## _mmx2
#include "mpegvideo_mmx_template.c"

/* SSE2 */
#define HAVE_SSE2
#undef RENAME
#undef RENAMEl
#define RENAME(a) a ## _SSE2
#define RENAMEl(a) a ## _sse2
#include "mpegvideo_mmx_template.c"

/* SSSE3, only when it was configured */
#ifdef HAVE_SSSE3_BAK
#define HAVE_SSSE3
#undef RENAME
#undef RENAMEl
#define RENAME(a) a ## _SSSE3
/* NOTE(review): the lower-case suffix stays _sse2 for the SSSE3 variant;
 * presumably no separate _ssse3 helpers exist - confirm. */
#define RENAMEl(a) a ## _sse2
#include "mpegvideo_mmx_template.c"
#endif

707  
708 
/**
 * Install the MMX/SSE2/SSSE3 implementations from this file into the
 * function pointers of the codec context.
 *
 * Nothing is changed unless mm_flags has MM_MMX set. In that case:
 *   - the H.263 and MPEG-1 dequantizers and the MPEG-2 inter dequantizer
 *     are always replaced;
 *   - the MPEG-2 intra dequantizer is replaced only when
 *     CODEC_FLAG_BITEXACT is not set in s->flags;
 *   - the file-external draw_edges pointer is set to draw_edges_mmx;
 *   - denoise_dct uses SSE2 when available, otherwise MMX;
 *   - dct_quantize is replaced only if the configured dct_algo is
 *     FF_DCT_AUTO or FF_DCT_MMX, preferring SSSE3 (if compiled in),
 *     then SSE2, then MMX2, then plain MMX.
 *
 * @param s codec context whose function pointers are updated
 */
void MPV_common_init_mmx(MpegEncContext *s)
{
    if (mm_flags & MM_MMX) {
        const int dct_algo = s->avctx->dct_algo;

        s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
        s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
        s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
        s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
        /* Only the intra variant depends on the bitexact flag. */
        if(!(s->flags & CODEC_FLAG_BITEXACT)) {
            s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
        }
        s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;

        draw_edges = draw_edges_mmx;

        if (mm_flags & MM_SSE2) {
            s->denoise_dct= denoise_dct_sse2;
        } else {
            s->denoise_dct= denoise_dct_mmx;
        }

        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
#ifdef HAVE_SSSE3
            if(mm_flags & MM_SSSE3){
                s->dct_quantize= dct_quantize_SSSE3;
            } else
#endif
            if(mm_flags & MM_SSE2){
                s->dct_quantize= dct_quantize_SSE2;
            } else if(mm_flags & MM_MMXEXT){
                s->dct_quantize= dct_quantize_MMX2;
            } else {
                s->dct_quantize= dct_quantize_MMX;
            }
        }
    }
}