ffmpeg / libavcodec / i386 / mpegvideo_mmx.c @ ff506a90
History  View  Annotate  Download (31.3 KB)
/*
 * The simplest mpeg encoder (well, it was the simplest!)
 * Copyright (c) 2000,2001 Fabrice Bellard.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * Optimized for ia32 cpus by Nick Kurshev <nickols_k@mail.ru>
 * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
 */

24  
25 
#include "../dsputil.h" 
26 
#include "../mpegvideo.h" 
27 
#include "../avcodec.h" 
28 
#include "x86_cpu.h" 
29  
/* 64-entry table of 16-bit values; defined in another translation unit. */
extern uint16_t inv_zigzag_direct16[64];

/* 64-bit constant with every bit set, aligned to 8 bytes. */
static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL;
/* Four packed 16-bit words, each equal to 1, aligned to 8 bytes. */
static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
34  
35  
36 
static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, 
37 
DCTELEM *block, int n, int qscale) 
38 
{ 
39 
long level, qmul, qadd, nCoeffs;

40  
41 
qmul = qscale << 1;

42  
43 
assert(s>block_last_index[n]>=0  s>h263_aic);

44  
45 
if (!s>h263_aic) {

46 
if (n < 4) 
47 
level = block[0] * s>y_dc_scale;

48 
else

49 
level = block[0] * s>c_dc_scale;

50 
qadd = (qscale  1)  1; 
51 
}else{

52 
qadd = 0;

53 
level= block[0];

54 
} 
55 
if(s>ac_pred)

56 
nCoeffs=63;

57 
else

58 
nCoeffs= s>inter_scantable.raster_end[ s>block_last_index[n] ]; 
59 
//printf("%d %d ", qmul, qadd);

60 
asm volatile( 
61 
"movd %1, %%mm6 \n\t" //qmul 
62 
"packssdw %%mm6, %%mm6 \n\t"

63 
"packssdw %%mm6, %%mm6 \n\t"

64 
"movd %2, %%mm5 \n\t" //qadd 
65 
"pxor %%mm7, %%mm7 \n\t"

66 
"packssdw %%mm5, %%mm5 \n\t"

67 
"packssdw %%mm5, %%mm5 \n\t"

68 
"psubw %%mm5, %%mm7 \n\t"

69 
"pxor %%mm4, %%mm4 \n\t"

70 
ASMALIGN(4)

71 
"1: \n\t"

72 
"movq (%0, %3), %%mm0 \n\t"

73 
"movq 8(%0, %3), %%mm1 \n\t"

74  
75 
"pmullw %%mm6, %%mm0 \n\t"

76 
"pmullw %%mm6, %%mm1 \n\t"

77  
78 
"movq (%0, %3), %%mm2 \n\t"

79 
"movq 8(%0, %3), %%mm3 \n\t"

80  
81 
"pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? 1 : 0 
82 
"pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? 1 : 0 
83  
84 
"pxor %%mm2, %%mm0 \n\t"

85 
"pxor %%mm3, %%mm1 \n\t"

86  
87 
"paddw %%mm7, %%mm0 \n\t"

88 
"paddw %%mm7, %%mm1 \n\t"

89  
90 
"pxor %%mm0, %%mm2 \n\t"

91 
"pxor %%mm1, %%mm3 \n\t"

92  
93 
"pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? 1 : 0 
94 
"pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? 1 : 0 
95  
96 
"pandn %%mm2, %%mm0 \n\t"

97 
"pandn %%mm3, %%mm1 \n\t"

98  
99 
"movq %%mm0, (%0, %3) \n\t"

100 
"movq %%mm1, 8(%0, %3) \n\t"

101  
102 
"add $16, %3 \n\t"

103 
"jng 1b \n\t"

104 
::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(nCoeffs)) 
105 
: "memory"

106 
); 
107 
block[0]= level;

108 
} 
109  
110  
111 
static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, 
112 
DCTELEM *block, int n, int qscale) 
113 
{ 
114 
long qmul, qadd, nCoeffs;

115  
116 
qmul = qscale << 1;

117 
qadd = (qscale  1)  1; 
118  
119 
assert(s>block_last_index[n]>=0  s>h263_aic);

120  
121 
nCoeffs= s>inter_scantable.raster_end[ s>block_last_index[n] ]; 
122 
//printf("%d %d ", qmul, qadd);

123 
asm volatile( 
124 
"movd %1, %%mm6 \n\t" //qmul 
125 
"packssdw %%mm6, %%mm6 \n\t"

126 
"packssdw %%mm6, %%mm6 \n\t"

127 
"movd %2, %%mm5 \n\t" //qadd 
128 
"pxor %%mm7, %%mm7 \n\t"

129 
"packssdw %%mm5, %%mm5 \n\t"

130 
"packssdw %%mm5, %%mm5 \n\t"

131 
"psubw %%mm5, %%mm7 \n\t"

132 
"pxor %%mm4, %%mm4 \n\t"

133 
ASMALIGN(4)

134 
"1: \n\t"

135 
"movq (%0, %3), %%mm0 \n\t"

136 
"movq 8(%0, %3), %%mm1 \n\t"

137  
138 
"pmullw %%mm6, %%mm0 \n\t"

139 
"pmullw %%mm6, %%mm1 \n\t"

140  
141 
"movq (%0, %3), %%mm2 \n\t"

142 
"movq 8(%0, %3), %%mm3 \n\t"

143  
144 
"pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? 1 : 0 
145 
"pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? 1 : 0 
146  
147 
"pxor %%mm2, %%mm0 \n\t"

148 
"pxor %%mm3, %%mm1 \n\t"

149  
150 
"paddw %%mm7, %%mm0 \n\t"

151 
"paddw %%mm7, %%mm1 \n\t"

152  
153 
"pxor %%mm0, %%mm2 \n\t"

154 
"pxor %%mm1, %%mm3 \n\t"

155  
156 
"pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? 1 : 0 
157 
"pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? 1 : 0 
158  
159 
"pandn %%mm2, %%mm0 \n\t"

160 
"pandn %%mm3, %%mm1 \n\t"

161  
162 
"movq %%mm0, (%0, %3) \n\t"

163 
"movq %%mm1, 8(%0, %3) \n\t"

164  
165 
"add $16, %3 \n\t"

166 
"jng 1b \n\t"

167 
::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(nCoeffs)) 
168 
: "memory"

169 
); 
170 
} 
171  
172  
/*
  NK:
  Note: looking at PARANOID:
  "enable all paranoid tests for rounding, overflows, etc..."

#ifdef PARANOID
                if (level < -2048 || level > 2047)
                    fprintf(stderr, "unquant error %d %d\n", i, level);
#endif
  We can suppose that the result of the two multiplications can't be greater
  than 0xFFFF, i.e. it is 16-bit, so we use only the PMULLW instruction here
  and can avoid a complex multiplication.
=====================================================
 Full formula for the multiplication of 2 integer numbers
 which are represented as high:low words:
 input: value1 = high1:low1
        value2 = high2:low2
 output: value3 = value1*value2
 value3=high3:low3 (on overflow: modulus 2^32 wrap-around)
 this means that for 0x123456 * 0x123456 the correct result is 0x14b66cb0ce4
 but this algorithm will compute only 0x66cb0ce4
 this is limited by the 16-bit size of the operands
 ---------------------------------
 tlow1 = high1*low2
 tlow2 = high2*low1
 tlow1 = tlow1 + tlow2
 high3:low3 = low1*low2
 high3 += tlow1
*/

202 
static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, 
203 
DCTELEM *block, int n, int qscale) 
204 
{ 
205 
long nCoeffs;

206 
const uint16_t *quant_matrix;

207 
int block0;

208  
209 
assert(s>block_last_index[n]>=0);

210  
211 
nCoeffs= s>intra_scantable.raster_end[ s>block_last_index[n] ]+1;

212  
213 
if (n < 4) 
214 
block0 = block[0] * s>y_dc_scale;

215 
else

216 
block0 = block[0] * s>c_dc_scale;

217 
/* XXX: only mpeg1 */

218 
quant_matrix = s>intra_matrix; 
219 
asm volatile( 
220 
"pcmpeqw %%mm7, %%mm7 \n\t"

221 
"psrlw $15, %%mm7 \n\t"

222 
"movd %2, %%mm6 \n\t"

223 
"packssdw %%mm6, %%mm6 \n\t"

224 
"packssdw %%mm6, %%mm6 \n\t"

225 
"mov %3, %%"REG_a" \n\t" 
226 
ASMALIGN(4)

227 
"1: \n\t"

228 
"movq (%0, %%"REG_a"), %%mm0 \n\t" 
229 
"movq 8(%0, %%"REG_a"), %%mm1 \n\t" 
230 
"movq (%1, %%"REG_a"), %%mm4 \n\t" 
231 
"movq 8(%1, %%"REG_a"), %%mm5 \n\t" 
232 
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 
233 
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 
234 
"pxor %%mm2, %%mm2 \n\t"

235 
"pxor %%mm3, %%mm3 \n\t"

236 
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? 1 : 0 
237 
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? 1 : 0 
238 
"pxor %%mm2, %%mm0 \n\t"

239 
"pxor %%mm3, %%mm1 \n\t"

240 
"psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 
241 
"psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 
242 
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q 
243 
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q 
244 
"pxor %%mm4, %%mm4 \n\t"

245 
"pxor %%mm5, %%mm5 \n\t" // FIXME slow 
246 
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? 1 : 0 
247 
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 1 : 0 
248 
"psraw $3, %%mm0 \n\t"

249 
"psraw $3, %%mm1 \n\t"

250 
"psubw %%mm7, %%mm0 \n\t"

251 
"psubw %%mm7, %%mm1 \n\t"

252 
"por %%mm7, %%mm0 \n\t"

253 
"por %%mm7, %%mm1 \n\t"

254 
"pxor %%mm2, %%mm0 \n\t"

255 
"pxor %%mm3, %%mm1 \n\t"

256 
"psubw %%mm2, %%mm0 \n\t"

257 
"psubw %%mm3, %%mm1 \n\t"

258 
"pandn %%mm0, %%mm4 \n\t"

259 
"pandn %%mm1, %%mm5 \n\t"

260 
"movq %%mm4, (%0, %%"REG_a") \n\t" 
261 
"movq %%mm5, 8(%0, %%"REG_a") \n\t" 
262  
263 
"add $16, %%"REG_a" \n\t" 
264 
"js 1b \n\t"

265 
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (2*nCoeffs) 
266 
: "%"REG_a, "memory" 
267 
); 
268 
block[0]= block0;

269 
} 
270  
271 
static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, 
272 
DCTELEM *block, int n, int qscale) 
273 
{ 
274 
long nCoeffs;

275 
const uint16_t *quant_matrix;

276  
277 
assert(s>block_last_index[n]>=0);

278  
279 
nCoeffs= s>intra_scantable.raster_end[ s>block_last_index[n] ]+1;

280  
281 
quant_matrix = s>inter_matrix; 
282 
asm volatile( 
283 
"pcmpeqw %%mm7, %%mm7 \n\t"

284 
"psrlw $15, %%mm7 \n\t"

285 
"movd %2, %%mm6 \n\t"

286 
"packssdw %%mm6, %%mm6 \n\t"

287 
"packssdw %%mm6, %%mm6 \n\t"

288 
"mov %3, %%"REG_a" \n\t" 
289 
ASMALIGN(4)

290 
"1: \n\t"

291 
"movq (%0, %%"REG_a"), %%mm0 \n\t" 
292 
"movq 8(%0, %%"REG_a"), %%mm1 \n\t" 
293 
"movq (%1, %%"REG_a"), %%mm4 \n\t" 
294 
"movq 8(%1, %%"REG_a"), %%mm5 \n\t" 
295 
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 
296 
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 
297 
"pxor %%mm2, %%mm2 \n\t"

298 
"pxor %%mm3, %%mm3 \n\t"

299 
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? 1 : 0 
300 
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? 1 : 0 
301 
"pxor %%mm2, %%mm0 \n\t"

302 
"pxor %%mm3, %%mm1 \n\t"

303 
"psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 
304 
"psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 
305 
"paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 
306 
"paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 
307 
"paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1 
308 
"paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1 
309 
"pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q 
310 
"pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q 
311 
"pxor %%mm4, %%mm4 \n\t"

312 
"pxor %%mm5, %%mm5 \n\t" // FIXME slow 
313 
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? 1 : 0 
314 
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 1 : 0 
315 
"psraw $4, %%mm0 \n\t"

316 
"psraw $4, %%mm1 \n\t"

317 
"psubw %%mm7, %%mm0 \n\t"

318 
"psubw %%mm7, %%mm1 \n\t"

319 
"por %%mm7, %%mm0 \n\t"

320 
"por %%mm7, %%mm1 \n\t"

321 
"pxor %%mm2, %%mm0 \n\t"

322 
"pxor %%mm3, %%mm1 \n\t"

323 
"psubw %%mm2, %%mm0 \n\t"

324 
"psubw %%mm3, %%mm1 \n\t"

325 
"pandn %%mm0, %%mm4 \n\t"

326 
"pandn %%mm1, %%mm5 \n\t"

327 
"movq %%mm4, (%0, %%"REG_a") \n\t" 
328 
"movq %%mm5, 8(%0, %%"REG_a") \n\t" 
329  
330 
"add $16, %%"REG_a" \n\t" 
331 
"js 1b \n\t"

332 
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (2*nCoeffs) 
333 
: "%"REG_a, "memory" 
334 
); 
335 
} 
336  
337 
static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, 
338 
DCTELEM *block, int n, int qscale) 
339 
{ 
340 
long nCoeffs;

341 
const uint16_t *quant_matrix;

342 
int block0;

343  
344 
assert(s>block_last_index[n]>=0);

345  
346 
if(s>alternate_scan) nCoeffs= 63; //FIXME 
347 
else nCoeffs= s>intra_scantable.raster_end[ s>block_last_index[n] ];

348  
349 
if (n < 4) 
350 
block0 = block[0] * s>y_dc_scale;

351 
else

352 
block0 = block[0] * s>c_dc_scale;

353 
quant_matrix = s>intra_matrix; 
354 
asm volatile( 
355 
"pcmpeqw %%mm7, %%mm7 \n\t"

356 
"psrlw $15, %%mm7 \n\t"

357 
"movd %2, %%mm6 \n\t"

358 
"packssdw %%mm6, %%mm6 \n\t"

359 
"packssdw %%mm6, %%mm6 \n\t"

360 
"mov %3, %%"REG_a" \n\t" 
361 
ASMALIGN(4)

362 
"1: \n\t"

363 
"movq (%0, %%"REG_a"), %%mm0 \n\t" 
364 
"movq 8(%0, %%"REG_a"), %%mm1 \n\t" 
365 
"movq (%1, %%"REG_a"), %%mm4 \n\t" 
366 
"movq 8(%1, %%"REG_a"), %%mm5 \n\t" 
367 
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 
368 
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 
369 
"pxor %%mm2, %%mm2 \n\t"

370 
"pxor %%mm3, %%mm3 \n\t"

371 
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? 1 : 0 
372 
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? 1 : 0 
373 
"pxor %%mm2, %%mm0 \n\t"

374 
"pxor %%mm3, %%mm1 \n\t"

375 
"psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 
376 
"psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 
377 
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q 
378 
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q 
379 
"pxor %%mm4, %%mm4 \n\t"

380 
"pxor %%mm5, %%mm5 \n\t" // FIXME slow 
381 
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? 1 : 0 
382 
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 1 : 0 
383 
"psraw $3, %%mm0 \n\t"

384 
"psraw $3, %%mm1 \n\t"

385 
"pxor %%mm2, %%mm0 \n\t"

386 
"pxor %%mm3, %%mm1 \n\t"

387 
"psubw %%mm2, %%mm0 \n\t"

388 
"psubw %%mm3, %%mm1 \n\t"

389 
"pandn %%mm0, %%mm4 \n\t"

390 
"pandn %%mm1, %%mm5 \n\t"

391 
"movq %%mm4, (%0, %%"REG_a") \n\t" 
392 
"movq %%mm5, 8(%0, %%"REG_a") \n\t" 
393  
394 
"add $16, %%"REG_a" \n\t" 
395 
"jng 1b \n\t"

396 
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (2*nCoeffs) 
397 
: "%"REG_a, "memory" 
398 
); 
399 
block[0]= block0;

400 
//Note, we dont do mismatch control for intra as errors cannot accumulate

401 
} 
402  
403 
static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, 
404 
DCTELEM *block, int n, int qscale) 
405 
{ 
406 
long nCoeffs;

407 
const uint16_t *quant_matrix;

408  
409 
assert(s>block_last_index[n]>=0);

410  
411 
if(s>alternate_scan) nCoeffs= 63; //FIXME 
412 
else nCoeffs= s>intra_scantable.raster_end[ s>block_last_index[n] ];

413  
414 
quant_matrix = s>inter_matrix; 
415 
asm volatile( 
416 
"pcmpeqw %%mm7, %%mm7 \n\t"

417 
"psrlq $48, %%mm7 \n\t"

418 
"movd %2, %%mm6 \n\t"

419 
"packssdw %%mm6, %%mm6 \n\t"

420 
"packssdw %%mm6, %%mm6 \n\t"

421 
"mov %3, %%"REG_a" \n\t" 
422 
ASMALIGN(4)

423 
"1: \n\t"

424 
"movq (%0, %%"REG_a"), %%mm0 \n\t" 
425 
"movq 8(%0, %%"REG_a"), %%mm1 \n\t" 
426 
"movq (%1, %%"REG_a"), %%mm4 \n\t" 
427 
"movq 8(%1, %%"REG_a"), %%mm5 \n\t" 
428 
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 
429 
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 
430 
"pxor %%mm2, %%mm2 \n\t"

431 
"pxor %%mm3, %%mm3 \n\t"

432 
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? 1 : 0 
433 
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? 1 : 0 
434 
"pxor %%mm2, %%mm0 \n\t"

435 
"pxor %%mm3, %%mm1 \n\t"

436 
"psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 
437 
"psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 
438 
"paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 
439 
"paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 
440 
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q 
441 
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q 
442 
"paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q 
443 
"paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q 
444 
"pxor %%mm4, %%mm4 \n\t"

445 
"pxor %%mm5, %%mm5 \n\t" // FIXME slow 
446 
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? 1 : 0 
447 
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 1 : 0 
448 
"psrlw $4, %%mm0 \n\t"

449 
"psrlw $4, %%mm1 \n\t"

450 
"pxor %%mm2, %%mm0 \n\t"

451 
"pxor %%mm3, %%mm1 \n\t"

452 
"psubw %%mm2, %%mm0 \n\t"

453 
"psubw %%mm3, %%mm1 \n\t"

454 
"pandn %%mm0, %%mm4 \n\t"

455 
"pandn %%mm1, %%mm5 \n\t"

456 
"pxor %%mm4, %%mm7 \n\t"

457 
"pxor %%mm5, %%mm7 \n\t"

458 
"movq %%mm4, (%0, %%"REG_a") \n\t" 
459 
"movq %%mm5, 8(%0, %%"REG_a") \n\t" 
460  
461 
"add $16, %%"REG_a" \n\t" 
462 
"jng 1b \n\t"

463 
"movd 124(%0, %3), %%mm0 \n\t"

464 
"movq %%mm7, %%mm6 \n\t"

465 
"psrlq $32, %%mm7 \n\t"

466 
"pxor %%mm6, %%mm7 \n\t"

467 
"movq %%mm7, %%mm6 \n\t"

468 
"psrlq $16, %%mm7 \n\t"

469 
"pxor %%mm6, %%mm7 \n\t"

470 
"pslld $31, %%mm7 \n\t"

471 
"psrlq $15, %%mm7 \n\t"

472 
"pxor %%mm7, %%mm0 \n\t"

473 
"movd %%mm0, 124(%0, %3) \n\t"

474  
475 
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (2*nCoeffs) 
476 
: "%"REG_a, "memory" 
477 
); 
478 
} 
479  
/**
 * Draw the edges of width 'w' around an image of size width x height by
 * replicating its border pixels.  This MMX version can only handle
 * w==8 || w==16 (any value other than 8 takes the 16-pixel path).
 *
 * First the leftmost and rightmost pixel of every row are replicated w
 * times to the left and right.  Then the (already widened) first row is
 * copied into the w rows above the image and the last row into the w rows
 * below it, which also fills the corners.
 *
 * The buffer must be writable w rows above and below the image and w bytes
 * to the left and right of each row.  Each row is read 4 bytes from its
 * start and 8 bytes up to its end, so width must be at least 8.  MMX state
 * is left dirty; no emms is executed here.
 *
 * @param buf    pointer to the top-left pixel of the image
 * @param wrap   distance in bytes between the starts of consecutive rows
 * @param width  image width in bytes
 * @param height image height in rows
 * @param w      edge width, 8 or 16
 */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    /* left and right */
    ptr = buf;
    if(w==8)
    {
        __asm__ volatile(
                "1:                             \n\t"
                "movd (%0), %%mm0               \n\t"
                "punpcklbw %%mm0, %%mm0         \n\t"
                "punpcklwd %%mm0, %%mm0         \n\t"
                "punpckldq %%mm0, %%mm0         \n\t" // first pixel x8
                "movq %%mm0, -8(%0)             \n\t"
                "movq -8(%0, %2), %%mm1         \n\t"
                "punpckhbw %%mm1, %%mm1         \n\t"
                "punpckhwd %%mm1, %%mm1         \n\t"
                "punpckhdq %%mm1, %%mm1         \n\t" // last pixel x8
                "movq %%mm1, (%0, %2)           \n\t"
                "add %1, %0                     \n\t"
                "cmp %3, %0                     \n\t"
                " jb 1b                         \n\t"
                : "+r" (ptr)
                : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
                : "memory"
        );
    }
    else
    {
        __asm__ volatile(
                "1:                             \n\t"
                "movd (%0), %%mm0               \n\t"
                "punpcklbw %%mm0, %%mm0         \n\t"
                "punpcklwd %%mm0, %%mm0         \n\t"
                "punpckldq %%mm0, %%mm0         \n\t" // first pixel x8
                "movq %%mm0, -8(%0)             \n\t"
                "movq %%mm0, -16(%0)            \n\t"
                "movq -8(%0, %2), %%mm1         \n\t"
                "punpckhbw %%mm1, %%mm1         \n\t"
                "punpckhwd %%mm1, %%mm1         \n\t"
                "punpckhdq %%mm1, %%mm1         \n\t" // last pixel x8
                "movq %%mm1, (%0, %2)           \n\t"
                "movq %%mm1, 8(%0, %2)          \n\t"
                "add %1, %0                     \n\t"
                "cmp %3, %0                     \n\t"
                " jb 1b                         \n\t"
                : "+r" (ptr)
                : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
                : "memory"
        );
    }

    for(i=0;i<w;i+=4) {
        /* top and bottom (and hopefully also the corners) */
        /* Each pass writes 4 rows: ptr, ptr-wrap, ptr-2*wrap, ptr-3*wrap;
         * the source (%1 + %0) is always the first row, starting at x=-w. */
        ptr= buf - (i + 1) * wrap - w;
        __asm__ volatile(
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq %%mm0, (%0)               \n\t"
                "movq %%mm0, (%0, %2)           \n\t"
                "movq %%mm0, (%0, %2, 2)        \n\t"
                "movq %%mm0, (%0, %3)           \n\t"
                "add $8, %0                     \n\t"
                "cmp %4, %0                     \n\t"
                " jb 1b                         \n\t"
                : "+r" (ptr)
                : "r" ((long)buf - (long)ptr - w), "r" ((long)-wrap), "r" ((long)-wrap*3), "r" (ptr+width+2*w)
                : "memory"
        );
        /* Same for the bottom: rows ptr .. ptr+3*wrap, copied from the
         * last row, starting at x=-w. */
        ptr= last_line + (i + 1) * wrap - w;
        __asm__ volatile(
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq %%mm0, (%0)               \n\t"
                "movq %%mm0, (%0, %2)           \n\t"
                "movq %%mm0, (%0, %2, 2)        \n\t"
                "movq %%mm0, (%0, %3)           \n\t"
                "add $8, %0                     \n\t"
                "cmp %4, %0                     \n\t"
                " jb 1b                         \n\t"
                : "+r" (ptr)
                : "r" ((long)last_line - (long)ptr - w), "r" ((long)wrap), "r" ((long)wrap*3), "r" (ptr+width+2*w)
                : "memory"
        );
    }
}
567  
568 
static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){ 
569 
const int intra= s>mb_intra; 
570 
int *sum= s>dct_error_sum[intra];

571 
uint16_t *offset= s>dct_offset[intra]; 
572  
573 
s>dct_count[intra]++; 
574  
575 
asm volatile( 
576 
"pxor %%mm7, %%mm7 \n\t"

577 
"1: \n\t"

578 
"pxor %%mm0, %%mm0 \n\t"

579 
"pxor %%mm1, %%mm1 \n\t"

580 
"movq (%0), %%mm2 \n\t"

581 
"movq 8(%0), %%mm3 \n\t"

582 
"pcmpgtw %%mm2, %%mm0 \n\t"

583 
"pcmpgtw %%mm3, %%mm1 \n\t"

584 
"pxor %%mm0, %%mm2 \n\t"

585 
"pxor %%mm1, %%mm3 \n\t"

586 
"psubw %%mm0, %%mm2 \n\t"

587 
"psubw %%mm1, %%mm3 \n\t"

588 
"movq %%mm2, %%mm4 \n\t"

589 
"movq %%mm3, %%mm5 \n\t"

590 
"psubusw (%2), %%mm2 \n\t"

591 
"psubusw 8(%2), %%mm3 \n\t"

592 
"pxor %%mm0, %%mm2 \n\t"

593 
"pxor %%mm1, %%mm3 \n\t"

594 
"psubw %%mm0, %%mm2 \n\t"

595 
"psubw %%mm1, %%mm3 \n\t"

596 
"movq %%mm2, (%0) \n\t"

597 
"movq %%mm3, 8(%0) \n\t"

598 
"movq %%mm4, %%mm2 \n\t"

599 
"movq %%mm5, %%mm3 \n\t"

600 
"punpcklwd %%mm7, %%mm4 \n\t"

601 
"punpckhwd %%mm7, %%mm2 \n\t"

602 
"punpcklwd %%mm7, %%mm5 \n\t"

603 
"punpckhwd %%mm7, %%mm3 \n\t"

604 
"paddd (%1), %%mm4 \n\t"

605 
"paddd 8(%1), %%mm2 \n\t"

606 
"paddd 16(%1), %%mm5 \n\t"

607 
"paddd 24(%1), %%mm3 \n\t"

608 
"movq %%mm4, (%1) \n\t"

609 
"movq %%mm2, 8(%1) \n\t"

610 
"movq %%mm5, 16(%1) \n\t"

611 
"movq %%mm3, 24(%1) \n\t"

612 
"add $16, %0 \n\t"

613 
"add $32, %1 \n\t"

614 
"add $16, %2 \n\t"

615 
"cmp %3, %0 \n\t"

616 
" jb 1b \n\t"

617 
: "+r" (block), "+r" (sum), "+r" (offset) 
618 
: "r"(block+64) 
619 
); 
620 
} 
621  
622 
static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){ 
623 
const int intra= s>mb_intra; 
624 
int *sum= s>dct_error_sum[intra];

625 
uint16_t *offset= s>dct_offset[intra]; 
626  
627 
s>dct_count[intra]++; 
628  
629 
asm volatile( 
630 
"pxor %%xmm7, %%xmm7 \n\t"

631 
"1: \n\t"

632 
"pxor %%xmm0, %%xmm0 \n\t"

633 
"pxor %%xmm1, %%xmm1 \n\t"

634 
"movdqa (%0), %%xmm2 \n\t"

635 
"movdqa 16(%0), %%xmm3 \n\t"

636 
"pcmpgtw %%xmm2, %%xmm0 \n\t"

637 
"pcmpgtw %%xmm3, %%xmm1 \n\t"

638 
"pxor %%xmm0, %%xmm2 \n\t"

639 
"pxor %%xmm1, %%xmm3 \n\t"

640 
"psubw %%xmm0, %%xmm2 \n\t"

641 
"psubw %%xmm1, %%xmm3 \n\t"

642 
"movdqa %%xmm2, %%xmm4 \n\t"

643 
"movdqa %%xmm3, %%xmm5 \n\t"

644 
"psubusw (%2), %%xmm2 \n\t"

645 
"psubusw 16(%2), %%xmm3 \n\t"

646 
"pxor %%xmm0, %%xmm2 \n\t"

647 
"pxor %%xmm1, %%xmm3 \n\t"

648 
"psubw %%xmm0, %%xmm2 \n\t"

649 
"psubw %%xmm1, %%xmm3 \n\t"

650 
"movdqa %%xmm2, (%0) \n\t"

651 
"movdqa %%xmm3, 16(%0) \n\t"

652 
"movdqa %%xmm4, %%xmm6 \n\t"

653 
"movdqa %%xmm5, %%xmm0 \n\t"

654 
"punpcklwd %%xmm7, %%xmm4 \n\t"

655 
"punpckhwd %%xmm7, %%xmm6 \n\t"

656 
"punpcklwd %%xmm7, %%xmm5 \n\t"

657 
"punpckhwd %%xmm7, %%xmm0 \n\t"

658 
"paddd (%1), %%xmm4 \n\t"

659 
"paddd 16(%1), %%xmm6 \n\t"

660 
"paddd 32(%1), %%xmm5 \n\t"

661 
"paddd 48(%1), %%xmm0 \n\t"

662 
"movdqa %%xmm4, (%1) \n\t"

663 
"movdqa %%xmm6, 16(%1) \n\t"

664 
"movdqa %%xmm5, 32(%1) \n\t"

665 
"movdqa %%xmm0, 48(%1) \n\t"

666 
"add $32, %0 \n\t"

667 
"add $64, %1 \n\t"

668 
"add $32, %2 \n\t"

669 
"cmp %3, %0 \n\t"

670 
" jb 1b \n\t"

671 
: "+r" (block), "+r" (sum), "+r" (offset) 
672 
: "r"(block+64) 
673 
); 
674 
} 
675  
/*
 * mpegvideo_mmx_template.c is included once per instruction-set variant.
 * Before each inclusion the HAVE_* macros select the variant, RENAME()
 * appends an upper-case suffix and RENAMEl() a lower-case suffix to the
 * identifier it is given.
 *
 * HAVE_SSSE3 is remembered in HAVE_SSSE3_BAK and undefined for the first
 * three inclusions; the SSSE3 variant is only built when it was originally
 * defined.  Note that the SSSE3 variant uses the same RENAMEl() suffix
 * (_sse2) as the SSE2 variant.  HAVE_MMX2 and HAVE_SSE2 are left defined
 * after this section.
 */
#ifdef HAVE_SSSE3
#define HAVE_SSSE3_BAK
#endif
#undef HAVE_SSSE3

/* plain MMX variant */
#undef HAVE_SSE2
#undef HAVE_MMX2
#define RENAME(a) a ## _MMX
#define RENAMEl(a) a ## _mmx
#include "mpegvideo_mmx_template.c"

/* MMX2 variant */
#define HAVE_MMX2
#undef RENAME
#undef RENAMEl
#define RENAME(a) a ## _MMX2
#define RENAMEl(a) a ## _mmx2
#include "mpegvideo_mmx_template.c"

/* SSE2 variant */
#define HAVE_SSE2
#undef RENAME
#undef RENAMEl
#define RENAME(a) a ## _SSE2
#define RENAMEl(a) a ## _sse2
#include "mpegvideo_mmx_template.c"

/* SSSE3 variant, only if HAVE_SSSE3 was defined on entry */
#ifdef HAVE_SSSE3_BAK
#define HAVE_SSSE3
#undef RENAME
#undef RENAMEl
#define RENAME(a) a ## _SSSE3
#define RENAMEl(a) a ## _sse2
#include "mpegvideo_mmx_template.c"
#endif

709  
710 
void MPV_common_init_mmx(MpegEncContext *s)

711 
{ 
712 
if (mm_flags & MM_MMX) {

713 
const int dct_algo = s>avctx>dct_algo; 
714  
715 
s>dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx; 
716 
s>dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx; 
717 
s>dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx; 
718 
s>dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx; 
719 
if(!(s>flags & CODEC_FLAG_BITEXACT))

720 
s>dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx; 
721 
s>dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx; 
722  
723 
draw_edges = draw_edges_mmx; 
724  
725 
if (mm_flags & MM_SSE2) {

726 
s>denoise_dct= denoise_dct_sse2; 
727 
} else {

728 
s>denoise_dct= denoise_dct_mmx; 
729 
} 
730  
731 
if(dct_algo==FF_DCT_AUTO  dct_algo==FF_DCT_MMX){

732 
#ifdef HAVE_SSSE3

733 
if(mm_flags & MM_SSSE3){

734 
s>dct_quantize= dct_quantize_SSSE3; 
735 
} else

736 
#endif

737 
if(mm_flags & MM_SSE2){

738 
s>dct_quantize= dct_quantize_SSE2; 
739 
} else if(mm_flags & MM_MMXEXT){ 
740 
s>dct_quantize= dct_quantize_MMX2; 
741 
} else {

742 
s>dct_quantize= dct_quantize_MMX; 
743 
} 
744 
} 
745 
} 
746 
} 