1 
/*


2 
* The simplest mpeg encoder (well, it was the simplest!)

3 
* Copyright (c) 2000,2001 Fabrice Bellard.

4 
*

5 
* This library is free software; you can redistribute it and/or

6 
* modify it under the terms of the GNU Lesser General Public

7 
* License as published by the Free Software Foundation; either

8 
* version 2 of the License, or (at your option) any later version.

9 
*

10 
* This library is distributed in the hope that it will be useful,

11 
* but WITHOUT ANY WARRANTY; without even the implied warranty of

12 
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

13 
* Lesser General Public License for more details.

14 
*

15 
* You should have received a copy of the GNU Lesser General Public

16 
* License along with this library; if not, write to the Free Software

17 
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

18 
*

19 
* Optimized for ia32 cpus by Nick Kurshev <nickols_k@mail.ru>

20 
* h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>

21 
*/

22  
23 
#include "../dsputil.h" 
24 
#include "../mpegvideo.h" 
25 
#include "../avcodec.h" 
26 
#include "mmx.h" 
27  
28 
/* Scan tables defined in another translation unit. */
extern uint8_t zigzag_direct_noperm[64];
extern uint16_t inv_zigzag_direct16[64];

/* 64-bit constants, 8-byte aligned so they can be used as MMX memory operands.
 * mm_wabs has every bit set; mm_wone holds the value 1 in each of its four
 * 16-bit words.
 * NOTE(review): neither is referenced in the visible part of this file;
 * presumably used by mpegvideo_mmx_template.c -- confirm. */
static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL;
static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
33  
34  
35 
static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, 
36 
DCTELEM *block, int n, int qscale) 
37 
{ 
38 
long level, qmul, qadd, nCoeffs;

39  
40 
qmul = qscale << 1;

41  
42 
assert(s>block_last_index[n]>=0  s>h263_aic);

43  
44 
if (!s>h263_aic) {

45 
if (n < 4) 
46 
level = block[0] * s>y_dc_scale;

47 
else

48 
level = block[0] * s>c_dc_scale;

49 
qadd = (qscale  1)  1; 
50 
}else{

51 
qadd = 0;

52 
level= block[0];

53 
} 
54 
if(s>ac_pred)

55 
nCoeffs=63;

56 
else

57 
nCoeffs= s>inter_scantable.raster_end[ s>block_last_index[n] ]; 
58 
//printf("%d %d ", qmul, qadd);

59 
asm volatile( 
60 
"movd %1, %%mm6 \n\t" //qmul 
61 
"packssdw %%mm6, %%mm6 \n\t"

62 
"packssdw %%mm6, %%mm6 \n\t"

63 
"movd %2, %%mm5 \n\t" //qadd 
64 
"pxor %%mm7, %%mm7 \n\t"

65 
"packssdw %%mm5, %%mm5 \n\t"

66 
"packssdw %%mm5, %%mm5 \n\t"

67 
"psubw %%mm5, %%mm7 \n\t"

68 
"pxor %%mm4, %%mm4 \n\t"

69 
".balign 16 \n\t"

70 
"1: \n\t"

71 
"movq (%0, %3), %%mm0 \n\t"

72 
"movq 8(%0, %3), %%mm1 \n\t"

73  
74 
"pmullw %%mm6, %%mm0 \n\t"

75 
"pmullw %%mm6, %%mm1 \n\t"

76  
77 
"movq (%0, %3), %%mm2 \n\t"

78 
"movq 8(%0, %3), %%mm3 \n\t"

79  
80 
"pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? 1 : 0 
81 
"pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? 1 : 0 
82  
83 
"pxor %%mm2, %%mm0 \n\t"

84 
"pxor %%mm3, %%mm1 \n\t"

85  
86 
"paddw %%mm7, %%mm0 \n\t"

87 
"paddw %%mm7, %%mm1 \n\t"

88  
89 
"pxor %%mm0, %%mm2 \n\t"

90 
"pxor %%mm1, %%mm3 \n\t"

91  
92 
"pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? 1 : 0 
93 
"pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? 1 : 0 
94  
95 
"pandn %%mm2, %%mm0 \n\t"

96 
"pandn %%mm3, %%mm1 \n\t"

97  
98 
"movq %%mm0, (%0, %3) \n\t"

99 
"movq %%mm1, 8(%0, %3) \n\t"

100  
101 
"add $16, %3 \n\t"

102 
"jng 1b \n\t"

103 
::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(nCoeffs)) 
104 
: "memory"

105 
); 
106 
block[0]= level;

107 
} 
108  
109  
110 
static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, 
111 
DCTELEM *block, int n, int qscale) 
112 
{ 
113 
long qmul, qadd, nCoeffs;

114  
115 
qmul = qscale << 1;

116 
qadd = (qscale  1)  1; 
117  
118 
assert(s>block_last_index[n]>=0  s>h263_aic);

119  
120 
nCoeffs= s>inter_scantable.raster_end[ s>block_last_index[n] ]; 
121 
//printf("%d %d ", qmul, qadd);

122 
asm volatile( 
123 
"movd %1, %%mm6 \n\t" //qmul 
124 
"packssdw %%mm6, %%mm6 \n\t"

125 
"packssdw %%mm6, %%mm6 \n\t"

126 
"movd %2, %%mm5 \n\t" //qadd 
127 
"pxor %%mm7, %%mm7 \n\t"

128 
"packssdw %%mm5, %%mm5 \n\t"

129 
"packssdw %%mm5, %%mm5 \n\t"

130 
"psubw %%mm5, %%mm7 \n\t"

131 
"pxor %%mm4, %%mm4 \n\t"

132 
".balign 16 \n\t"

133 
"1: \n\t"

134 
"movq (%0, %3), %%mm0 \n\t"

135 
"movq 8(%0, %3), %%mm1 \n\t"

136  
137 
"pmullw %%mm6, %%mm0 \n\t"

138 
"pmullw %%mm6, %%mm1 \n\t"

139  
140 
"movq (%0, %3), %%mm2 \n\t"

141 
"movq 8(%0, %3), %%mm3 \n\t"

142  
143 
"pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? 1 : 0 
144 
"pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? 1 : 0 
145  
146 
"pxor %%mm2, %%mm0 \n\t"

147 
"pxor %%mm3, %%mm1 \n\t"

148  
149 
"paddw %%mm7, %%mm0 \n\t"

150 
"paddw %%mm7, %%mm1 \n\t"

151  
152 
"pxor %%mm0, %%mm2 \n\t"

153 
"pxor %%mm1, %%mm3 \n\t"

154  
155 
"pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? 1 : 0 
156 
"pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? 1 : 0 
157  
158 
"pandn %%mm2, %%mm0 \n\t"

159 
"pandn %%mm3, %%mm1 \n\t"

160  
161 
"movq %%mm0, (%0, %3) \n\t"

162 
"movq %%mm1, 8(%0, %3) \n\t"

163  
164 
"add $16, %3 \n\t"

165 
"jng 1b \n\t"

166 
::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(nCoeffs)) 
167 
: "memory"

168 
); 
169 
} 
170  
171  
172 
/*

173 
NK:

174 
Note: looking at PARANOID:

175 
"enable all paranoid tests for rounding, overflows, etc..."

176 

177 
#ifdef PARANOID

178 
if (level < -2048 || level > 2047)

179 
fprintf(stderr, "unquant error %d %d\n", i, level);

180 
#endif

181 
We can suppose that the result of two multiplications can't be greater than 0xFFFF

182 
i.e. is 16bit, so we use here only PMULLW instruction and can avoid

183 
a complex multiplication.

184 
=====================================================

185 
Full formula for multiplication of 2 integer numbers

186 
which are represent as high:low words:

187 
input: value1 = high1:low1

188 
value2 = high2:low2

189 
output: value3 = value1*value2

190 
value3=high3:low3 (on overflow: modulus 2^32 wraparound)

191 
this mean that for 0x123456 * 0x123456 correct result is 0x766cb0ce4

192 
but this algorithm will compute only 0x66cb0ce4

193 
this limited by 16bit size of operands

194 


195 
tlow1 = high1*low2

196 
tlow2 = high2*low1

197 
tlow1 = tlow1 + tlow2

198 
high3:low3 = low1*low2

199 
high3 += tlow1

200 
*/

201 
static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, 
202 
DCTELEM *block, int n, int qscale) 
203 
{ 
204 
long nCoeffs;

205 
const uint16_t *quant_matrix;

206 
int block0;

207  
208 
assert(s>block_last_index[n]>=0);

209  
210 
nCoeffs= s>intra_scantable.raster_end[ s>block_last_index[n] ]+1;

211  
212 
if (n < 4) 
213 
block0 = block[0] * s>y_dc_scale;

214 
else

215 
block0 = block[0] * s>c_dc_scale;

216 
/* XXX: only mpeg1 */

217 
quant_matrix = s>intra_matrix; 
218 
asm volatile( 
219 
"pcmpeqw %%mm7, %%mm7 \n\t"

220 
"psrlw $15, %%mm7 \n\t"

221 
"movd %2, %%mm6 \n\t"

222 
"packssdw %%mm6, %%mm6 \n\t"

223 
"packssdw %%mm6, %%mm6 \n\t"

224 
"mov %3, %%"REG_a" \n\t" 
225 
".balign 16 \n\t"

226 
"1: \n\t"

227 
"movq (%0, %%"REG_a"), %%mm0 \n\t" 
228 
"movq 8(%0, %%"REG_a"), %%mm1 \n\t" 
229 
"movq (%1, %%"REG_a"), %%mm4 \n\t" 
230 
"movq 8(%1, %%"REG_a"), %%mm5 \n\t" 
231 
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 
232 
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 
233 
"pxor %%mm2, %%mm2 \n\t"

234 
"pxor %%mm3, %%mm3 \n\t"

235 
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? 1 : 0 
236 
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? 1 : 0 
237 
"pxor %%mm2, %%mm0 \n\t"

238 
"pxor %%mm3, %%mm1 \n\t"

239 
"psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 
240 
"psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 
241 
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q 
242 
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q 
243 
"pxor %%mm4, %%mm4 \n\t"

244 
"pxor %%mm5, %%mm5 \n\t" // FIXME slow 
245 
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? 1 : 0 
246 
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 1 : 0 
247 
"psraw $3, %%mm0 \n\t"

248 
"psraw $3, %%mm1 \n\t"

249 
"psubw %%mm7, %%mm0 \n\t"

250 
"psubw %%mm7, %%mm1 \n\t"

251 
"por %%mm7, %%mm0 \n\t"

252 
"por %%mm7, %%mm1 \n\t"

253 
"pxor %%mm2, %%mm0 \n\t"

254 
"pxor %%mm3, %%mm1 \n\t"

255 
"psubw %%mm2, %%mm0 \n\t"

256 
"psubw %%mm3, %%mm1 \n\t"

257 
"pandn %%mm0, %%mm4 \n\t"

258 
"pandn %%mm1, %%mm5 \n\t"

259 
"movq %%mm4, (%0, %%"REG_a") \n\t" 
260 
"movq %%mm5, 8(%0, %%"REG_a") \n\t" 
261  
262 
"add $16, %%"REG_a" \n\t" 
263 
"js 1b \n\t"

264 
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (2*nCoeffs) 
265 
: "%"REG_a, "memory" 
266 
); 
267 
block[0]= block0;

268 
} 
269  
270 
static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, 
271 
DCTELEM *block, int n, int qscale) 
272 
{ 
273 
long nCoeffs;

274 
const uint16_t *quant_matrix;

275  
276 
assert(s>block_last_index[n]>=0);

277  
278 
nCoeffs= s>intra_scantable.raster_end[ s>block_last_index[n] ]+1;

279  
280 
quant_matrix = s>inter_matrix; 
281 
asm volatile( 
282 
"pcmpeqw %%mm7, %%mm7 \n\t"

283 
"psrlw $15, %%mm7 \n\t"

284 
"movd %2, %%mm6 \n\t"

285 
"packssdw %%mm6, %%mm6 \n\t"

286 
"packssdw %%mm6, %%mm6 \n\t"

287 
"mov %3, %%"REG_a" \n\t" 
288 
".balign 16 \n\t"

289 
"1: \n\t"

290 
"movq (%0, %%"REG_a"), %%mm0 \n\t" 
291 
"movq 8(%0, %%"REG_a"), %%mm1 \n\t" 
292 
"movq (%1, %%"REG_a"), %%mm4 \n\t" 
293 
"movq 8(%1, %%"REG_a"), %%mm5 \n\t" 
294 
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 
295 
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 
296 
"pxor %%mm2, %%mm2 \n\t"

297 
"pxor %%mm3, %%mm3 \n\t"

298 
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? 1 : 0 
299 
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? 1 : 0 
300 
"pxor %%mm2, %%mm0 \n\t"

301 
"pxor %%mm3, %%mm1 \n\t"

302 
"psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 
303 
"psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 
304 
"paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 
305 
"paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 
306 
"paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1 
307 
"paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1 
308 
"pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q 
309 
"pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q 
310 
"pxor %%mm4, %%mm4 \n\t"

311 
"pxor %%mm5, %%mm5 \n\t" // FIXME slow 
312 
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? 1 : 0 
313 
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 1 : 0 
314 
"psraw $4, %%mm0 \n\t"

315 
"psraw $4, %%mm1 \n\t"

316 
"psubw %%mm7, %%mm0 \n\t"

317 
"psubw %%mm7, %%mm1 \n\t"

318 
"por %%mm7, %%mm0 \n\t"

319 
"por %%mm7, %%mm1 \n\t"

320 
"pxor %%mm2, %%mm0 \n\t"

321 
"pxor %%mm3, %%mm1 \n\t"

322 
"psubw %%mm2, %%mm0 \n\t"

323 
"psubw %%mm3, %%mm1 \n\t"

324 
"pandn %%mm0, %%mm4 \n\t"

325 
"pandn %%mm1, %%mm5 \n\t"

326 
"movq %%mm4, (%0, %%"REG_a") \n\t" 
327 
"movq %%mm5, 8(%0, %%"REG_a") \n\t" 
328  
329 
"add $16, %%"REG_a" \n\t" 
330 
"js 1b \n\t"

331 
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (2*nCoeffs) 
332 
: "%"REG_a, "memory" 
333 
); 
334 
} 
335  
336 
static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, 
337 
DCTELEM *block, int n, int qscale) 
338 
{ 
339 
long nCoeffs;

340 
const uint16_t *quant_matrix;

341 
int block0;

342  
343 
assert(s>block_last_index[n]>=0);

344  
345 
if(s>alternate_scan) nCoeffs= 63; //FIXME 
346 
else nCoeffs= s>intra_scantable.raster_end[ s>block_last_index[n] ];

347  
348 
if (n < 4) 
349 
block0 = block[0] * s>y_dc_scale;

350 
else

351 
block0 = block[0] * s>c_dc_scale;

352 
quant_matrix = s>intra_matrix; 
353 
asm volatile( 
354 
"pcmpeqw %%mm7, %%mm7 \n\t"

355 
"psrlw $15, %%mm7 \n\t"

356 
"movd %2, %%mm6 \n\t"

357 
"packssdw %%mm6, %%mm6 \n\t"

358 
"packssdw %%mm6, %%mm6 \n\t"

359 
"mov %3, %%"REG_a" \n\t" 
360 
".balign 16 \n\t"

361 
"1: \n\t"

362 
"movq (%0, %%"REG_a"), %%mm0 \n\t" 
363 
"movq 8(%0, %%"REG_a"), %%mm1 \n\t" 
364 
"movq (%1, %%"REG_a"), %%mm4 \n\t" 
365 
"movq 8(%1, %%"REG_a"), %%mm5 \n\t" 
366 
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 
367 
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 
368 
"pxor %%mm2, %%mm2 \n\t"

369 
"pxor %%mm3, %%mm3 \n\t"

370 
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? 1 : 0 
371 
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? 1 : 0 
372 
"pxor %%mm2, %%mm0 \n\t"

373 
"pxor %%mm3, %%mm1 \n\t"

374 
"psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 
375 
"psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 
376 
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q 
377 
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q 
378 
"pxor %%mm4, %%mm4 \n\t"

379 
"pxor %%mm5, %%mm5 \n\t" // FIXME slow 
380 
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? 1 : 0 
381 
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 1 : 0 
382 
"psraw $3, %%mm0 \n\t"

383 
"psraw $3, %%mm1 \n\t"

384 
"pxor %%mm2, %%mm0 \n\t"

385 
"pxor %%mm3, %%mm1 \n\t"

386 
"psubw %%mm2, %%mm0 \n\t"

387 
"psubw %%mm3, %%mm1 \n\t"

388 
"pandn %%mm0, %%mm4 \n\t"

389 
"pandn %%mm1, %%mm5 \n\t"

390 
"movq %%mm4, (%0, %%"REG_a") \n\t" 
391 
"movq %%mm5, 8(%0, %%"REG_a") \n\t" 
392  
393 
"add $16, %%"REG_a" \n\t" 
394 
"jng 1b \n\t"

395 
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (2*nCoeffs) 
396 
: "%"REG_a, "memory" 
397 
); 
398 
block[0]= block0;

399 
//Note, we dont do mismatch control for intra as errors cannot accumulate

400 
} 
401  
402 
static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, 
403 
DCTELEM *block, int n, int qscale) 
404 
{ 
405 
long nCoeffs;

406 
const uint16_t *quant_matrix;

407  
408 
assert(s>block_last_index[n]>=0);

409  
410 
if(s>alternate_scan) nCoeffs= 63; //FIXME 
411 
else nCoeffs= s>intra_scantable.raster_end[ s>block_last_index[n] ];

412  
413 
quant_matrix = s>inter_matrix; 
414 
asm volatile( 
415 
"pcmpeqw %%mm7, %%mm7 \n\t"

416 
"psrlq $48, %%mm7 \n\t"

417 
"movd %2, %%mm6 \n\t"

418 
"packssdw %%mm6, %%mm6 \n\t"

419 
"packssdw %%mm6, %%mm6 \n\t"

420 
"mov %3, %%"REG_a" \n\t" 
421 
".balign 16 \n\t"

422 
"1: \n\t"

423 
"movq (%0, %%"REG_a"), %%mm0 \n\t" 
424 
"movq 8(%0, %%"REG_a"), %%mm1 \n\t" 
425 
"movq (%1, %%"REG_a"), %%mm4 \n\t" 
426 
"movq 8(%1, %%"REG_a"), %%mm5 \n\t" 
427 
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 
428 
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 
429 
"pxor %%mm2, %%mm2 \n\t"

430 
"pxor %%mm3, %%mm3 \n\t"

431 
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? 1 : 0 
432 
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? 1 : 0 
433 
"pxor %%mm2, %%mm0 \n\t"

434 
"pxor %%mm3, %%mm1 \n\t"

435 
"psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 
436 
"psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 
437 
"paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 
438 
"paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 
439 
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q 
440 
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q 
441 
"paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q 
442 
"paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q 
443 
"pxor %%mm4, %%mm4 \n\t"

444 
"pxor %%mm5, %%mm5 \n\t" // FIXME slow 
445 
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? 1 : 0 
446 
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 1 : 0 
447 
"psrlw $4, %%mm0 \n\t"

448 
"psrlw $4, %%mm1 \n\t"

449 
"pxor %%mm2, %%mm0 \n\t"

450 
"pxor %%mm3, %%mm1 \n\t"

451 
"psubw %%mm2, %%mm0 \n\t"

452 
"psubw %%mm3, %%mm1 \n\t"

453 
"pandn %%mm0, %%mm4 \n\t"

454 
"pandn %%mm1, %%mm5 \n\t"

455 
"pxor %%mm4, %%mm7 \n\t"

456 
"pxor %%mm5, %%mm7 \n\t"

457 
"movq %%mm4, (%0, %%"REG_a") \n\t" 
458 
"movq %%mm5, 8(%0, %%"REG_a") \n\t" 
459  
460 
"add $16, %%"REG_a" \n\t" 
461 
"jng 1b \n\t"

462 
"movd 124(%0, %3), %%mm0 \n\t"

463 
"movq %%mm7, %%mm6 \n\t"

464 
"psrlq $32, %%mm7 \n\t"

465 
"pxor %%mm6, %%mm7 \n\t"

466 
"movq %%mm7, %%mm6 \n\t"

467 
"psrlq $16, %%mm7 \n\t"

468 
"pxor %%mm6, %%mm7 \n\t"

469 
"pslld $31, %%mm7 \n\t"

470 
"psrlq $15, %%mm7 \n\t"

471 
"pxor %%mm7, %%mm0 \n\t"

472 
"movd %%mm0, 124(%0, %3) \n\t"

473  
474 
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (2*nCoeffs) 
475 
: "%"REG_a, "memory" 
476 
); 
477 
} 
478  
479 
/* draw the edges of width 'w' of an image of size width, height

480 
this mmx version can only handle w==8  w==16 */

481 
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w) 
482 
{ 
483 
uint8_t *ptr, *last_line; 
484 
int i;

485  
486 
last_line = buf + (height  1) * wrap;

487 
/* left and right */

488 
ptr = buf; 
489 
if(w==8) 
490 
{ 
491 
asm volatile( 
492 
"1: \n\t"

493 
"movd (%0), %%mm0 \n\t"

494 
"punpcklbw %%mm0, %%mm0 \n\t"

495 
"punpcklwd %%mm0, %%mm0 \n\t"

496 
"punpckldq %%mm0, %%mm0 \n\t"

497 
"movq %%mm0, 8(%0) \n\t"

498 
"movq 8(%0, %2), %%mm1 \n\t"

499 
"punpckhbw %%mm1, %%mm1 \n\t"

500 
"punpckhwd %%mm1, %%mm1 \n\t"

501 
"punpckhdq %%mm1, %%mm1 \n\t"

502 
"movq %%mm1, (%0, %2) \n\t"

503 
"add %1, %0 \n\t"

504 
"cmp %3, %0 \n\t"

505 
" jb 1b \n\t"

506 
: "+r" (ptr)

507 
: "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height) 
508 
); 
509 
} 
510 
else

511 
{ 
512 
asm volatile( 
513 
"1: \n\t"

514 
"movd (%0), %%mm0 \n\t"

515 
"punpcklbw %%mm0, %%mm0 \n\t"

516 
"punpcklwd %%mm0, %%mm0 \n\t"

517 
"punpckldq %%mm0, %%mm0 \n\t"

518 
"movq %%mm0, 8(%0) \n\t"

519 
"movq %%mm0, 16(%0) \n\t"

520 
"movq 8(%0, %2), %%mm1 \n\t"

521 
"punpckhbw %%mm1, %%mm1 \n\t"

522 
"punpckhwd %%mm1, %%mm1 \n\t"

523 
"punpckhdq %%mm1, %%mm1 \n\t"

524 
"movq %%mm1, (%0, %2) \n\t"

525 
"movq %%mm1, 8(%0, %2) \n\t"

526 
"add %1, %0 \n\t"

527 
"cmp %3, %0 \n\t"

528 
" jb 1b \n\t"

529 
: "+r" (ptr)

530 
: "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height) 
531 
); 
532 
} 
533  
534 
for(i=0;i<w;i+=4) { 
535 
/* top and bottom (and hopefully also the corners) */

536 
ptr= buf  (i + 1) * wrap  w;

537 
asm volatile( 
538 
"1: \n\t"

539 
"movq (%1, %0), %%mm0 \n\t"

540 
"movq %%mm0, (%0) \n\t"

541 
"movq %%mm0, (%0, %2) \n\t"

542 
"movq %%mm0, (%0, %2, 2) \n\t"

543 
"movq %%mm0, (%0, %3) \n\t"

544 
"add $8, %0 \n\t"

545 
"cmp %4, %0 \n\t"

546 
" jb 1b \n\t"

547 
: "+r" (ptr)

548 
: "r" ((long)buf  (long)ptr  w), "r" ((long)wrap), "r" ((long)wrap*3), "r" (ptr+width+2*w) 
549 
); 
550 
ptr= last_line + (i + 1) * wrap  w;

551 
asm volatile( 
552 
"1: \n\t"

553 
"movq (%1, %0), %%mm0 \n\t"

554 
"movq %%mm0, (%0) \n\t"

555 
"movq %%mm0, (%0, %2) \n\t"

556 
"movq %%mm0, (%0, %2, 2) \n\t"

557 
"movq %%mm0, (%0, %3) \n\t"

558 
"add $8, %0 \n\t"

559 
"cmp %4, %0 \n\t"

560 
" jb 1b \n\t"

561 
: "+r" (ptr)

562 
: "r" ((long)last_line  (long)ptr  w), "r" ((long)wrap), "r" ((long)wrap*3), "r" (ptr+width+2*w) 
563 
); 
564 
} 
565 
} 
566  
567 
/**
 * MMX DCT-domain denoiser: shrinks every coefficient of a 64-coefficient
 * block towards zero and accumulates coefficient magnitudes.
 *
 * For each i in 0..63, with a = |block[i]|:
 *     sum[i]   += a                       (32-bit accumulation)
 *     block[i]  = sign(block[i]) * max(a - offset[i], 0)
 * (psubusw is an unsigned saturating subtract, so the magnitude never goes
 * below zero). s->dct_count[intra] is incremented once per call.
 *
 * Eight coefficients are processed per loop iteration.
 * Assumes DCTELEM is a 16-bit type and int is 32 bits.
 *
 * @param s     codec context; reads mb_intra and uses dct_error_sum[intra],
 *              dct_offset[intra] and dct_count[intra]
 * @param block 64 coefficients, modified in place
 */
static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){
    const int intra= s->mb_intra;
    int *sum= s->dct_error_sum[intra];
    uint16_t *offset= s->dct_offset[intra];

    s->dct_count[intra]++;

    asm volatile(
        "pxor %%mm7, %%mm7              \n\t" // mm7 = 0, used for zero extension
        "1:                             \n\t"
        "pxor %%mm0, %%mm0              \n\t"
        "pxor %%mm1, %%mm1              \n\t"
        "movq (%0), %%mm2               \n\t"
        "movq 8(%0), %%mm3              \n\t"
        "pcmpgtw %%mm2, %%mm0           \n\t" // block[i] < 0 ? -1 : 0
        "pcmpgtw %%mm3, %%mm1           \n\t"
        "pxor %%mm0, %%mm2              \n\t"
        "pxor %%mm1, %%mm3              \n\t"
        "psubw %%mm0, %%mm2             \n\t" // abs(block[i])
        "psubw %%mm1, %%mm3             \n\t"
        "movq %%mm2, %%mm4              \n\t" // keep abs values for the sums
        "movq %%mm3, %%mm5              \n\t"
        "psubusw (%2), %%mm2            \n\t" // max(abs - offset[i], 0)
        "psubusw 8(%2), %%mm3           \n\t"
        "pxor %%mm0, %%mm2              \n\t" // restore the sign
        "pxor %%mm1, %%mm3              \n\t"
        "psubw %%mm0, %%mm2             \n\t"
        "psubw %%mm1, %%mm3             \n\t"
        "movq %%mm2, (%0)               \n\t"
        "movq %%mm3, 8(%0)              \n\t"
        "movq %%mm4, %%mm2              \n\t"
        "movq %%mm5, %%mm3              \n\t"
        "punpcklwd %%mm7, %%mm4         \n\t" // widen abs values to 32 bits
        "punpckhwd %%mm7, %%mm2         \n\t"
        "punpcklwd %%mm7, %%mm5         \n\t"
        "punpckhwd %%mm7, %%mm3         \n\t"
        "paddd (%1), %%mm4              \n\t" // sum[i] += abs(block[i])
        "paddd 8(%1), %%mm2             \n\t"
        "paddd 16(%1), %%mm5            \n\t"
        "paddd 24(%1), %%mm3            \n\t"
        "movq %%mm4, (%1)               \n\t"
        "movq %%mm2, 8(%1)              \n\t"
        "movq %%mm5, 16(%1)             \n\t"
        "movq %%mm3, 24(%1)             \n\t"
        "add $16, %0                    \n\t"
        "add $32, %1                    \n\t"
        "add $16, %2                    \n\t"
        "cmp %3, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (block), "+r" (sum), "+r" (offset)
        : "r"(block+64)
    );
}
620  
621 
/**
 * SSE2 DCT-domain denoiser: shrinks every coefficient of a 64-coefficient
 * block towards zero and accumulates coefficient magnitudes.
 *
 * For each i in 0..63, with a = |block[i]|:
 *     sum[i]   += a                       (32-bit accumulation)
 *     block[i]  = sign(block[i]) * max(a - offset[i], 0)
 * (psubusw is an unsigned saturating subtract, so the magnitude never goes
 * below zero). s->dct_count[intra] is incremented once per call.
 *
 * Sixteen coefficients are processed per loop iteration. block and sum are
 * accessed with movdqa and offset/sum are used as direct SSE memory
 * operands, so all three must be 16-byte aligned.
 * Assumes DCTELEM is a 16-bit type and int is 32 bits.
 *
 * @param s     codec context; reads mb_intra and uses dct_error_sum[intra],
 *              dct_offset[intra] and dct_count[intra]
 * @param block 64 coefficients, modified in place
 */
static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){
    const int intra= s->mb_intra;
    int *sum= s->dct_error_sum[intra];
    uint16_t *offset= s->dct_offset[intra];

    s->dct_count[intra]++;

    asm volatile(
        "pxor %%xmm7, %%xmm7            \n\t" // xmm7 = 0, used for zero extension
        "1:                             \n\t"
        "pxor %%xmm0, %%xmm0            \n\t"
        "pxor %%xmm1, %%xmm1            \n\t"
        "movdqa (%0), %%xmm2            \n\t"
        "movdqa 16(%0), %%xmm3          \n\t"
        "pcmpgtw %%xmm2, %%xmm0         \n\t" // block[i] < 0 ? -1 : 0
        "pcmpgtw %%xmm3, %%xmm1         \n\t"
        "pxor %%xmm0, %%xmm2            \n\t"
        "pxor %%xmm1, %%xmm3            \n\t"
        "psubw %%xmm0, %%xmm2           \n\t" // abs(block[i])
        "psubw %%xmm1, %%xmm3           \n\t"
        "movdqa %%xmm2, %%xmm4          \n\t" // keep abs values for the sums
        "movdqa %%xmm3, %%xmm5          \n\t"
        "psubusw (%2), %%xmm2           \n\t" // max(abs - offset[i], 0)
        "psubusw 16(%2), %%xmm3         \n\t"
        "pxor %%xmm0, %%xmm2            \n\t" // restore the sign
        "pxor %%xmm1, %%xmm3            \n\t"
        "psubw %%xmm0, %%xmm2           \n\t"
        "psubw %%xmm1, %%xmm3           \n\t"
        "movdqa %%xmm2, (%0)            \n\t"
        "movdqa %%xmm3, 16(%0)          \n\t"
        "movdqa %%xmm4, %%xmm6          \n\t"
        "movdqa %%xmm5, %%xmm0          \n\t"
        "punpcklwd %%xmm7, %%xmm4       \n\t" // widen abs values to 32 bits
        "punpckhwd %%xmm7, %%xmm6       \n\t"
        "punpcklwd %%xmm7, %%xmm5       \n\t"
        "punpckhwd %%xmm7, %%xmm0       \n\t"
        "paddd (%1), %%xmm4             \n\t" // sum[i] += abs(block[i])
        "paddd 16(%1), %%xmm6           \n\t"
        "paddd 32(%1), %%xmm5           \n\t"
        "paddd 48(%1), %%xmm0           \n\t"
        "movdqa %%xmm4, (%1)            \n\t"
        "movdqa %%xmm6, 16(%1)          \n\t"
        "movdqa %%xmm5, 32(%1)          \n\t"
        "movdqa %%xmm0, 48(%1)          \n\t"
        "add $32, %0                    \n\t"
        "add $64, %1                    \n\t"
        "add $32, %2                    \n\t"
        "cmp %3, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (block), "+r" (sum), "+r" (offset)
        : "r"(block+64)
    );
}
674  
675 
/* Instantiate mpegvideo_mmx_template.c three times. RENAME/RENAMEl append a
 * per-variant suffix to the identifiers the template defines, producing the
 * dct_quantize_MMX, dct_quantize_MMX2 and dct_quantize_SSE2 functions that
 * MPV_common_init_mmx() selects from. */

/* plain MMX variant */
#undef HAVE_MMX2
#define RENAME(a) a ## _MMX
#define RENAMEl(a) a ## _mmx
#include "mpegvideo_mmx_template.c"

/* MMX2 variant */
#define HAVE_MMX2
#undef RENAME
#undef RENAMEl
#define RENAME(a) a ## _MMX2
#define RENAMEl(a) a ## _mmx2
#include "mpegvideo_mmx_template.c"

/* SSE2 variant (HAVE_MMX2 is still defined at this point) */
#undef RENAME
#undef RENAMEl
#define RENAME(a) a ## _SSE2
#define RENAMEl(a) a ## _sse2
#include "mpegvideo_mmx_template.c"
692  
693 
/**
 * Install the MMX/MMX2/SSE2 implementations from this file into the codec
 * context, depending on the CPU capabilities in mm_flags.
 *
 * Nothing is changed unless MM_MMX is set. When it is:
 *  - all six dct_unquantize_* pointers are set to the MMX versions;
 *  - the global draw_edges pointer is set to draw_edges_mmx;
 *  - denoise_dct is set to the SSE2 version if MM_SSE2 is available,
 *    otherwise to the MMX version;
 *  - dct_quantize is replaced only when the user-selected DCT algorithm is
 *    FF_DCT_AUTO or FF_DCT_MMX, preferring SSE2, then MMX2 (MM_MMXEXT),
 *    then plain MMX.
 *
 * @param s codec context whose function pointers are initialised;
 *          s->avctx->dct_algo is read
 */
void MPV_common_init_mmx(MpegEncContext *s)
{
    if (mm_flags & MM_MMX) {
        const int dct_algo = s->avctx->dct_algo;

        s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
        s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
        s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
        s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
        s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
        s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;

        draw_edges = draw_edges_mmx;

        if (mm_flags & MM_SSE2) {
            s->denoise_dct= denoise_dct_sse2;
        } else {
            s->denoise_dct= denoise_dct_mmx;
        }

        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
            if(mm_flags & MM_SSE2){
                s->dct_quantize= dct_quantize_SSE2;
            } else if(mm_flags & MM_MMXEXT){
                s->dct_quantize= dct_quantize_MMX2;
            } else {
                s->dct_quantize= dct_quantize_MMX;
            }
        }
    }
}