ffmpeg / libavcodec / i386 / dsputilenc_mmx.c @ c4ff7c53
History  View  Annotate  Download (43.1 KB)
1 
/*


2 
* MMX optimized DSP utils

3 
* Copyright (c) 2000, 2001 Fabrice Bellard.

4 
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>

5 
*

6 
* This file is part of FFmpeg.

7 
*

8 
* FFmpeg is free software; you can redistribute it and/or

9 
* modify it under the terms of the GNU Lesser General Public

10 
* License as published by the Free Software Foundation; either

11 
* version 2.1 of the License, or (at your option) any later version.

12 
*

13 
* FFmpeg is distributed in the hope that it will be useful,

14 
* but WITHOUT ANY WARRANTY; without even the implied warranty of

15 
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

16 
* Lesser General Public License for more details.

17 
*

18 
* You should have received a copy of the GNU Lesser General Public

19 
* License along with FFmpeg; if not, write to the Free Software

20 
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

21 
*

22 
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>

23 
*/

24  
25 
#include "libavutil/x86_cpu.h" 
26 
#include "libavcodec/dsputil.h" 
27 
#include "libavcodec/mpegvideo.h" 
28 
#include "dsputil_mmx.h" 
29  
30  
31 
/**
 * Widen an 8x8 block of unsigned bytes into 16-bit words (MMX).
 *
 * Two source rows (8 bytes each, line_size bytes apart) are read per
 * iteration and zero-extended to words; 32 bytes are stored per iteration
 * until 128 bytes of block have been written.
 *
 * @param block     destination, 128 bytes are written
 * @param pixels    first source row
 * @param line_size distance in bytes between source rows
 */
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        /* REG_a is a negative byte offset relative to block+64 (i.e. the
         * end of the block); the loop ends when it reaches zero. */
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t" /* mm7 = 0, used for unpacking */
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t" /* pixels += 2*line_size */
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
        /* "memory": the asm stores through block without an output operand */
        : "%"REG_a, "memory"
    );
}
58  
59 
/**
 * Widen an 8x8 block of unsigned bytes into 16-bit words (SSE2).
 *
 * Fully unrolled: four rows are loaded, zero-extended and stored, then the
 * source pointer is advanced by 4*line_size and the next four rows follow.
 * The stores use movdqa, so block must be 16-byte aligned.
 *
 * @param block     destination, 128 bytes are written
 * @param pixels    first source row
 * @param line_size distance in bytes between source rows
 */
static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "pxor %%xmm7, %%xmm7            \n\t" /* xmm7 = 0, used for unpacking */
        /* rows 0..3 */
        "movq (%0), %%xmm0              \n\t"
        "movq (%0, %2), %%xmm1          \n\t"
        "movq (%0, %2,2), %%xmm2        \n\t"
        "movq (%0, %3), %%xmm3          \n\t"
        "lea (%0,%2,4), %0              \n\t" /* pixels += 4*line_size */
        "punpcklbw %%xmm7, %%xmm0       \n\t"
        "punpcklbw %%xmm7, %%xmm1       \n\t"
        "punpcklbw %%xmm7, %%xmm2       \n\t"
        "punpcklbw %%xmm7, %%xmm3       \n\t"
        "movdqa %%xmm0, (%1)            \n\t"
        "movdqa %%xmm1, 16(%1)          \n\t"
        "movdqa %%xmm2, 32(%1)          \n\t"
        "movdqa %%xmm3, 48(%1)          \n\t"
        /* rows 4..7 */
        "movq (%0), %%xmm0              \n\t"
        "movq (%0, %2), %%xmm1          \n\t"
        "movq (%0, %2,2), %%xmm2        \n\t"
        "movq (%0, %3), %%xmm3          \n\t"
        "punpcklbw %%xmm7, %%xmm0       \n\t"
        "punpcklbw %%xmm7, %%xmm1       \n\t"
        "punpcklbw %%xmm7, %%xmm2       \n\t"
        "punpcklbw %%xmm7, %%xmm3       \n\t"
        "movdqa %%xmm0, 64(%1)          \n\t"
        "movdqa %%xmm1, 80(%1)          \n\t"
        "movdqa %%xmm2, 96(%1)          \n\t"
        "movdqa %%xmm3, 112(%1)         \n\t"
        : "+r" (pixels)
        : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3)
        /* "memory": the asm stores through block without an output operand */
        : "memory"
    );
}
92  
93 
/**
 * Compute the 16-bit difference s1 - s2 of two 8x8 byte blocks (MMX).
 *
 * One row of 8 bytes is taken from each source per iteration, both are
 * zero-extended to words and subtracted; 16 bytes are stored per iteration
 * until 128 bytes of block have been written.
 *
 * @param block  destination, 128 bytes are written
 * @param s1     first source row of the minuend
 * @param s2     first source row of the subtrahend
 * @param stride distance in bytes between rows, shared by s1 and s2
 */
static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    __asm__ volatile(
        "pxor %%mm7, %%mm7              \n\t" /* mm7 = 0, used for unpacking */
        /* REG_a is a negative byte offset relative to block+64 */
        "mov $-128, %%"REG_a"           \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t"
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((x86_reg)stride)
        /* "memory": the asm stores through block without an output operand */
        : "%"REG_a, "memory"
    );
}
121  
122 
static int pix_sum16_mmx(uint8_t * pix, int line_size){ 
123 
const int h=16; 
124 
int sum;

125 
x86_reg index= line_size*h; 
126  
127 
__asm__ volatile(

128 
"pxor %%mm7, %%mm7 \n\t"

129 
"pxor %%mm6, %%mm6 \n\t"

130 
"1: \n\t"

131 
"movq (%2, %1), %%mm0 \n\t"

132 
"movq (%2, %1), %%mm1 \n\t"

133 
"movq 8(%2, %1), %%mm2 \n\t"

134 
"movq 8(%2, %1), %%mm3 \n\t"

135 
"punpcklbw %%mm7, %%mm0 \n\t"

136 
"punpckhbw %%mm7, %%mm1 \n\t"

137 
"punpcklbw %%mm7, %%mm2 \n\t"

138 
"punpckhbw %%mm7, %%mm3 \n\t"

139 
"paddw %%mm0, %%mm1 \n\t"

140 
"paddw %%mm2, %%mm3 \n\t"

141 
"paddw %%mm1, %%mm3 \n\t"

142 
"paddw %%mm3, %%mm6 \n\t"

143 
"add %3, %1 \n\t"

144 
" js 1b \n\t"

145 
"movq %%mm6, %%mm5 \n\t"

146 
"psrlq $32, %%mm6 \n\t"

147 
"paddw %%mm5, %%mm6 \n\t"

148 
"movq %%mm6, %%mm5 \n\t"

149 
"psrlq $16, %%mm6 \n\t"

150 
"paddw %%mm5, %%mm6 \n\t"

151 
"movd %%mm6, %0 \n\t"

152 
"andl $0xFFFF, %0 \n\t"

153 
: "=&r" (sum), "+r" (index) 
154 
: "r" (pix  index), "r" ((x86_reg)line_size) 
155 
); 
156  
157 
return sum;

158 
} 
159  
160 
static int pix_norm1_mmx(uint8_t *pix, int line_size) { 
161 
int tmp;

162 
__asm__ volatile (

163 
"movl $16,%%ecx\n"

164 
"pxor %%mm0,%%mm0\n"

165 
"pxor %%mm7,%%mm7\n"

166 
"1:\n"

167 
"movq (%0),%%mm2\n" /* mm2 = pix[07] */ 
168 
"movq 8(%0),%%mm3\n" /* mm3 = pix[815] */ 
169  
170 
"movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[07] */ 
171  
172 
"punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix47] */ 
173 
"punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix03] */ 
174  
175 
"movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[815] */ 
176 
"punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix1215] */ 
177 
"punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix811] */ 
178  
179 
"pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */ 
180 
"pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */ 
181  
182 
"pmaddwd %%mm3,%%mm3\n"

183 
"pmaddwd %%mm4,%%mm4\n"

184  
185 
"paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2, 
186 
pix2^2+pix3^2+pix6^2+pix7^2) */

187 
"paddd %%mm3,%%mm4\n"

188 
"paddd %%mm2,%%mm7\n"

189  
190 
"add %2, %0\n"

191 
"paddd %%mm4,%%mm7\n"

192 
"dec %%ecx\n"

193 
"jnz 1b\n"

194  
195 
"movq %%mm7,%%mm1\n"

196 
"psrlq $32, %%mm7\n" /* shift hi dword to lo */ 
197 
"paddd %%mm7,%%mm1\n"

198 
"movd %%mm1,%1\n"

199 
: "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" ); 
200 
return tmp;

201 
} 
202  
203 
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 
204 
int tmp;

205 
__asm__ volatile (

206 
"movl %4,%%ecx\n"

207 
"shr $1,%%ecx\n"

208 
"pxor %%mm0,%%mm0\n" /* mm0 = 0 */ 
209 
"pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ 
210 
"1:\n"

211 
"movq (%0),%%mm1\n" /* mm1 = pix1[0][07] */ 
212 
"movq (%1),%%mm2\n" /* mm2 = pix2[0][07] */ 
213 
"movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][07] */ 
214 
"movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][07] */ 
215  
216 
/* todo: mm1mm2, mm3mm4 */

217 
/* algo: subtract mm1 from mm2 with saturation and vice versa */

218 
/* OR the results to get absolute difference */

219 
"movq %%mm1,%%mm5\n"

220 
"movq %%mm3,%%mm6\n"

221 
"psubusb %%mm2,%%mm1\n"

222 
"psubusb %%mm4,%%mm3\n"

223 
"psubusb %%mm5,%%mm2\n"

224 
"psubusb %%mm6,%%mm4\n"

225  
226 
"por %%mm1,%%mm2\n"

227 
"por %%mm3,%%mm4\n"

228  
229 
/* now convert to 16bit vectors so we can square them */

230 
"movq %%mm2,%%mm1\n"

231 
"movq %%mm4,%%mm3\n"

232  
233 
"punpckhbw %%mm0,%%mm2\n"

234 
"punpckhbw %%mm0,%%mm4\n"

235 
"punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ 
236 
"punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ 
237  
238 
"pmaddwd %%mm2,%%mm2\n"

239 
"pmaddwd %%mm4,%%mm4\n"

240 
"pmaddwd %%mm1,%%mm1\n"

241 
"pmaddwd %%mm3,%%mm3\n"

242  
243 
"lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */ 
244 
"lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */ 
245  
246 
"paddd %%mm2,%%mm1\n"

247 
"paddd %%mm4,%%mm3\n"

248 
"paddd %%mm1,%%mm7\n"

249 
"paddd %%mm3,%%mm7\n"

250  
251 
"decl %%ecx\n"

252 
"jnz 1b\n"

253  
254 
"movq %%mm7,%%mm1\n"

255 
"psrlq $32, %%mm7\n" /* shift hi dword to lo */ 
256 
"paddd %%mm7,%%mm1\n"

257 
"movd %%mm1,%2\n"

258 
: "+r" (pix1), "+r" (pix2), "=r"(tmp) 
259 
: "r" ((x86_reg)line_size) , "m" (h) 
260 
: "%ecx");

261 
return tmp;

262 
} 
263  
264 
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 
265 
int tmp;

266 
__asm__ volatile (

267 
"movl %4,%%ecx\n"

268 
"pxor %%mm0,%%mm0\n" /* mm0 = 0 */ 
269 
"pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ 
270 
"1:\n"

271 
"movq (%0),%%mm1\n" /* mm1 = pix1[07] */ 
272 
"movq (%1),%%mm2\n" /* mm2 = pix2[07] */ 
273 
"movq 8(%0),%%mm3\n" /* mm3 = pix1[815] */ 
274 
"movq 8(%1),%%mm4\n" /* mm4 = pix2[815] */ 
275  
276 
/* todo: mm1mm2, mm3mm4 */

277 
/* algo: subtract mm1 from mm2 with saturation and vice versa */

278 
/* OR the results to get absolute difference */

279 
"movq %%mm1,%%mm5\n"

280 
"movq %%mm3,%%mm6\n"

281 
"psubusb %%mm2,%%mm1\n"

282 
"psubusb %%mm4,%%mm3\n"

283 
"psubusb %%mm5,%%mm2\n"

284 
"psubusb %%mm6,%%mm4\n"

285  
286 
"por %%mm1,%%mm2\n"

287 
"por %%mm3,%%mm4\n"

288  
289 
/* now convert to 16bit vectors so we can square them */

290 
"movq %%mm2,%%mm1\n"

291 
"movq %%mm4,%%mm3\n"

292  
293 
"punpckhbw %%mm0,%%mm2\n"

294 
"punpckhbw %%mm0,%%mm4\n"

295 
"punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ 
296 
"punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ 
297  
298 
"pmaddwd %%mm2,%%mm2\n"

299 
"pmaddwd %%mm4,%%mm4\n"

300 
"pmaddwd %%mm1,%%mm1\n"

301 
"pmaddwd %%mm3,%%mm3\n"

302  
303 
"add %3,%0\n"

304 
"add %3,%1\n"

305  
306 
"paddd %%mm2,%%mm1\n"

307 
"paddd %%mm4,%%mm3\n"

308 
"paddd %%mm1,%%mm7\n"

309 
"paddd %%mm3,%%mm7\n"

310  
311 
"decl %%ecx\n"

312 
"jnz 1b\n"

313  
314 
"movq %%mm7,%%mm1\n"

315 
"psrlq $32, %%mm7\n" /* shift hi dword to lo */ 
316 
"paddd %%mm7,%%mm1\n"

317 
"movd %%mm1,%2\n"

318 
: "+r" (pix1), "+r" (pix2), "=r"(tmp) 
319 
: "r" ((x86_reg)line_size) , "m" (h) 
320 
: "%ecx");

321 
return tmp;

322 
} 
323  
324 
static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 
325 
int tmp;

326 
__asm__ volatile (

327 
"shr $1,%2\n"

328 
"pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */ 
329 
"pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */ 
330 
"1:\n"

331 
"movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][015] */ 
332 
"movdqu (%1),%%xmm2\n" /* mm2 = pix2[0][015] */ 
333 
"movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][015] */ 
334 
"movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][015] */ 
335  
336 
/* todo: mm1mm2, mm3mm4 */

337 
/* algo: subtract mm1 from mm2 with saturation and vice versa */

338 
/* OR the results to get absolute difference */

339 
"movdqa %%xmm1,%%xmm5\n"

340 
"movdqa %%xmm3,%%xmm6\n"

341 
"psubusb %%xmm2,%%xmm1\n"

342 
"psubusb %%xmm4,%%xmm3\n"

343 
"psubusb %%xmm5,%%xmm2\n"

344 
"psubusb %%xmm6,%%xmm4\n"

345  
346 
"por %%xmm1,%%xmm2\n"

347 
"por %%xmm3,%%xmm4\n"

348  
349 
/* now convert to 16bit vectors so we can square them */

350 
"movdqa %%xmm2,%%xmm1\n"

351 
"movdqa %%xmm4,%%xmm3\n"

352  
353 
"punpckhbw %%xmm0,%%xmm2\n"

354 
"punpckhbw %%xmm0,%%xmm4\n"

355 
"punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */ 
356 
"punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */ 
357  
358 
"pmaddwd %%xmm2,%%xmm2\n"

359 
"pmaddwd %%xmm4,%%xmm4\n"

360 
"pmaddwd %%xmm1,%%xmm1\n"

361 
"pmaddwd %%xmm3,%%xmm3\n"

362  
363 
"lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */ 
364 
"lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */ 
365  
366 
"paddd %%xmm2,%%xmm1\n"

367 
"paddd %%xmm4,%%xmm3\n"

368 
"paddd %%xmm1,%%xmm7\n"

369 
"paddd %%xmm3,%%xmm7\n"

370  
371 
"decl %2\n"

372 
"jnz 1b\n"

373  
374 
"movdqa %%xmm7,%%xmm1\n"

375 
"psrldq $8, %%xmm7\n" /* shift hi qword to lo */ 
376 
"paddd %%xmm1,%%xmm7\n"

377 
"movdqa %%xmm7,%%xmm1\n"

378 
"psrldq $4, %%xmm7\n" /* shift hi dword to lo */ 
379 
"paddd %%xmm1,%%xmm7\n"

380 
"movd %%xmm7,%3\n"

381 
: "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp) 
382 
: "r" ((x86_reg)line_size));

383 
return tmp;

384 
} 
385  
386 
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) { 
387 
int tmp;

388 
__asm__ volatile (

389 
"movl %3,%%ecx\n"

390 
"pxor %%mm7,%%mm7\n"

391 
"pxor %%mm6,%%mm6\n"

392  
393 
"movq (%0),%%mm0\n"

394 
"movq %%mm0, %%mm1\n"

395 
"psllq $8, %%mm0\n"

396 
"psrlq $8, %%mm1\n"

397 
"psrlq $8, %%mm0\n"

398 
"movq %%mm0, %%mm2\n"

399 
"movq %%mm1, %%mm3\n"

400 
"punpcklbw %%mm7,%%mm0\n"

401 
"punpcklbw %%mm7,%%mm1\n"

402 
"punpckhbw %%mm7,%%mm2\n"

403 
"punpckhbw %%mm7,%%mm3\n"

404 
"psubw %%mm1, %%mm0\n"

405 
"psubw %%mm3, %%mm2\n"

406  
407 
"add %2,%0\n"

408  
409 
"movq (%0),%%mm4\n"

410 
"movq %%mm4, %%mm1\n"

411 
"psllq $8, %%mm4\n"

412 
"psrlq $8, %%mm1\n"

413 
"psrlq $8, %%mm4\n"

414 
"movq %%mm4, %%mm5\n"

415 
"movq %%mm1, %%mm3\n"

416 
"punpcklbw %%mm7,%%mm4\n"

417 
"punpcklbw %%mm7,%%mm1\n"

418 
"punpckhbw %%mm7,%%mm5\n"

419 
"punpckhbw %%mm7,%%mm3\n"

420 
"psubw %%mm1, %%mm4\n"

421 
"psubw %%mm3, %%mm5\n"

422 
"psubw %%mm4, %%mm0\n"

423 
"psubw %%mm5, %%mm2\n"

424 
"pxor %%mm3, %%mm3\n"

425 
"pxor %%mm1, %%mm1\n"

426 
"pcmpgtw %%mm0, %%mm3\n\t"

427 
"pcmpgtw %%mm2, %%mm1\n\t"

428 
"pxor %%mm3, %%mm0\n"

429 
"pxor %%mm1, %%mm2\n"

430 
"psubw %%mm3, %%mm0\n"

431 
"psubw %%mm1, %%mm2\n"

432 
"paddw %%mm0, %%mm2\n"

433 
"paddw %%mm2, %%mm6\n"

434  
435 
"add %2,%0\n"

436 
"1:\n"

437  
438 
"movq (%0),%%mm0\n"

439 
"movq %%mm0, %%mm1\n"

440 
"psllq $8, %%mm0\n"

441 
"psrlq $8, %%mm1\n"

442 
"psrlq $8, %%mm0\n"

443 
"movq %%mm0, %%mm2\n"

444 
"movq %%mm1, %%mm3\n"

445 
"punpcklbw %%mm7,%%mm0\n"

446 
"punpcklbw %%mm7,%%mm1\n"

447 
"punpckhbw %%mm7,%%mm2\n"

448 
"punpckhbw %%mm7,%%mm3\n"

449 
"psubw %%mm1, %%mm0\n"

450 
"psubw %%mm3, %%mm2\n"

451 
"psubw %%mm0, %%mm4\n"

452 
"psubw %%mm2, %%mm5\n"

453 
"pxor %%mm3, %%mm3\n"

454 
"pxor %%mm1, %%mm1\n"

455 
"pcmpgtw %%mm4, %%mm3\n\t"

456 
"pcmpgtw %%mm5, %%mm1\n\t"

457 
"pxor %%mm3, %%mm4\n"

458 
"pxor %%mm1, %%mm5\n"

459 
"psubw %%mm3, %%mm4\n"

460 
"psubw %%mm1, %%mm5\n"

461 
"paddw %%mm4, %%mm5\n"

462 
"paddw %%mm5, %%mm6\n"

463  
464 
"add %2,%0\n"

465  
466 
"movq (%0),%%mm4\n"

467 
"movq %%mm4, %%mm1\n"

468 
"psllq $8, %%mm4\n"

469 
"psrlq $8, %%mm1\n"

470 
"psrlq $8, %%mm4\n"

471 
"movq %%mm4, %%mm5\n"

472 
"movq %%mm1, %%mm3\n"

473 
"punpcklbw %%mm7,%%mm4\n"

474 
"punpcklbw %%mm7,%%mm1\n"

475 
"punpckhbw %%mm7,%%mm5\n"

476 
"punpckhbw %%mm7,%%mm3\n"

477 
"psubw %%mm1, %%mm4\n"

478 
"psubw %%mm3, %%mm5\n"

479 
"psubw %%mm4, %%mm0\n"

480 
"psubw %%mm5, %%mm2\n"

481 
"pxor %%mm3, %%mm3\n"

482 
"pxor %%mm1, %%mm1\n"

483 
"pcmpgtw %%mm0, %%mm3\n\t"

484 
"pcmpgtw %%mm2, %%mm1\n\t"

485 
"pxor %%mm3, %%mm0\n"

486 
"pxor %%mm1, %%mm2\n"

487 
"psubw %%mm3, %%mm0\n"

488 
"psubw %%mm1, %%mm2\n"

489 
"paddw %%mm0, %%mm2\n"

490 
"paddw %%mm2, %%mm6\n"

491  
492 
"add %2,%0\n"

493 
"subl $2, %%ecx\n"

494 
" jnz 1b\n"

495  
496 
"movq %%mm6, %%mm0\n"

497 
"punpcklwd %%mm7,%%mm0\n"

498 
"punpckhwd %%mm7,%%mm6\n"

499 
"paddd %%mm0, %%mm6\n"

500  
501 
"movq %%mm6,%%mm0\n"

502 
"psrlq $32, %%mm6\n"

503 
"paddd %%mm6,%%mm0\n"

504 
"movd %%mm0,%1\n"

505 
: "+r" (pix1), "=r"(tmp) 
506 
: "r" ((x86_reg)line_size) , "g" (h2) 
507 
: "%ecx");

508 
return tmp;

509 
} 
510  
511 
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) { 
512 
int tmp;

513 
uint8_t * pix= pix1; 
514 
__asm__ volatile (

515 
"movl %3,%%ecx\n"

516 
"pxor %%mm7,%%mm7\n"

517 
"pxor %%mm6,%%mm6\n"

518  
519 
"movq (%0),%%mm0\n"

520 
"movq 1(%0),%%mm1\n"

521 
"movq %%mm0, %%mm2\n"

522 
"movq %%mm1, %%mm3\n"

523 
"punpcklbw %%mm7,%%mm0\n"

524 
"punpcklbw %%mm7,%%mm1\n"

525 
"punpckhbw %%mm7,%%mm2\n"

526 
"punpckhbw %%mm7,%%mm3\n"

527 
"psubw %%mm1, %%mm0\n"

528 
"psubw %%mm3, %%mm2\n"

529  
530 
"add %2,%0\n"

531  
532 
"movq (%0),%%mm4\n"

533 
"movq 1(%0),%%mm1\n"

534 
"movq %%mm4, %%mm5\n"

535 
"movq %%mm1, %%mm3\n"

536 
"punpcklbw %%mm7,%%mm4\n"

537 
"punpcklbw %%mm7,%%mm1\n"

538 
"punpckhbw %%mm7,%%mm5\n"

539 
"punpckhbw %%mm7,%%mm3\n"

540 
"psubw %%mm1, %%mm4\n"

541 
"psubw %%mm3, %%mm5\n"

542 
"psubw %%mm4, %%mm0\n"

543 
"psubw %%mm5, %%mm2\n"

544 
"pxor %%mm3, %%mm3\n"

545 
"pxor %%mm1, %%mm1\n"

546 
"pcmpgtw %%mm0, %%mm3\n\t"

547 
"pcmpgtw %%mm2, %%mm1\n\t"

548 
"pxor %%mm3, %%mm0\n"

549 
"pxor %%mm1, %%mm2\n"

550 
"psubw %%mm3, %%mm0\n"

551 
"psubw %%mm1, %%mm2\n"

552 
"paddw %%mm0, %%mm2\n"

553 
"paddw %%mm2, %%mm6\n"

554  
555 
"add %2,%0\n"

556 
"1:\n"

557  
558 
"movq (%0),%%mm0\n"

559 
"movq 1(%0),%%mm1\n"

560 
"movq %%mm0, %%mm2\n"

561 
"movq %%mm1, %%mm3\n"

562 
"punpcklbw %%mm7,%%mm0\n"

563 
"punpcklbw %%mm7,%%mm1\n"

564 
"punpckhbw %%mm7,%%mm2\n"

565 
"punpckhbw %%mm7,%%mm3\n"

566 
"psubw %%mm1, %%mm0\n"

567 
"psubw %%mm3, %%mm2\n"

568 
"psubw %%mm0, %%mm4\n"

569 
"psubw %%mm2, %%mm5\n"

570 
"pxor %%mm3, %%mm3\n"

571 
"pxor %%mm1, %%mm1\n"

572 
"pcmpgtw %%mm4, %%mm3\n\t"

573 
"pcmpgtw %%mm5, %%mm1\n\t"

574 
"pxor %%mm3, %%mm4\n"

575 
"pxor %%mm1, %%mm5\n"

576 
"psubw %%mm3, %%mm4\n"

577 
"psubw %%mm1, %%mm5\n"

578 
"paddw %%mm4, %%mm5\n"

579 
"paddw %%mm5, %%mm6\n"

580  
581 
"add %2,%0\n"

582  
583 
"movq (%0),%%mm4\n"

584 
"movq 1(%0),%%mm1\n"

585 
"movq %%mm4, %%mm5\n"

586 
"movq %%mm1, %%mm3\n"

587 
"punpcklbw %%mm7,%%mm4\n"

588 
"punpcklbw %%mm7,%%mm1\n"

589 
"punpckhbw %%mm7,%%mm5\n"

590 
"punpckhbw %%mm7,%%mm3\n"

591 
"psubw %%mm1, %%mm4\n"

592 
"psubw %%mm3, %%mm5\n"

593 
"psubw %%mm4, %%mm0\n"

594 
"psubw %%mm5, %%mm2\n"

595 
"pxor %%mm3, %%mm3\n"

596 
"pxor %%mm1, %%mm1\n"

597 
"pcmpgtw %%mm0, %%mm3\n\t"

598 
"pcmpgtw %%mm2, %%mm1\n\t"

599 
"pxor %%mm3, %%mm0\n"

600 
"pxor %%mm1, %%mm2\n"

601 
"psubw %%mm3, %%mm0\n"

602 
"psubw %%mm1, %%mm2\n"

603 
"paddw %%mm0, %%mm2\n"

604 
"paddw %%mm2, %%mm6\n"

605  
606 
"add %2,%0\n"

607 
"subl $2, %%ecx\n"

608 
" jnz 1b\n"

609  
610 
"movq %%mm6, %%mm0\n"

611 
"punpcklwd %%mm7,%%mm0\n"

612 
"punpckhwd %%mm7,%%mm6\n"

613 
"paddd %%mm0, %%mm6\n"

614  
615 
"movq %%mm6,%%mm0\n"

616 
"psrlq $32, %%mm6\n"

617 
"paddd %%mm6,%%mm0\n"

618 
"movd %%mm0,%1\n"

619 
: "+r" (pix1), "=r"(tmp) 
620 
: "r" ((x86_reg)line_size) , "g" (h2) 
621 
: "%ecx");

622 
return tmp + hf_noise8_mmx(pix+8, line_size, h); 
623 
} 
624  
625 
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 
626 
MpegEncContext *c = p; 
627 
int score1, score2;

628  
629 
if(c) score1 = c>dsp.sse[0](c, pix1, pix2, line_size, h); 
630 
else score1 = sse16_mmx(c, pix1, pix2, line_size, h);

631 
score2= hf_noise16_mmx(pix1, line_size, h)  hf_noise16_mmx(pix2, line_size, h); 
632  
633 
if(c) return score1 + FFABS(score2)*c>avctx>nsse_weight; 
634 
else return score1 + FFABS(score2)*8; 
635 
} 
636  
637 
static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 
638 
MpegEncContext *c = p; 
639 
int score1= sse8_mmx(c, pix1, pix2, line_size, h);

640 
int score2= hf_noise8_mmx(pix1, line_size, h)  hf_noise8_mmx(pix2, line_size, h);

641  
642 
if(c) return score1 + FFABS(score2)*c>avctx>nsse_weight; 
643 
else return score1 + FFABS(score2)*8; 
644 
} 
645  
646 
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { 
647 
int tmp;

648  
649 
assert( (((int)pix) & 7) == 0); 
650 
assert((line_size &7) ==0); 
651  
652 
#define SUM(in0, in1, out0, out1) \

653 
"movq (%0), %%mm2\n"\

654 
"movq 8(%0), %%mm3\n"\

655 
"add %2,%0\n"\

656 
"movq %%mm2, " #out0 "\n"\ 
657 
"movq %%mm3, " #out1 "\n"\ 
658 
"psubusb " #in0 ", %%mm2\n"\ 
659 
"psubusb " #in1 ", %%mm3\n"\ 
660 
"psubusb " #out0 ", " #in0 "\n"\ 
661 
"psubusb " #out1 ", " #in1 "\n"\ 
662 
"por %%mm2, " #in0 "\n"\ 
663 
"por %%mm3, " #in1 "\n"\ 
664 
"movq " #in0 ", %%mm2\n"\ 
665 
"movq " #in1 ", %%mm3\n"\ 
666 
"punpcklbw %%mm7, " #in0 "\n"\ 
667 
"punpcklbw %%mm7, " #in1 "\n"\ 
668 
"punpckhbw %%mm7, %%mm2\n"\

669 
"punpckhbw %%mm7, %%mm3\n"\

670 
"paddw " #in1 ", " #in0 "\n"\ 
671 
"paddw %%mm3, %%mm2\n"\

672 
"paddw %%mm2, " #in0 "\n"\ 
673 
"paddw " #in0 ", %%mm6\n" 
674  
675  
676 
__asm__ volatile (

677 
"movl %3,%%ecx\n"

678 
"pxor %%mm6,%%mm6\n"

679 
"pxor %%mm7,%%mm7\n"

680 
"movq (%0),%%mm0\n"

681 
"movq 8(%0),%%mm1\n"

682 
"add %2,%0\n"

683 
"jmp 2f\n"

684 
"1:\n"

685  
686 
SUM(%%mm4, %%mm5, %%mm0, %%mm1) 
687 
"2:\n"

688 
SUM(%%mm0, %%mm1, %%mm4, %%mm5) 
689  
690 
"subl $2, %%ecx\n"

691 
"jnz 1b\n"

692  
693 
"movq %%mm6,%%mm0\n"

694 
"psrlq $32, %%mm6\n"

695 
"paddw %%mm6,%%mm0\n"

696 
"movq %%mm0,%%mm6\n"

697 
"psrlq $16, %%mm0\n"

698 
"paddw %%mm6,%%mm0\n"

699 
"movd %%mm0,%1\n"

700 
: "+r" (pix), "=r"(tmp) 
701 
: "r" ((x86_reg)line_size) , "m" (h) 
702 
: "%ecx");

703 
return tmp & 0xFFFF; 
704 
} 
705 
#undef SUM

706  
707 
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { 
708 
int tmp;

709  
710 
assert( (((int)pix) & 7) == 0); 
711 
assert((line_size &7) ==0); 
712  
713 
#define SUM(in0, in1, out0, out1) \

714 
"movq (%0), " #out0 "\n"\ 
715 
"movq 8(%0), " #out1 "\n"\ 
716 
"add %2,%0\n"\

717 
"psadbw " #out0 ", " #in0 "\n"\ 
718 
"psadbw " #out1 ", " #in1 "\n"\ 
719 
"paddw " #in1 ", " #in0 "\n"\ 
720 
"paddw " #in0 ", %%mm6\n" 
721  
722 
__asm__ volatile (

723 
"movl %3,%%ecx\n"

724 
"pxor %%mm6,%%mm6\n"

725 
"pxor %%mm7,%%mm7\n"

726 
"movq (%0),%%mm0\n"

727 
"movq 8(%0),%%mm1\n"

728 
"add %2,%0\n"

729 
"jmp 2f\n"

730 
"1:\n"

731  
732 
SUM(%%mm4, %%mm5, %%mm0, %%mm1) 
733 
"2:\n"

734 
SUM(%%mm0, %%mm1, %%mm4, %%mm5) 
735  
736 
"subl $2, %%ecx\n"

737 
"jnz 1b\n"

738  
739 
"movd %%mm6,%1\n"

740 
: "+r" (pix), "=r"(tmp) 
741 
: "r" ((x86_reg)line_size) , "m" (h) 
742 
: "%ecx");

743 
return tmp;

744 
} 
745 
#undef SUM

746  
747 
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 
748 
int tmp;

749  
750 
assert( (((int)pix1) & 7) == 0); 
751 
assert( (((int)pix2) & 7) == 0); 
752 
assert((line_size &7) ==0); 
753  
754 
#define SUM(in0, in1, out0, out1) \

755 
"movq (%0),%%mm2\n"\

756 
"movq (%1)," #out0 "\n"\ 
757 
"movq 8(%0),%%mm3\n"\

758 
"movq 8(%1)," #out1 "\n"\ 
759 
"add %3,%0\n"\

760 
"add %3,%1\n"\

761 
"psubb " #out0 ", %%mm2\n"\ 
762 
"psubb " #out1 ", %%mm3\n"\ 
763 
"pxor %%mm7, %%mm2\n"\

764 
"pxor %%mm7, %%mm3\n"\

765 
"movq %%mm2, " #out0 "\n"\ 
766 
"movq %%mm3, " #out1 "\n"\ 
767 
"psubusb " #in0 ", %%mm2\n"\ 
768 
"psubusb " #in1 ", %%mm3\n"\ 
769 
"psubusb " #out0 ", " #in0 "\n"\ 
770 
"psubusb " #out1 ", " #in1 "\n"\ 
771 
"por %%mm2, " #in0 "\n"\ 
772 
"por %%mm3, " #in1 "\n"\ 
773 
"movq " #in0 ", %%mm2\n"\ 
774 
"movq " #in1 ", %%mm3\n"\ 
775 
"punpcklbw %%mm7, " #in0 "\n"\ 
776 
"punpcklbw %%mm7, " #in1 "\n"\ 
777 
"punpckhbw %%mm7, %%mm2\n"\

778 
"punpckhbw %%mm7, %%mm3\n"\

779 
"paddw " #in1 ", " #in0 "\n"\ 
780 
"paddw %%mm3, %%mm2\n"\

781 
"paddw %%mm2, " #in0 "\n"\ 
782 
"paddw " #in0 ", %%mm6\n" 
783  
784  
785 
__asm__ volatile (

786 
"movl %4,%%ecx\n"

787 
"pxor %%mm6,%%mm6\n"

788 
"pcmpeqw %%mm7,%%mm7\n"

789 
"psllw $15, %%mm7\n"

790 
"packsswb %%mm7, %%mm7\n"

791 
"movq (%0),%%mm0\n"

792 
"movq (%1),%%mm2\n"

793 
"movq 8(%0),%%mm1\n"

794 
"movq 8(%1),%%mm3\n"

795 
"add %3,%0\n"

796 
"add %3,%1\n"

797 
"psubb %%mm2, %%mm0\n"

798 
"psubb %%mm3, %%mm1\n"

799 
"pxor %%mm7, %%mm0\n"

800 
"pxor %%mm7, %%mm1\n"

801 
"jmp 2f\n"

802 
"1:\n"

803  
804 
SUM(%%mm4, %%mm5, %%mm0, %%mm1) 
805 
"2:\n"

806 
SUM(%%mm0, %%mm1, %%mm4, %%mm5) 
807  
808 
"subl $2, %%ecx\n"

809 
"jnz 1b\n"

810  
811 
"movq %%mm6,%%mm0\n"

812 
"psrlq $32, %%mm6\n"

813 
"paddw %%mm6,%%mm0\n"

814 
"movq %%mm0,%%mm6\n"

815 
"psrlq $16, %%mm0\n"

816 
"paddw %%mm6,%%mm0\n"

817 
"movd %%mm0,%2\n"

818 
: "+r" (pix1), "+r" (pix2), "=r"(tmp) 
819 
: "r" ((x86_reg)line_size) , "m" (h) 
820 
: "%ecx");

821 
return tmp & 0x7FFF; 
822 
} 
823 
#undef SUM

824  
825 
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 
826 
int tmp;

827  
828 
assert( (((int)pix1) & 7) == 0); 
829 
assert( (((int)pix2) & 7) == 0); 
830 
assert((line_size &7) ==0); 
831  
832 
#define SUM(in0, in1, out0, out1) \

833 
"movq (%0)," #out0 "\n"\ 
834 
"movq (%1),%%mm2\n"\

835 
"movq 8(%0)," #out1 "\n"\ 
836 
"movq 8(%1),%%mm3\n"\

837 
"add %3,%0\n"\

838 
"add %3,%1\n"\

839 
"psubb %%mm2, " #out0 "\n"\ 
840 
"psubb %%mm3, " #out1 "\n"\ 
841 
"pxor %%mm7, " #out0 "\n"\ 
842 
"pxor %%mm7, " #out1 "\n"\ 
843 
"psadbw " #out0 ", " #in0 "\n"\ 
844 
"psadbw " #out1 ", " #in1 "\n"\ 
845 
"paddw " #in1 ", " #in0 "\n"\ 
846 
"paddw " #in0 ", %%mm6\n" 
847  
848 
__asm__ volatile (

849 
"movl %4,%%ecx\n"

850 
"pxor %%mm6,%%mm6\n"

851 
"pcmpeqw %%mm7,%%mm7\n"

852 
"psllw $15, %%mm7\n"

853 
"packsswb %%mm7, %%mm7\n"

854 
"movq (%0),%%mm0\n"

855 
"movq (%1),%%mm2\n"

856 
"movq 8(%0),%%mm1\n"

857 
"movq 8(%1),%%mm3\n"

858 
"add %3,%0\n"

859 
"add %3,%1\n"

860 
"psubb %%mm2, %%mm0\n"

861 
"psubb %%mm3, %%mm1\n"

862 
"pxor %%mm7, %%mm0\n"

863 
"pxor %%mm7, %%mm1\n"

864 
"jmp 2f\n"

865 
"1:\n"

866  
867 
SUM(%%mm4, %%mm5, %%mm0, %%mm1) 
868 
"2:\n"

869 
SUM(%%mm0, %%mm1, %%mm4, %%mm5) 
870  
871 
"subl $2, %%ecx\n"

872 
"jnz 1b\n"

873  
874 
"movd %%mm6,%2\n"

875 
: "+r" (pix1), "+r" (pix2), "=r"(tmp) 
876 
: "r" ((x86_reg)line_size) , "m" (h) 
877 
: "%ecx");

878 
return tmp;

879 
} 
880 
#undef SUM

881  
882 
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ 
883 
x86_reg i=0;

884 
__asm__ volatile(

885 
"1: \n\t"

886 
"movq (%2, %0), %%mm0 \n\t"

887 
"movq (%1, %0), %%mm1 \n\t"

888 
"psubb %%mm0, %%mm1 \n\t"

889 
"movq %%mm1, (%3, %0) \n\t"

890 
"movq 8(%2, %0), %%mm0 \n\t"

891 
"movq 8(%1, %0), %%mm1 \n\t"

892 
"psubb %%mm0, %%mm1 \n\t"

893 
"movq %%mm1, 8(%3, %0) \n\t"

894 
"add $16, %0 \n\t"

895 
"cmp %4, %0 \n\t"

896 
" jb 1b \n\t"

897 
: "+r" (i)

898 
: "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w15) 
899 
); 
900 
for(; i<w; i++)

901 
dst[i+0] = src1[i+0]src2[i+0]; 
902 
} 
903  
904 
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){ 
905 
x86_reg i=0;

906 
uint8_t l, lt; 
907  
908 
__asm__ volatile(

909 
"1: \n\t"

910 
"movq 1(%1, %0), %%mm0 \n\t" // LT 
911 
"movq (%1, %0), %%mm1 \n\t" // T 
912 
"movq 1(%2, %0), %%mm2 \n\t" // L 
913 
"movq (%2, %0), %%mm3 \n\t" // X 
914 
"movq %%mm2, %%mm4 \n\t" // L 
915 
"psubb %%mm0, %%mm2 \n\t"

916 
"paddb %%mm1, %%mm2 \n\t" // L + T  LT 
917 
"movq %%mm4, %%mm5 \n\t" // L 
918 
"pmaxub %%mm1, %%mm4 \n\t" // max(T, L) 
919 
"pminub %%mm5, %%mm1 \n\t" // min(T, L) 
920 
"pminub %%mm2, %%mm4 \n\t"

921 
"pmaxub %%mm1, %%mm4 \n\t"

922 
"psubb %%mm4, %%mm3 \n\t" // dst  pred 
923 
"movq %%mm3, (%3, %0) \n\t"

924 
"add $8, %0 \n\t"

925 
"cmp %4, %0 \n\t"

926 
" jb 1b \n\t"

927 
: "+r" (i)

928 
: "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w) 
929 
); 
930  
931 
l= *left; 
932 
lt= *left_top; 
933  
934 
dst[0]= src2[0]  mid_pred(l, src1[0], (l + src1[0]  lt)&0xFF); 
935  
936 
*left_top= src1[w1];

937 
*left = src2[w1];

938 
} 
939  
940 
/*
 * DIFF_PIXELS_1: load p1 into register a and p2 into register t with a
 * mov<m> instruction and leave the word-wise difference p1 - p2 in a.
 * No zero register is needed: both bytes are interleaved with the p1 byte
 * as the high half, which cancels in the subtraction.
 */
#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a" \n\t"\
    "mov"#m" "#p2", "#t" \n\t"\
    "punpcklbw "#a", "#t" \n\t"\
    "punpcklbw "#a", "#a" \n\t"\
    "psubw "#t", "#a" \n\t"

/*
 * DIFF_PIXELS_8: compute p1 - p2 for 8 consecutive rows into registers
 * <mm>0 .. <mm>7. Register 7 serves as scratch for rows 0-6, so register 0
 * is parked in temp while row 7 is computed with register 0 as scratch.
 * p1 and p2 are copied to locals and are not modified.
 */
#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    __asm__ volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1 \n\t"\
        "add %4, %2 \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0 \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0 \n\t"\
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
    );\
}
//the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)

/* 4 columns x 8 rows in MMX registers / 8 columns x 8 rows in XMM registers */
#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)

970  
971 
/*
 * Two butterflies at once on word vectors:
 *   a += b;  b += b;  b -= a;
 * which leaves a = a + b and b = b - a (in terms of the original values).
 */
#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 " \n\t"\
    "paddw " #b2 ", " #a2 " \n\t"\
    "paddw " #b1 ", " #b1 " \n\t"\
    "paddw " #b2 ", " #b2 " \n\t"\
    "psubw " #a1 ", " #b1 " \n\t"\
    "psubw " #a2 ", " #b2 " \n\t"

/* Three butterfly stages (distance 1, 2, 4) over eight registers. */
#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
    LBUTTERFLY2(m0, m1, m2, m3)\
    LBUTTERFLY2(m4, m5, m6, m7)\
    LBUTTERFLY2(m0, m2, m1, m3)\
    LBUTTERFLY2(m4, m6, m5, m7)\
    LBUTTERFLY2(m0, m4, m1, m5)\
    LBUTTERFLY2(m2, m6, m3, m7)

#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)

988  
989 
/*
 * Word-wise absolute value of register a, using z as scratch.
 * MMABS (used below) is not defined in this part of the file; it is
 * presumably set to one of the three variants before these macros are
 * expanded — confirm at the point of use.
 */

/* z = (a < 0) ? -1 : 0;  a = (a ^ z) - z */
#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z " \n\t"\
    "pcmpgtw " #a ", " #z " \n\t"\
    "pxor " #z ", " #a " \n\t"\
    "psubw " #z ", " #a " \n\t"

/* z = -a;  a = max(a, z) */
#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z " \n\t"\
    "psubw " #a ", " #z " \n\t"\
    "pmaxsw " #z ", " #a " \n\t"

/* single instruction; z is unused */
#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a " \n\t"

/* sum += |a| with unsigned saturation */
#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum " \n\t"

/* |xmm0..xmm7| summed into xmm0, using xmm8/xmm9 as scratch */
#define MMABS_SUM_8x8_NOSPILL\
    MMABS(%%xmm0, %%xmm8)\
    MMABS(%%xmm1, %%xmm9)\
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
    "paddusw %%xmm1, %%xmm0 \n\t"

#ifdef ARCH_X86_64
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
#else
/* only xmm0..xmm7 exist: spill xmm7 to (%1) so it can serve as scratch,
 * then reload it into xmm2 for the final term */
#define MMABS_SUM_8x8_SSE2\
    "movdqa %%xmm7, (%1) \n\t"\
    MMABS(%%xmm0, %%xmm7)\
    MMABS(%%xmm1, %%xmm7)\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
    "movdqa (%1), %%xmm2 \n\t"\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
    "paddusw %%xmm1, %%xmm0 \n\t"
#endif

1034  
1035 
/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */

/*
 * Horizontal sum of the words in register a (saturating adds), with t as
 * scratch; the result is moved to dst with movd, its low word holding the
 * sum.
 */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t" \n\t"\
    "psrlq $32, "#a" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movq "#a", "#t" \n\t"\
    "psrlq $16, "#a" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movd "#a", "#dst" \n\t"

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "pshufw $0x01, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movd "#a", "#dst" \n\t"

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "pshuflw $0x0E, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "pshuflw $0x01, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movd "#a", "#dst" \n\t"
1062  
1063 
/*
 * HADAMARD8_DIFF_MMX(cpu): defines
 *     static int hadamard8_diff_<cpu>(void *s, uint8_t *src1, uint8_t *src2,
 *                                     int stride, int h)
 * plus, via WRAPPER8_16_SQ, a companion hadamard8_diff16_<cpu>.
 *
 * The block is handled as two 4-pixel-wide halves (src+0 and src+4), since
 * only eight 64-bit MMX registers are available.  Each half is loaded by
 * DIFF_PIXELS_4x8, run through HADAMARD48 and TRANSPOSE4 and parked in
 * temp[]; the second asm statement then runs HADAMARD48 over the transposed
 * data again and folds everything through MMABS/MMABS_SUM and HSUM.
 * (DIFF_PIXELS_4x8, HADAMARD48, TRANSPOSE4, STORE4 and LOAD4 are defined
 * elsewhere in the file; their exact register contracts are assumed here.)
 *
 * temp[] layout in bytes: 0..31 and 32..63 hold the first transposed
 * quarters of the two halves, 64..95 the third quarter (later reused as a
 * spill slot and for the first partial sum), 96..103 is an mm7 spill slot.
 *
 * h must be 8 (asserted).  s is not used.  The result is masked to 16 bits
 * because HSUM only defines the low word; sums saturate at 0xFFFF.
 *
 * MMABS and HSUM must be defined at the point of instantiation.
 * The asm statements access temp[] through a register pointer, hence the
 * "memory" clobber.
 */
#define HADAMARD8_DIFF_MMX(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_8(uint64_t, temp[13]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
\
    __asm__ volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1) \n\t" /* free mm7 as transpose scratch */\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(8, 0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7 \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\
\
        : "=r" (sum) /* not written here; keeps temp at operand %1 */\
        : "r"(temp)\
        : "memory"\
    );\
\
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
\
    __asm__ volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1) \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7 \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\
        "movq %%mm6, %%mm7 \n\t"\
        "movq %%mm0, %%mm6 \n\t"\
\
        LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
\
        HADAMARD48\
        "movq %%mm7, 64(%1) \n\t" /* spill: mm7 becomes MMABS scratch */\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq 64(%1), %%mm2 \n\t" /* reload the spilled register */\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw %%mm1, %%mm0 \n\t"\
        "movq %%mm0, 64(%1) \n\t" /* keep first partial sum */\
\
        LOAD4(8, 0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
        LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\
\
        HADAMARD48\
        "movq %%mm7, (%1) \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq (%1), %%mm2 \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw 64(%1), %%mm0 \n\t" /* add first partial sum */\
        "paddusw %%mm1, %%mm0 \n\t"\
\
        HSUM(%%mm0, %%mm1, %0)\
\
        : "=r" (sum)\
        : "r"(temp)\
        : "memory"\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1145  
1146 
/*
 * HADAMARD8_DIFF_SSE2(cpu): XMM counterpart of HADAMARD8_DIFF_MMX.  Defines
 * hadamard8_diff_<cpu>() and, via WRAPPER8_16_SQ, hadamard8_diff16_<cpu>().
 *
 * DIFF_PIXELS_8x8 loads the whole block at once, so a single asm statement
 * performs HADAMARD8, TRANSPOSE8, HADAMARD8 again, MMABS_SUM_8x8 and the
 * final horizontal sum.  temp (32 bytes, 16-byte aligned) is passed as (%1)
 * to TRANSPOSE8 and, in the spilling MMABS_SUM_8x8 variant, used as a
 * 16-byte spill slot.
 *
 * h must be 8 (asserted).  s is not used.  The result is masked to 16 bits
 * because HSUM_SSE2 only defines the low word.
 *
 * MMABS and MMABS_SUM_8x8 must be defined at the point of instantiation.
 * The asm accesses temp[] through a register pointer, hence the "memory"
 * clobber.
 */
#define HADAMARD8_DIFF_SSE2(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_16(uint64_t, temp[4]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
\
    __asm__ volatile(\
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
        MMABS_SUM_8x8\
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
        : "=r" (sum)\
        : "r"(temp)\
        : "memory"\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1167  
1168 
#define MMABS(a,z) MMABS_MMX(a,z)

1169 
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)

1170 
HADAMARD8_DIFF_MMX(mmx) 
1171 
#undef MMABS

1172 
#undef HSUM

1173  
1174 
#define MMABS(a,z) MMABS_MMX2(a,z)

1175 
#define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2

1176 
#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)

1177 
HADAMARD8_DIFF_MMX(mmx2) 
1178 
HADAMARD8_DIFF_SSE2(sse2) 
1179 
#undef MMABS

1180 
#undef MMABS_SUM_8x8

1181 
#undef HSUM

1182  
1183 
#ifdef HAVE_SSSE3

1184 
#define MMABS(a,z) MMABS_SSSE3(a,z)

1185 
#define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL

1186 
HADAMARD8_DIFF_SSE2(ssse3) 
1187 
#undef MMABS

1188 
#undef MMABS_SUM_8x8

1189 
#endif

1190  
1191 
/*
 * DCT_SAD4(m, mm, o): load four registers from o+0, o+16, o+32 and o+48
 * bytes past (%1) into <mm>2..<mm>5 and accumulate them with MMABS_SUM into
 * <mm>0 and <mm>1, using <mm>6 and <mm>7 as scratch.
 *
 *   m   suffix of the move instruction ("q" -> movq, "dqa" -> movdqa)
 *   mm  register name prefix (%%mm or %%xmm)
 *   o   byte offset added to each load address
 */
#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)
1200  
1201 
/*
 * DCT_SAD_MMX: asm body that accumulates the 128 bytes at (%1) with 8-byte
 * loads.  The four DCT_SAD4 invocations at offsets 0, 8, 64 and 72 together
 * touch every 8-byte slot in [0, 128).  The two accumulators mm0/mm1 are
 * zeroed first, merged with paddusw and reduced into operand %0 by HSUM.
 */
#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0 \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0 \n\t"\
    HSUM(%%mm0, %%mm1, %0)

1210  
1211 
/*
 * DCT_SAD_SSE2: XMM version of DCT_SAD_MMX.  Two DCT_SAD4 invocations with
 * 16-byte aligned loads (movdqa) at offsets 0 and 64 cover the same 128
 * bytes at (%1); the result is reduced into operand %0 by HSUM.
 */
#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0 \n\t"\
    "pxor %%xmm1, %%xmm1 \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0 \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

1218  
1219 
/*
 * DCT_SAD_FUNC(cpu): defines static int sum_abs_dctelem_<cpu>(DCTELEM *block),
 * which runs the currently selected DCT_SAD asm body over block and returns
 * the low 16 bits of the result (the sum saturates at 0xFFFF, see the FIXME
 * above the HSUM_* macros).
 *
 * DCT_SAD, HSUM and MMABS must be defined at the point of instantiation.
 * The asm reads block[] through a register pointer that is not listed as a
 * memory operand, hence the "memory" clobber.
 */
#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    __asm__ volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
        :"memory"\
    );\
    return sum&0xFFFF;\
}
1229  
1230 
#define DCT_SAD DCT_SAD_MMX

1231 
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)

1232 
#define MMABS(a,z) MMABS_MMX(a,z)

1233 
DCT_SAD_FUNC(mmx) 
1234 
#undef MMABS

1235 
#undef HSUM

1236  
1237 
#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)

1238 
#define MMABS(a,z) MMABS_MMX2(a,z)

1239 
DCT_SAD_FUNC(mmx2) 
1240 
#undef HSUM

1241 
#undef DCT_SAD

1242  
1243 
#define DCT_SAD DCT_SAD_SSE2

1244 
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)

1245 
DCT_SAD_FUNC(sse2) 
1246 
#undef MMABS

1247  
1248 
#ifdef HAVE_SSSE3

1249 
#define MMABS(a,z) MMABS_SSSE3(a,z)

1250 
DCT_SAD_FUNC(ssse3) 
1251 
#undef MMABS

1252 
#endif

1253 
#undef HSUM

1254 
#undef DCT_SAD

1255  
1256 
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){ 
1257 
int sum;

1258 
x86_reg i=size; 
1259 
__asm__ volatile(

1260 
"pxor %%mm4, %%mm4 \n"

1261 
"1: \n"

1262 
"sub $8, %0 \n"

1263 
"movq (%2,%0), %%mm2 \n"

1264 
"movq (%3,%0,2), %%mm0 \n"

1265 
"movq 8(%3,%0,2), %%mm1 \n"

1266 
"punpckhbw %%mm2, %%mm3 \n"

1267 
"punpcklbw %%mm2, %%mm2 \n"

1268 
"psraw $8, %%mm3 \n"

1269 
"psraw $8, %%mm2 \n"

1270 
"psubw %%mm3, %%mm1 \n"

1271 
"psubw %%mm2, %%mm0 \n"

1272 
"pmaddwd %%mm1, %%mm1 \n"

1273 
"pmaddwd %%mm0, %%mm0 \n"

1274 
"paddd %%mm1, %%mm4 \n"

1275 
"paddd %%mm0, %%mm4 \n"

1276 
"jg 1b \n"

1277 
"movq %%mm4, %%mm3 \n"

1278 
"psrlq $32, %%mm3 \n"

1279 
"paddd %%mm3, %%mm4 \n"

1280 
"movd %%mm4, %1 \n"

1281 
:"+r"(i), "=r"(sum) 
1282 
:"r"(pix1), "r"(pix2) 
1283 
); 
1284 
return sum;

1285 
} 
1286  
1287 
/*
 * PHADDD(a, t): add the high 32-bit half of MMX register a to its low half.
 * The previous value of a is copied to t, a is shifted right by 32 bits and
 * t is added back with paddd, so the low dword of a holds the sum of both
 * dwords.  Both a and t are clobbered.
 */
#define PHADDD(a, t)\
    "movq "#a", "#t" \n\t"\
    "psrlq $32, "#a" \n\t"\
    "paddd "#t", "#a" \n\t"
1291 
/*
   pmulhw:   dst[0-15]=(src[0-15]*dst[0-15])[16-31]
   pmulhrw:  dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
*/

1296 
/*
 * PMULHRW(x, y, s, o), plain-MMX flavour: multiply registers x and y by s
 * keeping the high 16 bits of each product (pmulhw), add the offset
 * register o, then arithmetic-shift right by one.  This definition is in
 * effect for the _mmx instantiation of dsputil_mmx_qns_template.c.
 */
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x " \n\t"\
    "pmulhw " #s ", "#y " \n\t"\
    "paddw " #o ", "#x " \n\t"\
    "paddw " #o ", "#y " \n\t"\
    "psraw $1, "#x " \n\t"\
    "psraw $1, "#y " \n\t"
1303 
/* dsputil_mmx_qns_template.c is included once per CPU flavour.  DEF() picks
 * the function-name suffix; SET_RND, SCALE_OFFSET, PMULHRW and PHADDD select
 * the flavour-specific pieces the template uses. */

/* MMX flavour (uses the PMULHRW and PHADDD definitions preceding this). */
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

/* 3DNow! flavour: a single rounding multiply (pmulhrw), no rounding
 * constant to set up. */
#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x " \n\t"\
    "pmulhrw " #s ", "#y " \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

/* SSSE3 flavour: pmulhrsw, and a pshufw-based PHADDD. */
#ifdef HAVE_SSSE3
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET 1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t" \n\t"\
    "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x " \n\t"\
    "pmulhrsw " #s ", "#y " \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3
1348  
1349  
1350 
/* FLAC specific */

1351 
/* Implemented outside this file; installed below as
 * DSPContext.flac_compute_autocorr when SSE2 is available. */
void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
                                   double *autoc);

1353  
1354  
1355 
/**
 * Install the x86 SIMD encoder routines into a DSPContext.
 *
 * Function pointers are assigned in increasing order of capability (MMX,
 * then MMXEXT, SSE2, SSSE3, 3DNow!), so a later matching block overrides the
 * pointers set by an earlier one.  Nothing is installed here unless
 * FF_MM_MMX is set in mm_flags; dsputil_init_pix_mmx() is called in every
 * case.
 *
 * @param c     context whose function pointers are filled in
 * @param avctx codec context; dct_algo selects whether the SIMD fdct is
 *              used, and CODEC_FLAG_BITEXACT in flags suppresses the vsad[0]
 *              and try_8x8basis replacements
 */
void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    if (mm_flags & FF_MM_MMX) {
        const int dct_algo = avctx->dct_algo;
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
            if(mm_flags & FF_MM_SSE2){
                c->fdct = ff_fdct_sse2;
            }else if(mm_flags & FF_MM_MMXEXT){
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;
            }
        }

        c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
        c->pix_sum = pix_sum16_mmx;

        c->diff_bytes= diff_bytes_mmx;
        c->sum_abs_dctelem= sum_abs_dctelem_mmx;

        c->hadamard8_diff[0]= hadamard8_diff16_mmx;
        c->hadamard8_diff[1]= hadamard8_diff_mmx;

        c->pix_norm1 = pix_norm1_mmx;
        c->sse[0] = (mm_flags & FF_MM_SSE2) ? sse16_sse2 : sse16_mmx;
        c->sse[1] = sse8_mmx;
        c->vsad[4]= vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->vsad[0] = vsad16_mmx;
        }

        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->try_8x8basis= try_8x8basis_mmx;
        }
        c->add_8x8basis= add_8x8basis_mmx;

        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;


        if (mm_flags & FF_MM_MMXEXT) {
            c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
            c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
            c->hadamard8_diff[1]= hadamard8_diff_mmx2;
            c->vsad[4]= vsad_intra16_mmx2;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->vsad[0] = vsad16_mmx2;
            }

            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
        }

        if(mm_flags & FF_MM_SSE2){
            c->get_pixels = get_pixels_sse2;
            c->sum_abs_dctelem= sum_abs_dctelem_sse2;
            c->hadamard8_diff[0]= hadamard8_diff16_sse2;
            c->hadamard8_diff[1]= hadamard8_diff_sse2;
            if (ENABLE_FLAC_ENCODER)
                c->flac_compute_autocorr = ff_flac_compute_autocorr_sse2;
        }

#ifdef HAVE_SSSE3
        if(mm_flags & FF_MM_SSSE3){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_ssse3;
            }
            c->add_8x8basis= add_8x8basis_ssse3;
            c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
            c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
            c->hadamard8_diff[1]= hadamard8_diff_ssse3;
        }
#endif

        /* Checked after SSSE3, so the 3DNow! basis routines take precedence
         * when both flags are set. */
        if(mm_flags & FF_MM_3DNOW){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_3dnow;
            }
            c->add_8x8basis= add_8x8basis_3dnow;
        }
    }

    dsputil_init_pix_mmx(c, avctx);
}