/* Source: ffmpeg / libavcodec / i386 / motion_est_mmx.c @ 766324fc */
/*
 * MMX optimized motion estimation
 * Copyright (c) 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer
 *
 * mostly by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "dsputil.h"
#include "x86_cpu.h"

/*
 * Rounding constants for the plain-MMX averaging kernels: entry i holds the
 * value i in each of its four 16-bit lanes. Entry 1 is loaded into mm5 by the
 * x2/y2 wrappers generated by PIX_SAD, and entry 2 is read by sad8_4_mmx at
 * byte offset 16 from the start of the table.
 */
DECLARE_ASM_CONST(8, uint64_t, round_tab[3])={
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,
};

/* The value 1 in each of the eight byte lanes; subtracted by sad8_4_mmx2. */
DECLARE_ASM_CONST(8, uint64_t, bone)= 0x0101010101010101LL;
34  
35 
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) 
36 
{ 
37 
long len= (stride*h);

38 
asm volatile( 
39 
ASMALIGN(4)

40 
"1: \n\t"

41 
"movq (%1, %%"REG_a"), %%mm0 \n\t" 
42 
"movq (%2, %%"REG_a"), %%mm2 \n\t" 
43 
"movq (%2, %%"REG_a"), %%mm4 \n\t" 
44 
"add %3, %%"REG_a" \n\t" 
45 
"psubusb %%mm0, %%mm2 \n\t"

46 
"psubusb %%mm4, %%mm0 \n\t"

47 
"movq (%1, %%"REG_a"), %%mm1 \n\t" 
48 
"movq (%2, %%"REG_a"), %%mm3 \n\t" 
49 
"movq (%2, %%"REG_a"), %%mm5 \n\t" 
50 
"psubusb %%mm1, %%mm3 \n\t"

51 
"psubusb %%mm5, %%mm1 \n\t"

52 
"por %%mm2, %%mm0 \n\t"

53 
"por %%mm1, %%mm3 \n\t"

54 
"movq %%mm0, %%mm1 \n\t"

55 
"movq %%mm3, %%mm2 \n\t"

56 
"punpcklbw %%mm7, %%mm0 \n\t"

57 
"punpckhbw %%mm7, %%mm1 \n\t"

58 
"punpcklbw %%mm7, %%mm3 \n\t"

59 
"punpckhbw %%mm7, %%mm2 \n\t"

60 
"paddw %%mm1, %%mm0 \n\t"

61 
"paddw %%mm3, %%mm2 \n\t"

62 
"paddw %%mm2, %%mm0 \n\t"

63 
"paddw %%mm0, %%mm6 \n\t"

64 
"add %3, %%"REG_a" \n\t" 
65 
" js 1b \n\t"

66 
: "+a" (len)

67 
: "r" (blk1  len), "r" (blk2  len), "r" ((long)stride) 
68 
); 
69 
} 
70  
71 
static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) 
72 
{ 
73 
asm volatile( 
74 
ASMALIGN(4)

75 
"1: \n\t"

76 
"movq (%1), %%mm0 \n\t"

77 
"movq (%1, %3), %%mm1 \n\t"

78 
"psadbw (%2), %%mm0 \n\t"

79 
"psadbw (%2, %3), %%mm1 \n\t"

80 
"paddw %%mm0, %%mm6 \n\t"

81 
"paddw %%mm1, %%mm6 \n\t"

82 
"lea (%1,%3,2), %1 \n\t"

83 
"lea (%2,%3,2), %2 \n\t"

84 
"sub $2, %0 \n\t"

85 
" jg 1b \n\t"

86 
: "+r" (h), "+r" (blk1), "+r" (blk2) 
87 
: "r" ((long)stride) 
88 
); 
89 
} 
90  
91 
static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) 
92 
{ 
93 
int ret;

94 
asm volatile( 
95 
"pxor %%xmm6, %%xmm6 \n\t"

96 
ASMALIGN(4)

97 
"1: \n\t"

98 
"movdqu (%1), %%xmm0 \n\t"

99 
"movdqu (%1, %3), %%xmm1 \n\t"

100 
"psadbw (%2), %%xmm0 \n\t"

101 
"psadbw (%2, %3), %%xmm1 \n\t"

102 
"paddw %%xmm0, %%xmm6 \n\t"

103 
"paddw %%xmm1, %%xmm6 \n\t"

104 
"lea (%1,%3,2), %1 \n\t"

105 
"lea (%2,%3,2), %2 \n\t"

106 
"sub $2, %0 \n\t"

107 
" jg 1b \n\t"

108 
: "+r" (h), "+r" (blk1), "+r" (blk2) 
109 
: "r" ((long)stride) 
110 
); 
111 
asm volatile( 
112 
"movhlps %%xmm6, %%xmm0 \n\t"

113 
"paddw %%xmm0, %%xmm6 \n\t"

114 
"movd %%xmm6, %0 \n\t"

115 
: "=r"(ret)

116 
); 
117 
return ret;

118 
} 
119  
120 
static inline void sad8_x2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) 
121 
{ 
122 
asm volatile( 
123 
ASMALIGN(4)

124 
"1: \n\t"

125 
"movq (%1), %%mm0 \n\t"

126 
"movq (%1, %3), %%mm1 \n\t"

127 
"pavgb 1(%1), %%mm0 \n\t"

128 
"pavgb 1(%1, %3), %%mm1 \n\t"

129 
"psadbw (%2), %%mm0 \n\t"

130 
"psadbw (%2, %3), %%mm1 \n\t"

131 
"paddw %%mm0, %%mm6 \n\t"

132 
"paddw %%mm1, %%mm6 \n\t"

133 
"lea (%1,%3,2), %1 \n\t"

134 
"lea (%2,%3,2), %2 \n\t"

135 
"sub $2, %0 \n\t"

136 
" jg 1b \n\t"

137 
: "+r" (h), "+r" (blk1), "+r" (blk2) 
138 
: "r" ((long)stride) 
139 
); 
140 
} 
141  
142 
static inline void sad8_y2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) 
143 
{ 
144 
asm volatile( 
145 
"movq (%1), %%mm0 \n\t"

146 
"add %3, %1 \n\t"

147 
ASMALIGN(4)

148 
"1: \n\t"

149 
"movq (%1), %%mm1 \n\t"

150 
"movq (%1, %3), %%mm2 \n\t"

151 
"pavgb %%mm1, %%mm0 \n\t"

152 
"pavgb %%mm2, %%mm1 \n\t"

153 
"psadbw (%2), %%mm0 \n\t"

154 
"psadbw (%2, %3), %%mm1 \n\t"

155 
"paddw %%mm0, %%mm6 \n\t"

156 
"paddw %%mm1, %%mm6 \n\t"

157 
"movq %%mm2, %%mm0 \n\t"

158 
"lea (%1,%3,2), %1 \n\t"

159 
"lea (%2,%3,2), %2 \n\t"

160 
"sub $2, %0 \n\t"

161 
" jg 1b \n\t"

162 
: "+r" (h), "+r" (blk1), "+r" (blk2) 
163 
: "r" ((long)stride) 
164 
); 
165 
} 
166  
167 
static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) 
168 
{ 
169 
asm volatile( 
170 
"movq "MANGLE(bone)", %%mm5 \n\t" 
171 
"movq (%1), %%mm0 \n\t"

172 
"pavgb 1(%1), %%mm0 \n\t"

173 
"add %3, %1 \n\t"

174 
ASMALIGN(4)

175 
"1: \n\t"

176 
"movq (%1), %%mm1 \n\t"

177 
"movq (%1,%3), %%mm2 \n\t"

178 
"pavgb 1(%1), %%mm1 \n\t"

179 
"pavgb 1(%1,%3), %%mm2 \n\t"

180 
"psubusb %%mm5, %%mm1 \n\t"

181 
"pavgb %%mm1, %%mm0 \n\t"

182 
"pavgb %%mm2, %%mm1 \n\t"

183 
"psadbw (%2), %%mm0 \n\t"

184 
"psadbw (%2,%3), %%mm1 \n\t"

185 
"paddw %%mm0, %%mm6 \n\t"

186 
"paddw %%mm1, %%mm6 \n\t"

187 
"movq %%mm2, %%mm0 \n\t"

188 
"lea (%1,%3,2), %1 \n\t"

189 
"lea (%2,%3,2), %2 \n\t"

190 
"sub $2, %0 \n\t"

191 
" jg 1b \n\t"

192 
: "+r" (h), "+r" (blk1), "+r" (blk2) 
193 
: "r" ((long)stride) 
194 
); 
195 
} 
196  
197 
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h) 
198 
{ 
199 
long len= (stride*h);

200 
asm volatile( 
201 
ASMALIGN(4)

202 
"1: \n\t"

203 
"movq (%1, %%"REG_a"), %%mm0 \n\t" 
204 
"movq (%2, %%"REG_a"), %%mm1 \n\t" 
205 
"movq (%1, %%"REG_a"), %%mm2 \n\t" 
206 
"movq (%2, %%"REG_a"), %%mm3 \n\t" 
207 
"punpcklbw %%mm7, %%mm0 \n\t"

208 
"punpcklbw %%mm7, %%mm1 \n\t"

209 
"punpckhbw %%mm7, %%mm2 \n\t"

210 
"punpckhbw %%mm7, %%mm3 \n\t"

211 
"paddw %%mm0, %%mm1 \n\t"

212 
"paddw %%mm2, %%mm3 \n\t"

213 
"movq (%3, %%"REG_a"), %%mm4 \n\t" 
214 
"movq (%3, %%"REG_a"), %%mm2 \n\t" 
215 
"paddw %%mm5, %%mm1 \n\t"

216 
"paddw %%mm5, %%mm3 \n\t"

217 
"psrlw $1, %%mm1 \n\t"

218 
"psrlw $1, %%mm3 \n\t"

219 
"packuswb %%mm3, %%mm1 \n\t"

220 
"psubusb %%mm1, %%mm4 \n\t"

221 
"psubusb %%mm2, %%mm1 \n\t"

222 
"por %%mm4, %%mm1 \n\t"

223 
"movq %%mm1, %%mm0 \n\t"

224 
"punpcklbw %%mm7, %%mm0 \n\t"

225 
"punpckhbw %%mm7, %%mm1 \n\t"

226 
"paddw %%mm1, %%mm0 \n\t"

227 
"paddw %%mm0, %%mm6 \n\t"

228 
"add %4, %%"REG_a" \n\t" 
229 
" js 1b \n\t"

230 
: "+a" (len)

231 
: "r" (blk1a  len), "r" (blk1b len), "r" (blk2  len), "r" ((long)stride) 
232 
); 
233 
} 
234  
235 
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) 
236 
{ 
237 
long len= (stride*h);

238 
asm volatile( 
239 
"movq (%1, %%"REG_a"), %%mm0 \n\t" 
240 
"movq 1(%1, %%"REG_a"), %%mm2 \n\t" 
241 
"movq %%mm0, %%mm1 \n\t"

242 
"movq %%mm2, %%mm3 \n\t"

243 
"punpcklbw %%mm7, %%mm0 \n\t"

244 
"punpckhbw %%mm7, %%mm1 \n\t"

245 
"punpcklbw %%mm7, %%mm2 \n\t"

246 
"punpckhbw %%mm7, %%mm3 \n\t"

247 
"paddw %%mm2, %%mm0 \n\t"

248 
"paddw %%mm3, %%mm1 \n\t"

249 
ASMALIGN(4)

250 
"1: \n\t"

251 
"movq (%2, %%"REG_a"), %%mm2 \n\t" 
252 
"movq 1(%2, %%"REG_a"), %%mm4 \n\t" 
253 
"movq %%mm2, %%mm3 \n\t"

254 
"movq %%mm4, %%mm5 \n\t"

255 
"punpcklbw %%mm7, %%mm2 \n\t"

256 
"punpckhbw %%mm7, %%mm3 \n\t"

257 
"punpcklbw %%mm7, %%mm4 \n\t"

258 
"punpckhbw %%mm7, %%mm5 \n\t"

259 
"paddw %%mm4, %%mm2 \n\t"

260 
"paddw %%mm5, %%mm3 \n\t"

261 
"movq 16+"MANGLE(round_tab)", %%mm5 \n\t" 
262 
"paddw %%mm2, %%mm0 \n\t"

263 
"paddw %%mm3, %%mm1 \n\t"

264 
"paddw %%mm5, %%mm0 \n\t"

265 
"paddw %%mm5, %%mm1 \n\t"

266 
"movq (%3, %%"REG_a"), %%mm4 \n\t" 
267 
"movq (%3, %%"REG_a"), %%mm5 \n\t" 
268 
"psrlw $2, %%mm0 \n\t"

269 
"psrlw $2, %%mm1 \n\t"

270 
"packuswb %%mm1, %%mm0 \n\t"

271 
"psubusb %%mm0, %%mm4 \n\t"

272 
"psubusb %%mm5, %%mm0 \n\t"

273 
"por %%mm4, %%mm0 \n\t"

274 
"movq %%mm0, %%mm4 \n\t"

275 
"punpcklbw %%mm7, %%mm0 \n\t"

276 
"punpckhbw %%mm7, %%mm4 \n\t"

277 
"paddw %%mm0, %%mm6 \n\t"

278 
"paddw %%mm4, %%mm6 \n\t"

279 
"movq %%mm2, %%mm0 \n\t"

280 
"movq %%mm3, %%mm1 \n\t"

281 
"add %4, %%"REG_a" \n\t" 
282 
" js 1b \n\t"

283 
: "+a" (len)

284 
: "r" (blk1  len), "r" (blk1 len + stride), "r" (blk2  len), "r" ((long)stride) 
285 
); 
286 
} 
/**
 * Reduce the four 16-bit partial sums held in mm6 to a single value.
 *
 * The upper half of mm6 is folded onto the lower half, then the second word
 * onto the first, using wrapping 16-bit adds; the low word is the result.
 * mm0 and mm6 are overwritten.
 *
 * @return the sum of the four 16-bit lanes of mm6, modulo 65536
 */
static inline int sum_mmx(void)
{
    int ret;
    __asm__ volatile(
        "movq %%mm6, %%mm0 \n\t"
        "psrlq $32, %%mm6 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "movq %%mm6, %%mm0 \n\t"
        "psrlq $16, %%mm6 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "movd %%mm6, %0 \n\t"
        : "=r" (ret)
    );
    /* only the low word holds the total; the word above it is a by-product */
    return ret&0xFFFF;
}
/**
 * Fetch the running total accumulated in mm6 by the MMX2 kernels.
 *
 * Those kernels accumulate psadbw results, so no horizontal reduction is
 * performed here: the low 32 bits of mm6 are returned as they are.
 *
 * @return the low 32 bits of mm6
 */
static inline int sum_mmx2(void)
{
    int ret;
    __asm__ volatile(
        "movd %%mm6, %0 \n\t"
        : "=r" (ret)
    );
    return ret;
}
/**
 * Plain-MMX horizontal half-pel SAD kernel: averages each blk1 byte with its
 * right-hand neighbour by passing blk1 and blk1+1 to sad8_2_mmx.
 * The register contract is that of sad8_2_mmx (mm5, mm6 and mm7 set up by
 * the caller).
 */
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    sad8_2_mmx(blk1, blk1+1, blk2, stride, h);
}
/**
 * Plain-MMX vertical half-pel SAD kernel: averages each blk1 byte with the
 * byte one row below by passing blk1 and blk1+stride to sad8_2_mmx.
 * The register contract is that of sad8_2_mmx (mm5, mm6 and mm7 set up by
 * the caller).
 */
static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    sad8_2_mmx(blk1, blk1+stride, blk2, stride, h);
}
322  
323  
324 
#define PIX_SAD(suf)\

325 
static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ 
326 
{\ 
327 
assert(h==8);\

328 
asm volatile("pxor %%mm7, %%mm7 \n\t"\ 
329 
"pxor %%mm6, %%mm6 \n\t":);\

330 
\ 
331 
sad8_1_ ## suf(blk1, blk2, stride, 8);\ 
332 
\ 
333 
return sum_ ## suf();\ 
334 
}\ 
335 
static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ 
336 
{\ 
337 
assert(h==8);\

338 
asm volatile("pxor %%mm7, %%mm7 \n\t"\ 
339 
"pxor %%mm6, %%mm6 \n\t"\

340 
"movq %0, %%mm5 \n\t"\

341 
:: "m"(round_tab[1]) \ 
342 
);\ 
343 
\ 
344 
sad8_x2a_ ## suf(blk1, blk2, stride, 8);\ 
345 
\ 
346 
return sum_ ## suf();\ 
347 
}\ 
348 
\ 
349 
static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ 
350 
{\ 
351 
assert(h==8);\

352 
asm volatile("pxor %%mm7, %%mm7 \n\t"\ 
353 
"pxor %%mm6, %%mm6 \n\t"\

354 
"movq %0, %%mm5 \n\t"\

355 
:: "m"(round_tab[1]) \ 
356 
);\ 
357 
\ 
358 
sad8_y2a_ ## suf(blk1, blk2, stride, 8);\ 
359 
\ 
360 
return sum_ ## suf();\ 
361 
}\ 
362 
\ 
363 
static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ 
364 
{\ 
365 
assert(h==8);\

366 
asm volatile("pxor %%mm7, %%mm7 \n\t"\ 
367 
"pxor %%mm6, %%mm6 \n\t"\

368 
::);\ 
369 
\ 
370 
sad8_4_ ## suf(blk1, blk2, stride, 8);\ 
371 
\ 
372 
return sum_ ## suf();\ 
373 
}\ 
374 
\ 
375 
static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ 
376 
{\ 
377 
asm volatile("pxor %%mm7, %%mm7 \n\t"\ 
378 
"pxor %%mm6, %%mm6 \n\t":);\

379 
\ 
380 
sad8_1_ ## suf(blk1 , blk2 , stride, h);\ 
381 
sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\ 
382 
\ 
383 
return sum_ ## suf();\ 
384 
}\ 
385 
static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ 
386 
{\ 
387 
asm volatile("pxor %%mm7, %%mm7 \n\t"\ 
388 
"pxor %%mm6, %%mm6 \n\t"\

389 
"movq %0, %%mm5 \n\t"\

390 
:: "m"(round_tab[1]) \ 
391 
);\ 
392 
\ 
393 
sad8_x2a_ ## suf(blk1 , blk2 , stride, h);\ 
394 
sad8_x2a_ ## suf(blk1+8, blk2+8, stride, h);\ 
395 
\ 
396 
return sum_ ## suf();\ 
397 
}\ 
398 
static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ 
399 
{\ 
400 
asm volatile("pxor %%mm7, %%mm7 \n\t"\ 
401 
"pxor %%mm6, %%mm6 \n\t"\

402 
"movq %0, %%mm5 \n\t"\

403 
:: "m"(round_tab[1]) \ 
404 
);\ 
405 
\ 
406 
sad8_y2a_ ## suf(blk1 , blk2 , stride, h);\ 
407 
sad8_y2a_ ## suf(blk1+8, blk2+8, stride, h);\ 
408 
\ 
409 
return sum_ ## suf();\ 
410 
}\ 
411 
static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ 
412 
{\ 
413 
asm volatile("pxor %%mm7, %%mm7 \n\t"\ 
414 
"pxor %%mm6, %%mm6 \n\t"\

415 
::);\ 
416 
\ 
417 
sad8_4_ ## suf(blk1 , blk2 , stride, h);\ 
418 
sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\ 
419 
\ 
420 
return sum_ ## suf();\ 
421 
}\ 
422  
423 
PIX_SAD(mmx) 
424 
PIX_SAD(mmx2) 
425  
426 
void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx)

427 
{ 
428 
if (mm_flags & MM_MMX) {

429 
c>pix_abs[0][0] = sad16_mmx; 
430 
c>pix_abs[0][1] = sad16_x2_mmx; 
431 
c>pix_abs[0][2] = sad16_y2_mmx; 
432 
c>pix_abs[0][3] = sad16_xy2_mmx; 
433 
c>pix_abs[1][0] = sad8_mmx; 
434 
c>pix_abs[1][1] = sad8_x2_mmx; 
435 
c>pix_abs[1][2] = sad8_y2_mmx; 
436 
c>pix_abs[1][3] = sad8_xy2_mmx; 
437  
438 
c>sad[0]= sad16_mmx;

439 
c>sad[1]= sad8_mmx;

440 
} 
441 
if (mm_flags & MM_MMXEXT) {

442 
c>pix_abs[0][0] = sad16_mmx2; 
443 
c>pix_abs[1][0] = sad8_mmx2; 
444  
445 
c>sad[0]= sad16_mmx2;

446 
c>sad[1]= sad8_mmx2;

447  
448 
if(!(avctx>flags & CODEC_FLAG_BITEXACT)){

449 
c>pix_abs[0][1] = sad16_x2_mmx2; 
450 
c>pix_abs[0][2] = sad16_y2_mmx2; 
451 
c>pix_abs[0][3] = sad16_xy2_mmx2; 
452 
c>pix_abs[1][1] = sad8_x2_mmx2; 
453 
c>pix_abs[1][2] = sad8_y2_mmx2; 
454 
c>pix_abs[1][3] = sad8_xy2_mmx2; 
455 
} 
456 
} 
457 
if ((mm_flags & MM_SSE2) && !(mm_flags & MM_3DNOW)) {

458 
c>sad[0]= sad16_sse2;

459 
} 
460 
} 