Revision 1f630b97 libavcodec/x86/h264dsp_mmx.c
libavcodec/x86/h264dsp_mmx.c  

617  617 
"pavgb %%mm2, "#tmp" \n\t"\ 
618  618 
"pavgb "#tmp", "#q2" \n\t" /* avg(p2,avg(p0,q0)) */\ 
619  619 
"pxor "q2addr", "#tmp" \n\t"\ 
620 
"pand %8, "#tmp" \n\t" /* (p2^avg(p0,q0))&1 */\


620 
"pand %9, "#tmp" \n\t" /* (p2^avg(p0,q0))&1 */\


621  621 
"psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\ 
622  622 
"movq "#p1", "#tmp" \n\t"\ 
623  623 
"psubusb "#tc0", "#tmp" \n\t"\ 
...  ...  
631  631 
DECLARE_ALIGNED_8(uint64_t, tmp0[2]); 
632  632  
633  633 
__asm__ volatile( 
634 
"movq (%1,%3), %%mm0 \n\t" //p1


635 
"movq (%1,%3,2), %%mm1 \n\t" //p0


636 
"movq (%2), %%mm2 \n\t" //q0


637 
"movq (%2,%3), %%mm3 \n\t" //q1


638 
H264_DEBLOCK_MASK(%6, %7)


634 
"movq (%2,%4), %%mm0 \n\t" //p1


635 
"movq (%2,%4,2), %%mm1 \n\t" //p0


636 
"movq (%3), %%mm2 \n\t" //q0


637 
"movq (%3,%4), %%mm3 \n\t" //q1


638 
H264_DEBLOCK_MASK(%7, %8)


639  639  
640 
"movd %5, %%mm4 \n\t"


640 
"movd %6, %%mm4 \n\t"


641  641 
"punpcklbw %%mm4, %%mm4 \n\t" 
642  642 
"punpcklwd %%mm4, %%mm4 \n\t" 
643  643 
"pcmpeqb %%mm3, %%mm3 \n\t" 
644  644 
"movq %%mm4, %%mm6 \n\t" 
645  645 
"pcmpgtb %%mm3, %%mm4 \n\t" 
646 
"movq %%mm6, 8+%0 \n\t"


646 
"movq %%mm6, %1 \n\t"


647  647 
"pand %%mm4, %%mm7 \n\t" 
648  648 
"movq %%mm7, %0 \n\t" 
649  649  
650  650 
/* filter p1 */ 
651 
"movq (%1), %%mm3 \n\t" //p2


651 
"movq (%2), %%mm3 \n\t" //p2


652  652 
DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // p2p0>beta1 
653  653 
"pand %%mm7, %%mm6 \n\t" // mask & p2p0<beta 
654 
"pand 8+%0, %%mm7 \n\t" // mask & tc0


654 
"pand %1, %%mm7 \n\t" // mask & tc0


655  655 
"movq %%mm7, %%mm4 \n\t" 
656  656 
"psubb %%mm6, %%mm7 \n\t" 
657  657 
"pand %%mm4, %%mm6 \n\t" // mask & p2p0<beta & tc0 
658 
H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4)


658 
H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%2)", "(%2,%4)", %%mm6, %%mm4)


659  659  
660  660 
/* filter q1 */ 
661 
"movq (%2,%3,2), %%mm4 \n\t" //q2


661 
"movq (%3,%4,2), %%mm4 \n\t" //q2


662  662 
DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // q2q0>beta1 
663  663 
"pand %0, %%mm6 \n\t" 
664 
"movq 8+%0, %%mm5 \n\t" // could be merged with the pand below, but that is slower


664 
"movq %1, %%mm5 \n\t" // could be merged with the pand below, but that is slower


665  665 
"pand %%mm6, %%mm5 \n\t" 
666  666 
"psubb %%mm6, %%mm7 \n\t" 
667 
"movq (%2,%3), %%mm3 \n\t"


668 
H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6)


667 
"movq (%3,%4), %%mm3 \n\t"


668 
H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%3,%4,2)", "(%3,%4)", %%mm5, %%mm6)


669  669  
670  670 
/* filter p0, q0 */ 
671 
H264_DEBLOCK_P0_Q0(%8, unused)


672 
"movq %%mm1, (%1,%3,2) \n\t"


673 
"movq %%mm2, (%2) \n\t"


671 
H264_DEBLOCK_P0_Q0(%9, unused)


672 
"movq %%mm1, (%2,%4,2) \n\t"


673 
"movq %%mm2, (%3) \n\t"


674  674  
675 
: "=m"(*tmp0)


675 
: "=m"(tmp0[0]), "=m"(tmp0[1])


676  676 
: "r"(pix3*stride), "r"(pix), "r"((x86_reg)stride), 
677  677 
"m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1), 
678  678 
"m"(ff_bone) 
Also available in: Unified diff