Revision 1f630b97

View differences:

libavcodec/x86/h264dsp_mmx.c
617 617
        "pavgb    %%mm2,  "#tmp"   \n\t"\
618 618
        "pavgb    "#tmp", "#q2"    \n\t" /* avg(p2,avg(p0,q0)) */\
619 619
        "pxor   "q2addr", "#tmp"   \n\t"\
620
        "pand     %8,     "#tmp"   \n\t" /* (p2^avg(p0,q0))&1 */\
620
        "pand     %9,     "#tmp"   \n\t" /* (p2^avg(p0,q0))&1 */\
621 621
        "psubusb  "#tmp", "#q2"    \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\
622 622
        "movq     "#p1",  "#tmp"   \n\t"\
623 623
        "psubusb  "#tc0", "#tmp"   \n\t"\
......
631 631
    DECLARE_ALIGNED_8(uint64_t, tmp0[2]);
632 632

  
633 633
    __asm__ volatile(
634
        "movq    (%1,%3), %%mm0    \n\t" //p1
635
        "movq    (%1,%3,2), %%mm1  \n\t" //p0
636
        "movq    (%2),    %%mm2    \n\t" //q0
637
        "movq    (%2,%3), %%mm3    \n\t" //q1
638
        H264_DEBLOCK_MASK(%6, %7)
634
        "movq    (%2,%4), %%mm0    \n\t" //p1
635
        "movq    (%2,%4,2), %%mm1  \n\t" //p0
636
        "movq    (%3),    %%mm2    \n\t" //q0
637
        "movq    (%3,%4), %%mm3    \n\t" //q1
638
        H264_DEBLOCK_MASK(%7, %8)
639 639

  
640
        "movd      %5,    %%mm4    \n\t"
640
        "movd      %6,    %%mm4    \n\t"
641 641
        "punpcklbw %%mm4, %%mm4    \n\t"
642 642
        "punpcklwd %%mm4, %%mm4    \n\t"
643 643
        "pcmpeqb   %%mm3, %%mm3    \n\t"
644 644
        "movq      %%mm4, %%mm6    \n\t"
645 645
        "pcmpgtb   %%mm3, %%mm4    \n\t"
646
        "movq      %%mm6, 8+%0     \n\t"
646
        "movq      %%mm6, %1       \n\t"
647 647
        "pand      %%mm4, %%mm7    \n\t"
648 648
        "movq      %%mm7, %0       \n\t"
649 649

  
650 650
        /* filter p1 */
651
        "movq     (%1),   %%mm3    \n\t" //p2
651
        "movq     (%2),   %%mm3    \n\t" //p2
652 652
        DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1
653 653
        "pand     %%mm7,  %%mm6    \n\t" // mask & |p2-p0|<beta
654
        "pand     8+%0,   %%mm7    \n\t" // mask & tc0
654
        "pand     %1,     %%mm7    \n\t" // mask & tc0
655 655
        "movq     %%mm7,  %%mm4    \n\t"
656 656
        "psubb    %%mm6,  %%mm7    \n\t"
657 657
        "pand     %%mm4,  %%mm6    \n\t" // mask & |p2-p0|<beta & tc0
658
        H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4)
658
        H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%2)", "(%2,%4)", %%mm6, %%mm4)
659 659

  
660 660
        /* filter q1 */
661
        "movq    (%2,%3,2), %%mm4  \n\t" //q2
661
        "movq    (%3,%4,2), %%mm4  \n\t" //q2
662 662
        DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1
663 663
        "pand     %0,     %%mm6    \n\t"
664
        "movq     8+%0,   %%mm5    \n\t" // can be merged with the and below but is slower then
664
        "movq     %1,     %%mm5    \n\t" // can be merged with the and below but is slower then
665 665
        "pand     %%mm6,  %%mm5    \n\t"
666 666
        "psubb    %%mm6,  %%mm7    \n\t"
667
        "movq    (%2,%3), %%mm3    \n\t"
668
        H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6)
667
        "movq    (%3,%4), %%mm3    \n\t"
668
        H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%3,%4,2)", "(%3,%4)", %%mm5, %%mm6)
669 669

  
670 670
        /* filter p0, q0 */
671
        H264_DEBLOCK_P0_Q0(%8, unused)
672
        "movq      %%mm1, (%1,%3,2) \n\t"
673
        "movq      %%mm2, (%2)      \n\t"
671
        H264_DEBLOCK_P0_Q0(%9, unused)
672
        "movq      %%mm1, (%2,%4,2) \n\t"
673
        "movq      %%mm2, (%3)      \n\t"
674 674

  
675
        : "=m"(*tmp0)
675
        : "=m"(tmp0[0]), "=m"(tmp0[1])
676 676
        : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride),
677 677
          "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1),
678 678
          "m"(ff_bone)

Also available in: Unified diff