Revision 792a5a7c

View differences:

libavcodec/libpostproc/postprocess.c
33 33
isHorizMinMaxOk		a	E
34 34
doHorizLowPass		E		e	e
35 35
doHorizDefFilter	Ec	Ec	e	e
36
do_a_deblock		Ec	E	Ec	E
36 37
deRing			E		e	e*	Ecp
37 38
Vertical RKAlgo1	E		a	a
38 39
Horizontal RKAlgo1			a	a
......
476 477
/**
477 478
 * accurate deblock filter
478 479
 */
479
static always_inline void do_a_deblock(uint8_t *src, int step, int stride, PPContext *c){
480
static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){
480 481
	int y;
481 482
	const int QP= c->QP;
482 483
	const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
libavcodec/libpostproc/postprocess_template.c
1031 1031
		"psubw %%mm6, %%mm1				\n\t"
1032 1032
#endif
1033 1033

  
1034
		"movd %2, %%mm2					\n\t" // QP
1035
		"punpcklbw %%mm7, %%mm2				\n\t"
1036

  
1034 1037
		"movq %%mm7, %%mm6				\n\t" // 0
1035 1038
		"pcmpgtw %%mm4, %%mm6				\n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1036 1039
		"pxor %%mm6, %%mm4				\n\t"
......
1039 1042
		"pxor %%mm7, %%mm5				\n\t"
1040 1043
		"psubw %%mm7, %%mm5				\n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1041 1044
// 100 opcodes
1042
		"movd %2, %%mm2					\n\t" // QP
1043 1045
		"psllw $3, %%mm2				\n\t" // 8QP
1044 1046
		"movq %%mm2, %%mm3				\n\t" // 8QP
1045 1047
		"pcmpgtw %%mm4, %%mm2				\n\t"
......
2610 2612
#endif
2611 2613
}
2612 2614

  
2615
#ifdef HAVE_MMX
2616
/**
2617
 * accurate deblock filter
2618
 */
2619
static always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){
2620
	int y;
2621
	const int QP= c->QP;
2622
	int64_t dc_mask, eq_mask;
2623
	src+= step*3; // src points one line before the begin of the 8x8 Block (the asm below reads lines 0..9 from here)
2624
//START_TIMER
2625
asm volatile(
2626
		"movq %0, %%mm7					\n\t" 
2627
		"movq %1, %%mm6					\n\t" 
2628
                : : "m" (c->mmxDcOffset[c->nonBQP]),  "m" (c->mmxDcThreshold[c->nonBQP])
2629
                );
2630
                
2631
asm volatile(
2632
		"leal (%2, %3), %%eax				\n\t"
2633
//	0	1	2	3	4	5	6	7	8	9
2634
//	%2	eax	eax+%3	eax+2%3	%2+4%3	eax'	eax'+%3	eax'+2%3	%2+8%3	eax'+4%3	(eax' = eax after the second leal)
2635

  
2636
		"movq (%2), %%mm0				\n\t"
2637
		"movq (%%eax), %%mm1				\n\t"
2638
                "movq %%mm1, %%mm3				\n\t"
2639
                "movq %%mm1, %%mm4				\n\t"
2640
		"psubb %%mm1, %%mm0				\n\t" // mm0 = difference
2641
		"paddb %%mm7, %%mm0				\n\t"
2642
		"pcmpgtb %%mm6, %%mm0				\n\t"
2643

  
2644
		"movq (%%eax,%3), %%mm2				\n\t"
2645
                PMAXUB(%%mm2, %%mm4)
2646
                PMINUB(%%mm2, %%mm3, %%mm5)
2647
		"psubb %%mm2, %%mm1				\n\t"
2648
		"paddb %%mm7, %%mm1				\n\t"
2649
		"pcmpgtb %%mm6, %%mm1				\n\t"
2650
		"paddb %%mm1, %%mm0				\n\t"
2651

  
2652
		"movq (%%eax, %3, 2), %%mm1			\n\t"
2653
                PMAXUB(%%mm1, %%mm4)
2654
                PMINUB(%%mm1, %%mm3, %%mm5)
2655
		"psubb %%mm1, %%mm2				\n\t"
2656
		"paddb %%mm7, %%mm2				\n\t"
2657
		"pcmpgtb %%mm6, %%mm2				\n\t"
2658
		"paddb %%mm2, %%mm0				\n\t"
2659
		
2660
		"leal (%%eax, %3, 4), %%eax			\n\t"
2661

  
2662
		"movq (%2, %3, 4), %%mm2			\n\t"
2663
                PMAXUB(%%mm2, %%mm4)
2664
                PMINUB(%%mm2, %%mm3, %%mm5)
2665
		"psubb %%mm2, %%mm1				\n\t"
2666
		"paddb %%mm7, %%mm1				\n\t"
2667
		"pcmpgtb %%mm6, %%mm1				\n\t"
2668
		"paddb %%mm1, %%mm0				\n\t"
2669

  
2670
		"movq (%%eax), %%mm1				\n\t"
2671
                PMAXUB(%%mm1, %%mm4)
2672
                PMINUB(%%mm1, %%mm3, %%mm5)
2673
		"psubb %%mm1, %%mm2				\n\t"
2674
		"paddb %%mm7, %%mm2				\n\t"
2675
		"pcmpgtb %%mm6, %%mm2				\n\t"
2676
		"paddb %%mm2, %%mm0				\n\t"
2677

  
2678
		"movq (%%eax, %3), %%mm2			\n\t"
2679
                PMAXUB(%%mm2, %%mm4)
2680
                PMINUB(%%mm2, %%mm3, %%mm5)
2681
		"psubb %%mm2, %%mm1				\n\t"
2682
		"paddb %%mm7, %%mm1				\n\t"
2683
		"pcmpgtb %%mm6, %%mm1				\n\t"
2684
		"paddb %%mm1, %%mm0				\n\t"
2685

  
2686
		"movq (%%eax, %3, 2), %%mm1			\n\t"
2687
                PMAXUB(%%mm1, %%mm4)
2688
                PMINUB(%%mm1, %%mm3, %%mm5)
2689
		"psubb %%mm1, %%mm2				\n\t"
2690
		"paddb %%mm7, %%mm2				\n\t"
2691
		"pcmpgtb %%mm6, %%mm2				\n\t"
2692
		"paddb %%mm2, %%mm0				\n\t"
2693

  
2694
		"movq (%2, %3, 8), %%mm2			\n\t"
2695
                PMAXUB(%%mm2, %%mm4)
2696
                PMINUB(%%mm2, %%mm3, %%mm5)
2697
		"psubb %%mm2, %%mm1				\n\t"
2698
		"paddb %%mm7, %%mm1				\n\t"
2699
		"pcmpgtb %%mm6, %%mm1				\n\t"
2700
		"paddb %%mm1, %%mm0				\n\t"
2701

  
2702
		"movq (%%eax, %3, 4), %%mm1			\n\t"
2703
		"psubb %%mm1, %%mm2				\n\t"
2704
		"paddb %%mm7, %%mm2				\n\t"
2705
		"pcmpgtb %%mm6, %%mm2				\n\t"
2706
		"paddb %%mm2, %%mm0				\n\t"
2707
		"psubusb %%mm3, %%mm4				\n\t"
2708

  
2709
                "movq %4, %%mm7					\n\t" // QP,..., QP
2710
		"paddusb %%mm7, %%mm7				\n\t" // 2QP ... 2QP
2711
		"pcmpgtb %%mm4, %%mm7				\n\t" // Diff < 2QP -> FF
2712
		"movq %%mm7, %1					\n\t"
2713

  
2714
		"pxor %%mm6, %%mm6				\n\t"
2715
		"movq %5, %%mm7					\n\t"
2716
		"punpcklbw %%mm7, %%mm7				\n\t"
2717
		"punpcklbw %%mm7, %%mm7				\n\t"
2718
		"punpcklbw %%mm7, %%mm7				\n\t"
2719
		"psubb %%mm0, %%mm6				\n\t"
2720
		"pcmpgtb %%mm7, %%mm6				\n\t"
2721
		"movq %%mm6, %0					\n\t"
2722

  
2723
		: "=m" (eq_mask), "=m" (dc_mask)
2724
		: "r" (src), "r" (step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
2725
		: "%eax"
2726
		);
2727

  
2728
	src+= step; // src points to begin of the 8x8 Block
2729

  
2730
	if(eq_mask != -1LL){
2731
		asm volatile(
2732
		"pxor %%mm7, %%mm7				\n\t"
2733
		"leal -40(%%esp), %%ecx				\n\t" // make space for 4 8-byte vars
2734
		"andl $0xFFFFFFF8, %%ecx			\n\t" // align
2735
//	0	1	2	3	4	5	6	7	8	9
2736
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1	%0+8%1	ecx+4%1
2737

  
2738
		"movq (%0), %%mm0				\n\t"
2739
		"movq %%mm0, %%mm1				\n\t"
2740
		"punpcklbw %%mm7, %%mm0				\n\t" // low part of line 0
2741
		"punpckhbw %%mm7, %%mm1				\n\t" // high part of line 0
2742

  
2743
		"movq (%0, %1), %%mm2				\n\t"
2744
		"leal (%0, %1, 2), %%eax			\n\t"
2745
		"movq %%mm2, %%mm3				\n\t"
2746
		"punpcklbw %%mm7, %%mm2				\n\t" // low part of line 1
2747
		"punpckhbw %%mm7, %%mm3				\n\t" // high part of line 1
2748

  
2749
		"movq (%%eax), %%mm4				\n\t"
2750
		"movq %%mm4, %%mm5				\n\t"
2751
		"punpcklbw %%mm7, %%mm4				\n\t" // low part of line 2
2752
		"punpckhbw %%mm7, %%mm5				\n\t" // high part of line 2
2753

  
2754
		"paddw %%mm0, %%mm0				\n\t" // 2L0
2755
		"paddw %%mm1, %%mm1				\n\t" // 2H0
2756
		"psubw %%mm4, %%mm2				\n\t" // L1 - L2
2757
		"psubw %%mm5, %%mm3				\n\t" // H1 - H2
2758
		"psubw %%mm2, %%mm0				\n\t" // 2L0 - L1 + L2
2759
		"psubw %%mm3, %%mm1				\n\t" // 2H0 - H1 + H2
2760

  
2761
		"psllw $2, %%mm2				\n\t" // 4L1 - 4L2
2762
		"psllw $2, %%mm3				\n\t" // 4H1 - 4H2
2763
		"psubw %%mm2, %%mm0				\n\t" // 2L0 - 5L1 + 5L2
2764
		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2
2765

  
2766
		"movq (%%eax, %1), %%mm2			\n\t"
2767
		"movq %%mm2, %%mm3				\n\t"
2768
		"punpcklbw %%mm7, %%mm2				\n\t" // L3
2769
		"punpckhbw %%mm7, %%mm3				\n\t" // H3
2770

  
2771
		"psubw %%mm2, %%mm0				\n\t" // 2L0 - 5L1 + 5L2 - L3
2772
		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2 - H3
2773
		"psubw %%mm2, %%mm0				\n\t" // 2L0 - 5L1 + 5L2 - 2L3
2774
		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2 - 2H3
2775
		"movq %%mm0, (%%ecx)				\n\t" // 2L0 - 5L1 + 5L2 - 2L3
2776
		"movq %%mm1, 8(%%ecx)				\n\t" // 2H0 - 5H1 + 5H2 - 2H3
2777

  
2778
		"movq (%%eax, %1, 2), %%mm0			\n\t"
2779
		"movq %%mm0, %%mm1				\n\t"
2780
		"punpcklbw %%mm7, %%mm0				\n\t" // L4
2781
		"punpckhbw %%mm7, %%mm1				\n\t" // H4
2782

  
2783
		"psubw %%mm0, %%mm2				\n\t" // L3 - L4
2784
		"psubw %%mm1, %%mm3				\n\t" // H3 - H4
2785
		"movq %%mm2, 16(%%ecx)				\n\t" // L3 - L4
2786
		"movq %%mm3, 24(%%ecx)				\n\t" // H3 - H4
2787
		"paddw %%mm4, %%mm4				\n\t" // 2L2
2788
		"paddw %%mm5, %%mm5				\n\t" // 2H2
2789
		"psubw %%mm2, %%mm4				\n\t" // 2L2 - L3 + L4
2790
		"psubw %%mm3, %%mm5				\n\t" // 2H2 - H3 + H4
2791

  
2792
		"leal (%%eax, %1), %0				\n\t"
2793
		"psllw $2, %%mm2				\n\t" // 4L3 - 4L4
2794
		"psllw $2, %%mm3				\n\t" // 4H3 - 4H4
2795
		"psubw %%mm2, %%mm4				\n\t" // 2L2 - 5L3 + 5L4
2796
		"psubw %%mm3, %%mm5				\n\t" // 2H2 - 5H3 + 5H4
2797
//50 opcodes so far
2798
		"movq (%0, %1, 2), %%mm2			\n\t"
2799
		"movq %%mm2, %%mm3				\n\t"
2800
		"punpcklbw %%mm7, %%mm2				\n\t" // L5
2801
		"punpckhbw %%mm7, %%mm3				\n\t" // H5
2802
		"psubw %%mm2, %%mm4				\n\t" // 2L2 - 5L3 + 5L4 - L5
2803
		"psubw %%mm3, %%mm5				\n\t" // 2H2 - 5H3 + 5H4 - H5
2804
		"psubw %%mm2, %%mm4				\n\t" // 2L2 - 5L3 + 5L4 - 2L5
2805
		"psubw %%mm3, %%mm5				\n\t" // 2H2 - 5H3 + 5H4 - 2H5
2806

  
2807
		"movq (%%eax, %1, 4), %%mm6			\n\t"
2808
		"punpcklbw %%mm7, %%mm6				\n\t" // L6
2809
		"psubw %%mm6, %%mm2				\n\t" // L5 - L6
2810
		"movq (%%eax, %1, 4), %%mm6			\n\t"
2811
		"punpckhbw %%mm7, %%mm6				\n\t" // H6
2812
		"psubw %%mm6, %%mm3				\n\t" // H5 - H6
2813

  
2814
		"paddw %%mm0, %%mm0				\n\t" // 2L4
2815
		"paddw %%mm1, %%mm1				\n\t" // 2H4
2816
		"psubw %%mm2, %%mm0				\n\t" // 2L4 - L5 + L6
2817
		"psubw %%mm3, %%mm1				\n\t" // 2H4 - H5 + H6
2818

  
2819
		"psllw $2, %%mm2				\n\t" // 4L5 - 4L6
2820
		"psllw $2, %%mm3				\n\t" // 4H5 - 4H6
2821
		"psubw %%mm2, %%mm0				\n\t" // 2L4 - 5L5 + 5L6
2822
		"psubw %%mm3, %%mm1				\n\t" // 2H4 - 5H5 + 5H6
2823

  
2824
		"movq (%0, %1, 4), %%mm2			\n\t"
2825
		"movq %%mm2, %%mm3				\n\t"
2826
		"punpcklbw %%mm7, %%mm2				\n\t" // L7
2827
		"punpckhbw %%mm7, %%mm3				\n\t" // H7
2828

  
2829
		"paddw %%mm2, %%mm2				\n\t" // 2L7
2830
		"paddw %%mm3, %%mm3				\n\t" // 2H7
2831
		"psubw %%mm2, %%mm0				\n\t" // 2L4 - 5L5 + 5L6 - 2L7
2832
		"psubw %%mm3, %%mm1				\n\t" // 2H4 - 5H5 + 5H6 - 2H7
2833

  
2834
		"movq (%%ecx), %%mm2				\n\t" // 2L0 - 5L1 + 5L2 - 2L3
2835
		"movq 8(%%ecx), %%mm3				\n\t" // 2H0 - 5H1 + 5H2 - 2H3
2836

  
2837
#ifdef HAVE_MMX2
2838
		"movq %%mm7, %%mm6				\n\t" // 0
2839
		"psubw %%mm0, %%mm6				\n\t"
2840
		"pmaxsw %%mm6, %%mm0				\n\t" // |2L4 - 5L5 + 5L6 - 2L7|
2841
		"movq %%mm7, %%mm6				\n\t" // 0
2842
		"psubw %%mm1, %%mm6				\n\t"
2843
		"pmaxsw %%mm6, %%mm1				\n\t" // |2H4 - 5H5 + 5H6 - 2H7|
2844
		"movq %%mm7, %%mm6				\n\t" // 0
2845
		"psubw %%mm2, %%mm6				\n\t"
2846
		"pmaxsw %%mm6, %%mm2				\n\t" // |2L0 - 5L1 + 5L2 - 2L3|
2847
		"movq %%mm7, %%mm6				\n\t" // 0
2848
		"psubw %%mm3, %%mm6				\n\t"
2849
		"pmaxsw %%mm6, %%mm3				\n\t" // |2H0 - 5H1 + 5H2 - 2H3|
2850
#else
2851
		"movq %%mm7, %%mm6				\n\t" // 0
2852
		"pcmpgtw %%mm0, %%mm6				\n\t"
2853
		"pxor %%mm6, %%mm0				\n\t"
2854
		"psubw %%mm6, %%mm0				\n\t" // |2L4 - 5L5 + 5L6 - 2L7|
2855
		"movq %%mm7, %%mm6				\n\t" // 0
2856
		"pcmpgtw %%mm1, %%mm6				\n\t"
2857
		"pxor %%mm6, %%mm1				\n\t"
2858
		"psubw %%mm6, %%mm1				\n\t" // |2H4 - 5H5 + 5H6 - 2H7|
2859
		"movq %%mm7, %%mm6				\n\t" // 0
2860
		"pcmpgtw %%mm2, %%mm6				\n\t"
2861
		"pxor %%mm6, %%mm2				\n\t"
2862
		"psubw %%mm6, %%mm2				\n\t" // |2L0 - 5L1 + 5L2 - 2L3|
2863
		"movq %%mm7, %%mm6				\n\t" // 0
2864
		"pcmpgtw %%mm3, %%mm6				\n\t"
2865
		"pxor %%mm6, %%mm3				\n\t"
2866
		"psubw %%mm6, %%mm3				\n\t" // |2H0 - 5H1 + 5H2 - 2H3|
2867
#endif
2868

  
2869
#ifdef HAVE_MMX2
2870
		"pminsw %%mm2, %%mm0				\n\t"
2871
		"pminsw %%mm3, %%mm1				\n\t"
2872
#else
2873
		"movq %%mm0, %%mm6				\n\t"
2874
		"psubusw %%mm2, %%mm6				\n\t"
2875
		"psubw %%mm6, %%mm0				\n\t"
2876
		"movq %%mm1, %%mm6				\n\t"
2877
		"psubusw %%mm3, %%mm6				\n\t"
2878
		"psubw %%mm6, %%mm1				\n\t"
2879
#endif
2880

  
2881
		"movd %2, %%mm2					\n\t" // QP
2882
		"punpcklbw %%mm7, %%mm2				\n\t"
2883

  
2884
		"movq %%mm7, %%mm6				\n\t" // 0
2885
		"pcmpgtw %%mm4, %%mm6				\n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
2886
		"pxor %%mm6, %%mm4				\n\t"
2887
		"psubw %%mm6, %%mm4				\n\t" // |2L2 - 5L3 + 5L4 - 2L5|
2888
		"pcmpgtw %%mm5, %%mm7				\n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
2889
		"pxor %%mm7, %%mm5				\n\t"
2890
		"psubw %%mm7, %%mm5				\n\t" // |2H2 - 5H3 + 5H4 - 2H5|
2891
// 100 opcodes
2892
		"psllw $3, %%mm2				\n\t" // 8QP
2893
		"movq %%mm2, %%mm3				\n\t" // 8QP
2894
		"pcmpgtw %%mm4, %%mm2				\n\t"
2895
		"pcmpgtw %%mm5, %%mm3				\n\t"
2896
		"pand %%mm2, %%mm4				\n\t"
2897
		"pand %%mm3, %%mm5				\n\t"
2898

  
2899

  
2900
		"psubusw %%mm0, %%mm4				\n\t" // hd
2901
		"psubusw %%mm1, %%mm5				\n\t" // ld
2902

  
2903

  
2904
		"movq "MANGLE(w05)", %%mm2			\n\t" // 5
2905
		"pmullw %%mm2, %%mm4				\n\t"
2906
		"pmullw %%mm2, %%mm5				\n\t"
2907
		"movq "MANGLE(w20)", %%mm2			\n\t" // 32
2908
		"paddw %%mm2, %%mm4				\n\t"
2909
		"paddw %%mm2, %%mm5				\n\t"
2910
		"psrlw $6, %%mm4				\n\t"
2911
		"psrlw $6, %%mm5				\n\t"
2912

  
2913
		"movq 16(%%ecx), %%mm0				\n\t" // L3 - L4
2914
		"movq 24(%%ecx), %%mm1				\n\t" // H3 - H4
2915

  
2916
		"pxor %%mm2, %%mm2				\n\t"
2917
		"pxor %%mm3, %%mm3				\n\t"
2918

  
2919
		"pcmpgtw %%mm0, %%mm2				\n\t" // sign (L3-L4)
2920
		"pcmpgtw %%mm1, %%mm3				\n\t" // sign (H3-H4)
2921
		"pxor %%mm2, %%mm0				\n\t"
2922
		"pxor %%mm3, %%mm1				\n\t"
2923
		"psubw %%mm2, %%mm0				\n\t" // |L3-L4|
2924
		"psubw %%mm3, %%mm1				\n\t" // |H3-H4|
2925
		"psrlw $1, %%mm0				\n\t" // |L3 - L4|/2
2926
		"psrlw $1, %%mm1				\n\t" // |H3 - H4|/2
2927

  
2928
		"pxor %%mm6, %%mm2				\n\t"
2929
		"pxor %%mm7, %%mm3				\n\t"
2930
		"pand %%mm2, %%mm4				\n\t"
2931
		"pand %%mm3, %%mm5				\n\t"
2932

  
2933
#ifdef HAVE_MMX2
2934
		"pminsw %%mm0, %%mm4				\n\t"
2935
		"pminsw %%mm1, %%mm5				\n\t"
2936
#else
2937
		"movq %%mm4, %%mm2				\n\t"
2938
		"psubusw %%mm0, %%mm2				\n\t"
2939
		"psubw %%mm2, %%mm4				\n\t"
2940
		"movq %%mm5, %%mm2				\n\t"
2941
		"psubusw %%mm1, %%mm2				\n\t"
2942
		"psubw %%mm2, %%mm5				\n\t"
2943
#endif
2944
		"pxor %%mm6, %%mm4				\n\t"
2945
		"pxor %%mm7, %%mm5				\n\t"
2946
		"psubw %%mm6, %%mm4				\n\t"
2947
		"psubw %%mm7, %%mm5				\n\t"
2948
		"packsswb %%mm5, %%mm4				\n\t"
2949
		"movq %3, %%mm1					\n\t"
2950
		"pandn %%mm4, %%mm1				\n\t"
2951
		"movq (%0), %%mm0				\n\t"
2952
		"paddb   %%mm1, %%mm0				\n\t"
2953
		"movq %%mm0, (%0)				\n\t"
2954
		"movq (%0, %1), %%mm0				\n\t"
2955
		"psubb %%mm1, %%mm0				\n\t"
2956
		"movq %%mm0, (%0, %1)				\n\t"
2957

  
2958
		: "+r" (src)
2959
		: "r" (step), "m" (c->pQPb), "m"(eq_mask)
2960
		: "%eax", "%ecx"
2961
		);
2962
		src-= 3*step; //reverse src change from asm
2963
	}
2964

  
2965
	for(y=0; y<8; y++){
2966
		if((eq_mask>>(y*8))&1){
2967
			if((dc_mask>>(y*8))&1){
2968
				const int first= ABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
2969
				const int last= ABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
2970
				
2971
				int sums[10];
2972
				sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
2973
				sums[1] = sums[0] - first       + src[3*step];
2974
				sums[2] = sums[1] - first       + src[4*step];
2975
				sums[3] = sums[2] - first       + src[5*step];
2976
				sums[4] = sums[3] - first       + src[6*step];
2977
				sums[5] = sums[4] - src[0*step] + src[7*step];
2978
				sums[6] = sums[5] - src[1*step] + last;
2979
				sums[7] = sums[6] - src[2*step] + last;
2980
				sums[8] = sums[7] - src[3*step] + last;
2981
				sums[9] = sums[8] - src[4*step] + last;
2982

  
2983
				src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
2984
				src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
2985
				src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
2986
				src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
2987
				src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
2988
				src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
2989
				src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
2990
				src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
2991
			}
2992
		}
2993

  
2994
		src += stride;
2995
	}
2996
/*if(step==16){
2997
    STOP_TIMER("step16")
2998
}else{
2999
    STOP_TIMER("stepX")
3000
}*/
3001
}
3002
#endif //HAVE_MMX
3003

  
2613 3004
static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2614 3005
	QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);
2615 3006

  
......
3113 3504
					else if(t==2)
3114 3505
						RENAME(doVertDefFilter)(dstBlock, stride, &c);
3115 3506
				}else if(mode & V_A_DEBLOCK){
3116
					do_a_deblock(dstBlock, stride, 1, &c);
3507
					RENAME(do_a_deblock)(dstBlock, stride, 1, &c);
3117 3508
				}
3118 3509
			}
3119 3510

  
......
3136 3527
					else if(t==2)
3137 3528
						RENAME(doVertDefFilter)(tempBlock1, 16, &c);
3138 3529
				}else if(mode & H_A_DEBLOCK){
3139
					do_a_deblock(tempBlock1, 16, 1, &c);
3530
					RENAME(do_a_deblock)(tempBlock1, 16, 1, &c);
3140 3531
				}
3141 3532

  
3142 3533
				RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
......
3153 3544
					else if(t==2)
3154 3545
						RENAME(doHorizDefFilter)(dstBlock-4, stride, &c);
3155 3546
				}else if(mode & H_A_DEBLOCK){
3156
					do_a_deblock(dstBlock-8, 1, stride, &c);
3547
					RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c);
3157 3548
				}
3158 3549
#endif
3159 3550
				if(mode & DERING)

Also available in: Unified diff