Revision 39d89b69

View differences:

libavcodec/libpostproc/postprocess.c
117 117

  
118 118
#ifdef ARCH_X86
119 119
static uint64_t __attribute__((aligned(8))) attribute_used w05=		0x0005000500050005LL;
120
static uint64_t __attribute__((aligned(8))) attribute_used w04=		0x0004000400040004LL;
120 121
static uint64_t __attribute__((aligned(8))) attribute_used w20=		0x0020002000200020LL;
121 122
static uint64_t __attribute__((aligned(8))) attribute_used b00= 		0x0000000000000000LL;
122 123
static uint64_t __attribute__((aligned(8))) attribute_used b01= 		0x0101010101010101LL;
libavcodec/libpostproc/postprocess_template.c
2617 2617
 * accurate deblock filter
2618 2618
 */
2619 2619
static always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){
2620
	int y;
2621
	const int QP= c->QP;
2622 2620
	int64_t dc_mask, eq_mask;
2621
	int64_t sums[10*8*2];
2623 2622
	src+= step*3; // src points to begin of the 8x8 Block
2624 2623
//START_TIMER
2625 2624
asm volatile(
......
2725 2724
		: "%eax"
2726 2725
		);
2727 2726

  
2728
	src+= step; // src points to begin of the 8x8 Block
2727
	if(dc_mask & eq_mask){
2728
		int offset= -8*step;
2729
		int64_t *temp_sums= sums;
2730

  
2731
		asm volatile(
2732
		"movq %2, %%mm0					\n\t"  // QP,..., QP
2733
		"pxor %%mm4, %%mm4				\n\t"
2734

  
2735
		"movq (%0), %%mm6				\n\t"
2736
		"movq (%0, %1), %%mm5				\n\t"
2737
		"movq %%mm5, %%mm1				\n\t"
2738
		"movq %%mm6, %%mm2				\n\t"
2739
		"psubusb %%mm6, %%mm5				\n\t"
2740
		"psubusb %%mm1, %%mm2				\n\t"
2741
		"por %%mm5, %%mm2				\n\t" // ABS Diff of lines
2742
		"psubusb %%mm2, %%mm0				\n\t" // diff >= QP -> 0
2743
		"pcmpeqb %%mm4, %%mm0				\n\t" // diff >= QP -> FF
2744

  
2745
		"pxor %%mm6, %%mm1				\n\t"
2746
		"pand %%mm0, %%mm1				\n\t"
2747
		"pxor %%mm1, %%mm6				\n\t"
2748
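		// register state: mm0 = per-byte mask from the QP compare (QP itself is reloaded below), mm6 = selected "first" row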
2749

  
2750
		"movq (%0, %1, 8), %%mm5			\n\t"
2751
		"addl %1, %0					\n\t" // %0 points to line 1 not 0
2752
		"movq (%0, %1, 8), %%mm7			\n\t"
2753
		"movq %%mm5, %%mm1				\n\t"
2754
		"movq %%mm7, %%mm2				\n\t"
2755
		"psubusb %%mm7, %%mm5				\n\t"
2756
		"psubusb %%mm1, %%mm2				\n\t"
2757
		"por %%mm5, %%mm2				\n\t" // ABS Diff of lines
2758
		"movq %2, %%mm0					\n\t"  // QP,..., QP
2759
		"psubusb %%mm2, %%mm0				\n\t" // diff >= QP -> 0
2760
		"pcmpeqb %%mm4, %%mm0				\n\t" // diff >= QP -> FF
2761

  
2762
		"pxor %%mm7, %%mm1				\n\t"
2763
		"pand %%mm0, %%mm1				\n\t"
2764
		"pxor %%mm1, %%mm7				\n\t"
2765
		
2766
		"movq %%mm6, %%mm5				\n\t"
2767
		"punpckhbw %%mm4, %%mm6				\n\t"
2768
		"punpcklbw %%mm4, %%mm5				\n\t"
2769
		// register state: mm4 = 0, mm5/mm6 = "first" row low/high bytes widened to words, mm7 = "last" row (still packed bytes)
2770

  
2771
		"movq %%mm5, %%mm0				\n\t"
2772
		"movq %%mm6, %%mm1				\n\t"
2773
		"psllw $2, %%mm0				\n\t"
2774
		"psllw $2, %%mm1				\n\t"
2775
		"paddw "MANGLE(w04)", %%mm0			\n\t"
2776
		"paddw "MANGLE(w04)", %%mm1			\n\t"
2777

  
2778
#define NEXT\
2779
		"movq (%0), %%mm2				\n\t"\
2780
		"movq (%0), %%mm3				\n\t"\
2781
		"addl %1, %0					\n\t"\
2782
		"punpcklbw %%mm4, %%mm2				\n\t"\
2783
		"punpckhbw %%mm4, %%mm3				\n\t"\
2784
		"paddw %%mm2, %%mm0				\n\t"\
2785
		"paddw %%mm3, %%mm1				\n\t"
2786

  
2787
#define PREV\
2788
		"movq (%0), %%mm2				\n\t"\
2789
		"movq (%0), %%mm3				\n\t"\
2790
		"addl %1, %0					\n\t"\
2791
		"punpcklbw %%mm4, %%mm2				\n\t"\
2792
		"punpckhbw %%mm4, %%mm3				\n\t"\
2793
		"psubw %%mm2, %%mm0				\n\t"\
2794
		"psubw %%mm3, %%mm1				\n\t"
2795

  
2796
				
2797
		NEXT //0
2798
		NEXT //1
2799
		NEXT //2
2800
		"movq %%mm0, (%3)				\n\t"
2801
		"movq %%mm1, 8(%3)				\n\t"
2802

  
2803
		NEXT //3
2804
		"psubw %%mm5, %%mm0				\n\t"
2805
		"psubw %%mm6, %%mm1				\n\t"
2806
		"movq %%mm0, 16(%3)				\n\t"
2807
		"movq %%mm1, 24(%3)				\n\t"
2808

  
2809
		NEXT //4
2810
		"psubw %%mm5, %%mm0				\n\t"
2811
		"psubw %%mm6, %%mm1				\n\t"
2812
		"movq %%mm0, 32(%3)				\n\t"
2813
		"movq %%mm1, 40(%3)				\n\t"
2814

  
2815
		NEXT //5
2816
		"psubw %%mm5, %%mm0				\n\t"
2817
		"psubw %%mm6, %%mm1				\n\t"
2818
		"movq %%mm0, 48(%3)				\n\t"
2819
		"movq %%mm1, 56(%3)				\n\t"
2820

  
2821
		NEXT //6
2822
		"psubw %%mm5, %%mm0				\n\t"
2823
		"psubw %%mm6, %%mm1				\n\t"
2824
		"movq %%mm0, 64(%3)				\n\t"
2825
		"movq %%mm1, 72(%3)				\n\t"
2826

  
2827
		"movq %%mm7, %%mm6				\n\t"
2828
		"punpckhbw %%mm4, %%mm7				\n\t"
2829
		"punpcklbw %%mm4, %%mm6				\n\t"
2830
		
2831
		NEXT //7
2832
		"movl %4, %0					\n\t"
2833
		"addl %1, %0					\n\t"
2834
		PREV //0
2835
		"movq %%mm0, 80(%3)				\n\t"
2836
		"movq %%mm1, 88(%3)				\n\t"
2837

  
2838
		PREV //1
2839
		"paddw %%mm6, %%mm0				\n\t"
2840
		"paddw %%mm7, %%mm1				\n\t"
2841
		"movq %%mm0, 96(%3)				\n\t"
2842
		"movq %%mm1, 104(%3)				\n\t"
2843
		
2844
		PREV //2
2845
		"paddw %%mm6, %%mm0				\n\t"
2846
		"paddw %%mm7, %%mm1				\n\t"
2847
		"movq %%mm0, 112(%3)				\n\t"
2848
		"movq %%mm1, 120(%3)				\n\t"
2849

  
2850
		PREV //3
2851
		"paddw %%mm6, %%mm0				\n\t"
2852
		"paddw %%mm7, %%mm1				\n\t"
2853
		"movq %%mm0, 128(%3)				\n\t"
2854
		"movq %%mm1, 136(%3)				\n\t"
2855

  
2856
		PREV //4
2857
		"paddw %%mm6, %%mm0				\n\t"
2858
		"paddw %%mm7, %%mm1				\n\t"
2859
		"movq %%mm0, 144(%3)				\n\t"
2860
		"movq %%mm1, 152(%3)				\n\t"
2861

  
2862
		"movl %4, %0					\n\t" //FIXME
2863

  
2864
		: "+&r"(src)
2865
		: "r" (step), "m" (c->pQPb), "r"(sums), "g"(src)
2866
		);
2867

  
2868
		src+= step; // src points to begin of the 8x8 Block
2869

  
2870
		asm volatile(
2871
		"movq %4, %%mm6					\n\t"
2872
		"pcmpeqb %%mm5, %%mm5				\n\t"
2873
		"pxor %%mm6, %%mm5				\n\t"
2874
		"pxor %%mm7, %%mm7				\n\t"
2875

  
2876
		"1:						\n\t"
2877
		"movq (%1), %%mm0				\n\t"
2878
		"movq 8(%1), %%mm1				\n\t"
2879
		"paddw 32(%1), %%mm0				\n\t"
2880
		"paddw 40(%1), %%mm1				\n\t"
2881
		"movq (%0, %3), %%mm2				\n\t"
2882
		"movq %%mm2, %%mm3				\n\t"
2883
		"movq %%mm2, %%mm4				\n\t"
2884
		"punpcklbw %%mm7, %%mm2				\n\t"
2885
		"punpckhbw %%mm7, %%mm3				\n\t"
2886
		"paddw %%mm2, %%mm0				\n\t"
2887
		"paddw %%mm3, %%mm1				\n\t"
2888
		"paddw %%mm2, %%mm0				\n\t"
2889
		"paddw %%mm3, %%mm1				\n\t"
2890
		"psrlw $4, %%mm0				\n\t"
2891
		"psrlw $4, %%mm1				\n\t"
2892
		"packuswb %%mm1, %%mm0				\n\t"
2893
		"pand %%mm6, %%mm0				\n\t"
2894
		"pand %%mm5, %%mm4				\n\t"
2895
		"por %%mm4, %%mm0				\n\t"
2896
		"movq %%mm0, (%0, %3)				\n\t"
2897
		"addl $16, %1					\n\t"
2898
		"addl %2, %0					\n\t"
2899
		" js 1b						\n\t"
2900

  
2901
		: "+r"(offset), "+r"(temp_sums)
2902
		: "r" (step), "r"(src - offset), "m"(dc_mask & eq_mask)
2903
		);
2904
	}else
2905
		src+= step; // src points to begin of the 8x8 Block
2729 2906

  
2730 2907
	if(eq_mask != -1LL){
2908
		uint8_t *temp_src= src;
2731 2909
		asm volatile(
2732 2910
		"pxor %%mm7, %%mm7				\n\t"
2733 2911
		"leal -40(%%esp), %%ecx				\n\t" // make space for 4 8-byte vars
......
2955 3133
		"psubb %%mm1, %%mm0				\n\t"
2956 3134
		"movq %%mm0, (%0, %1)				\n\t"
2957 3135

  
2958
		: "+r" (src)
3136
		: "+r" (temp_src)
2959 3137
		: "r" (step), "m" (c->pQPb), "m"(eq_mask)
2960 3138
		: "%eax", "%ecx"
2961 3139
		);
2962
		src-= 3*step; //reverse src change from asm
2963
	}
2964

  
2965
	for(y=0; y<8; y++){
2966
		if((eq_mask>>(y*8))&1){
2967
			if((dc_mask>>(y*8))&1){
2968
				const int first= ABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
2969
				const int last= ABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
2970
				
2971
				int sums[10];
2972
				sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
2973
				sums[1] = sums[0] - first       + src[3*step];
2974
				sums[2] = sums[1] - first       + src[4*step];
2975
				sums[3] = sums[2] - first       + src[5*step];
2976
				sums[4] = sums[3] - first       + src[6*step];
2977
				sums[5] = sums[4] - src[0*step] + src[7*step];
2978
				sums[6] = sums[5] - src[1*step] + last;
2979
				sums[7] = sums[6] - src[2*step] + last;
2980
				sums[8] = sums[7] - src[3*step] + last;
2981
				sums[9] = sums[8] - src[4*step] + last;
2982

  
2983
				src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
2984
				src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
2985
				src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
2986
				src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
2987
				src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
2988
				src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
2989
				src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
2990
				src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
2991
			}
2992
		}
2993

  
2994
		src += stride;
2995 3140
	}
2996 3141
/*if(step==16){
2997 3142
    STOP_TIMER("step16")

Also available in: Unified diff