Revision 70c5ae87 postproc/postprocess.c

View differences:

postproc/postprocess.c
26 26
isHorizMinMaxOk		a	E
27 27
doHorizLowPass		E		e	e
28 28
doHorizDefFilter	Ec	Ec	Ec
29
deRing
29
deRing					e
30 30
Vertical RKAlgo1	E		a	a
31 31
Horizontal RKAlgo1			a	a
32 32
Vertical X1		a		E	E
......
65 65
...
66 66

  
67 67
Notes:
68
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
69 68
*/
70 69

  
71 70
//Changelog: use the CVS log
......
116 115
static uint64_t b01= 		0x0101010101010101LL;
117 116
static uint64_t b02= 		0x0202020202020202LL;
118 117
static uint64_t b0F= 		0x0F0F0F0F0F0F0F0FLL;
118
static uint64_t b04= 		0x0404040404040404LL;
119
static uint64_t b08= 		0x0808080808080808LL;
119 120
static uint64_t bFF= 		0xFFFFFFFFFFFFFFFFLL;
120 121
static uint64_t b20= 		0x2020202020202020LL;
121 122
static uint64_t b80= 		0x8080808080808080LL;
......
129 130
static uint64_t temp4=0;
130 131
static uint64_t temp5=0;
131 132
static uint64_t pQPb=0;
133
static uint64_t pQPb2=0;
132 134
static uint8_t tempBlocks[8*16*2]; //used for the horizontal code
133 135

  
134 136
int hFlatnessThreshold= 56 - 16;
......
1806 1808

  
1807 1809
static inline void dering(uint8_t src[], int stride, int QP)
1808 1810
{
1809
//FIXME
1810

  
1811
#ifdef HAVE_MMX2X
1811
#ifdef HAVE_MMX2
1812 1812
	asm volatile(
1813
		"movq pQPb, %%mm0				\n\t"
1814
		"paddusb %%mm0, %%mm0				\n\t"
1815
		"movq %%mm0, pQPb2				\n\t"
1816

  
1813 1817
		"leal (%0, %1), %%eax				\n\t"
1814 1818
		"leal (%%eax, %1, 4), %%ebx			\n\t"
1815 1819
//	0	1	2	3	4	5	6	7	8	9
1816 1820
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
1817 1821

  
1818
		"pcmpeq %%mm6, %%mm6				\n\t"
1822
		"pcmpeqb %%mm6, %%mm6				\n\t"
1819 1823
		"pxor %%mm7, %%mm7				\n\t"
1820 1824

  
1821 1825
#define FIND_MIN_MAX(addr)\
1822
		"movq (" #addr "), %%mm0,			\n\t"\
1826
		"movq " #addr ", %%mm0				\n\t"\
1823 1827
		"pminub %%mm0, %%mm6				\n\t"\
1824 1828
		"pmaxub %%mm0, %%mm7				\n\t"
1825 1829

  
1826
FIND_MIN_MAX(%0)
1827
FIND_MIN_MAX(%%eax)
1828
FIND_MIN_MAX(%%eax, %1)
1829
FIND_MIN_MAX(%%eax, %1, 2)
1830
FIND_MIN_MAX(%0, %1, 4)
1831
FIND_MIN_MAX(%%ebx)
1832
FIND_MIN_MAX(%%ebx, %1)
1833
FIND_MIN_MAX(%%ebx, %1, 2)
1834
FIND_MIN_MAX(%0, %1, 8)
1835
FIND_MIN_MAX(%%ebx, %1, 2)
1830
FIND_MIN_MAX((%%eax))
1831
FIND_MIN_MAX((%%eax, %1))
1832
FIND_MIN_MAX((%%eax, %1, 2))
1833
FIND_MIN_MAX((%0, %1, 4))
1834
FIND_MIN_MAX((%%ebx))
1835
FIND_MIN_MAX((%%ebx, %1))
1836
FIND_MIN_MAX((%%ebx, %1, 2))
1837
FIND_MIN_MAX((%0, %1, 8))
1836 1838

  
1837 1839
		"movq %%mm6, %%mm4				\n\t"
1838 1840
		"psrlq $8, %%mm6				\n\t"
......
1866 1868
		"psrlq $32, %%mm7				\n\t"
1867 1869
#endif
1868 1870
		"pmaxub %%mm4, %%mm7				\n\t"
1869
		PAVGB(%%mm6, %%mm7)				      // (max + min)/2
1871
		PAVGB(%%mm6, %%mm7)				      // a=(max + min)/2
1870 1872
		"punpcklbw %%mm7, %%mm7				\n\t"
1871 1873
		"punpcklbw %%mm7, %%mm7				\n\t"
1872 1874
		"punpcklbw %%mm7, %%mm7				\n\t"
1875
		"movq %%mm7, temp0				\n\t"
1876

  
1877
		"movq (%0), %%mm0				\n\t" // L10
1878
		"movq %%mm0, %%mm1				\n\t" // L10
1879
		"movq %%mm0, %%mm2				\n\t" // L10
1880
		"psllq $8, %%mm1				\n\t"
1881
		"psrlq $8, %%mm2				\n\t"
1882
		"movd -4(%0), %%mm3				\n\t"
1883
		"movd 8(%0), %%mm4				\n\t"
1884
		"psrlq $24, %%mm3				\n\t"
1885
		"psllq $56, %%mm4				\n\t"
1886
		"por %%mm3, %%mm1				\n\t" // L00
1887
		"por %%mm4, %%mm2				\n\t" // L20
1888
		"movq %%mm1, %%mm3				\n\t" // L00
1889
		PAVGB(%%mm2, %%mm1)				      // (L20 + L00)/2
1890
		PAVGB(%%mm0, %%mm1)				      // (L20 + L00 + 2L10)/4
1891
		"psubusb %%mm7, %%mm0				\n\t"
1892
		"psubusb %%mm7, %%mm2				\n\t"
1893
		"psubusb %%mm7, %%mm3				\n\t"
1894
		"pcmpeqb b00, %%mm0				\n\t" // L10 > a ? 0 : -1
1895
		"pcmpeqb b00, %%mm2				\n\t" // L20 > a ? 0 : -1
1896
		"pcmpeqb b00, %%mm3				\n\t" // L00 > a ? 0 : -1
1897
		"paddb %%mm2, %%mm0				\n\t"
1898
		"paddb %%mm3, %%mm0				\n\t"
1899

  
1900
		"movq (%%eax), %%mm2				\n\t" // L11
1901
		"movq %%mm2, %%mm3				\n\t" // L11
1902
		"movq %%mm2, %%mm4				\n\t" // L11
1903
		"psllq $8, %%mm3				\n\t"
1904
		"psrlq $8, %%mm4				\n\t"
1905
		"movd -4(%%eax), %%mm5				\n\t"
1906
		"movd 8(%%eax), %%mm6				\n\t"
1907
		"psrlq $24, %%mm5				\n\t"
1908
		"psllq $56, %%mm6				\n\t"
1909
		"por %%mm5, %%mm3				\n\t" // L01
1910
		"por %%mm6, %%mm4				\n\t" // L21
1911
		"movq %%mm3, %%mm5				\n\t" // L01
1912
		PAVGB(%%mm4, %%mm3)				      // (L21 + L01)/2
1913
		PAVGB(%%mm2, %%mm3)				      // (L21 + L01 + 2L11)/4
1914
		"psubusb %%mm7, %%mm2				\n\t"
1915
		"psubusb %%mm7, %%mm4				\n\t"
1916
		"psubusb %%mm7, %%mm5				\n\t"
1917
		"pcmpeqb b00, %%mm2				\n\t" // L11 > a ? 0 : -1
1918
		"pcmpeqb b00, %%mm4				\n\t" // L21 > a ? 0 : -1
1919
		"pcmpeqb b00, %%mm5				\n\t" // L01 > a ? 0 : -1
1920
		"paddb %%mm4, %%mm2				\n\t"
1921
		"paddb %%mm5, %%mm2				\n\t"
1922
// 0, 2, 3, 1
1923
#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1924
		"movq " #src ", " #sx "				\n\t" /* src[0] */\
1925
		"movq " #sx ", " #lx "				\n\t" /* src[0] */\
1926
		"movq " #sx ", " #t0 "				\n\t" /* src[0] */\
1927
		"psllq $8, " #lx "				\n\t"\
1928
		"psrlq $8, " #t0 "				\n\t"\
1929
		"movd -4" #src ", " #t1 "			\n\t"\
1930
		"psrlq $24, " #t1 "				\n\t"\
1931
		"por " #t1 ", " #lx "				\n\t" /* src[-1] */\
1932
		"movd 8" #src ", " #t1 "			\n\t"\
1933
		"psllq $56, " #t1 "				\n\t"\
1934
		"por " #t1 ", " #t0 "				\n\t" /* src[+1] */\
1935
		"movq " #lx ", " #t1 "				\n\t" /* src[-1] */\
1936
		PAVGB(t0, lx)				              /* (src[-1] + src[+1])/2 */\
1937
		PAVGB(sx, lx)				      /* (src[-1] + 2src[0] + src[+1])/4 */\
1938
		"psubusb temp0, " #t1 "				\n\t"\
1939
		"psubusb temp0, " #t0 "				\n\t"\
1940
		"psubusb temp0, " #sx "				\n\t"\
1941
		"pcmpeqb b00, " #t1 "				\n\t" /* src[-1] > a ? 0 : -1*/\
1942
		"pcmpeqb b00, " #t0 "				\n\t" /* src[+1] > a ? 0 : -1*/\
1943
		"pcmpeqb b00, " #sx "				\n\t" /* src[0]  > a ? 0 : -1*/\
1944
		"paddb " #t1 ", " #t0 "				\n\t"\
1945
		"paddb " #t0 ", " #sx "				\n\t"\
1946
\
1947
		PAVGB(lx, pplx)					     \
1948
		PAVGB(plx, pplx)				      /* filtered */\
1949
		"movq " #dst ", " #t0 "				\n\t" /* dst */\
1950
		"movq " #pplx ", " #t1 "			\n\t"\
1951
		"psubusb " #t0 ", " #pplx "			\n\t"\
1952
		"psubusb " #t1 ", " #t0 "			\n\t"\
1953
		"por " #t0 ", " #pplx "				\n\t" /* |filtered - dst| */\
1954
		"psubusb pQPb2, " #pplx "			\n\t"\
1955
		"pcmpeqb b00, " #pplx "				\n\t"\
1956
		"paddb " #sx ", " #ppsx "			\n\t"\
1957
		"paddb " #psx ", " #ppsx "			\n\t"\
1958
	"#paddb b02, " #ppsx "				\n\t"\
1959
		"pand b08, " #ppsx "				\n\t"\
1960
		"pcmpeqb b00, " #ppsx "				\n\t"\
1961
		"pand " #pplx ", " #ppsx "			\n\t"\
1962
		"pand " #ppsx ", " #t1 "			\n\t"\
1963
		"pandn " #dst ", " #ppsx "			\n\t"\
1964
		"por " #t1 ", " #ppsx "				\n\t"\
1965
		"movq " #ppsx ", " #dst "			\n\t"
1966
/*
1967
0000000
1968
1111111
1873 1969

  
1874
		"movq (%0), %%mm0				\n\t"
1875
		"movq %%mm0, %%mm1				\n\t"
1970
1111110
1971
1111101
1972
1111100
1973
1111011
1974
1111010
1975
1111001
1876 1976

  
1977
1111000
1978
1110111
1877 1979

  
1980
*/
1981
//DERING_CORE(dst,src                  ,ppsx ,psx  ,sx   ,pplx ,plx  ,lx   ,t0   ,t1)
1982
DERING_CORE((%%eax),(%%eax, %1)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1983
DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1984
DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1985
DERING_CORE((%0, %1, 4),(%%ebx)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1986
DERING_CORE((%%ebx),(%%ebx, %1)        ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1987
DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1988
DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1989
DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1878 1990

  
1879 1991

  
1880 1992
		: : "r" (src), "r" (stride), "r" (QP)
......
2874 2986
		uint8_t *tempBlock1= tempBlocks;
2875 2987
		uint8_t *tempBlock2= tempBlocks + 8;
2876 2988
#endif
2877
		/* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not
2878
		   than use a temporary buffer */
2989
		/* can we mess with an 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
2990
		   if not, then use a temporary buffer */
2879 2991
		if(y+15 >= height)
2880 2992
		{
2881
			/* copy from line 5 to 12 of src, these will e copied with
2993
			/* copy from line 5 to 12 of src, these will be copied with
2882 2994
			   blockcopy to dst later */
2883 2995
			memcpy(tempSrc + srcStride*5, srcBlock + srcStride*5,
2884 2996
				srcStride*MAX(height-y-5, 0) );
......
2893 3005
			}
2894 3006

  
2895 3007

  
2896
			/* copy up to 5 lines of dst */
2897
			memcpy(tempDst, dstBlock, dstStride*MIN(height-y, 5) );
2898
			dstBlock= tempDst;
3008
			/* copy up to 6 lines of dst */
3009
			memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, 6) );
3010
			dstBlock= tempDst + dstStride;
2899 3011
			srcBlock= tempSrc;
2900 3012
		}
2901 3013

  
......
3046 3158
				T0=T1;
3047 3159
#endif
3048 3160
			}
3161

  
3049 3162
#ifdef HAVE_MMX
3050 3163
			transpose1(tempBlock1, tempBlock2, dstBlock, dstStride);
3051 3164
#endif
......
3092 3205
				horizTime+= T1-T0;
3093 3206
				T0=T1;
3094 3207
#endif
3095
				dering(dstBlock - 9 - stride, stride, QP);
3208
				if(mode & DERING)
3209
				{
3210
				//FIXME filter first line
3211
					if(y>0) dering(dstBlock - stride - 8, stride, QP);
3212
				}
3213
			}
3214
			else if(mode & DERING)
3215
			{
3216
			 //FIXME y+15 is required because of the tempBuffer thing -> bottom right block isn't filtered
3217
					if(y > 8 && y+15 < height) dering(dstBlock - stride*9 + width - 8, stride, QP);
3096 3218
			}
3097
			else if(y!=0)
3098
				dering(dstBlock - stride*9 + width-9, stride, QP);
3099
			//FIXME dering filter will not be applied to last block (bottom right)
3219

  
3100 3220

  
3101 3221
#ifdef PP_FUNNY_STRIDE
3102 3222
			/* did we use a tmp-block buffer */
......
3127 3247
		if(y+15 >= height)
3128 3248
		{
3129 3249
			uint8_t *dstBlock= &(dst[y*dstStride]);
3130
			memcpy(dstBlock, tempDst, dstStride*(height-y) );
3250
			memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
3131 3251
		}
3132 3252
	}
3133 3253
#ifdef HAVE_3DNOW

Also available in: Unified diff