/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

18  
19 
#include "libavutil/cpu.h" 
20 
#include "libavutil/x86_cpu.h" 
21 
#include "libavfilter/gradfun.h" 
22  
23 
DECLARE_ALIGNED(16, static const uint16_t, pw_7f)[8] = {0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F}; 
24 
DECLARE_ALIGNED(16, static const uint16_t, pw_ff)[8] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF}; 
25  
26 
/**
 * Filter one line of pixels with MMX/MMXEXT inline assembly, 4 pixels per
 * loop iteration.
 *
 * Any trailing (width & 3) pixels are delegated to ff_gradfun_filter_line_c().
 * When the build does not define HAVE_MMX the function body is empty.
 *
 * @param dst     output pixels
 * @param src     input pixels
 * @param dc      16-bit reference values; each value is applied to two
 *                adjacent pixels (it is duplicated with punpcklwd)
 * @param width   number of pixels to process
 * @param thresh  multiplier applied to |dc - pix| (only the low 16 bits are
 *                broadcast and used)
 * @param dithers 16-bit dither offsets; 4 values are loaded once and reused
 *                for every group of 4 pixels
 */
void ff_gradfun_filter_line_mmx2(uint8_t *dst, const uint8_t *src, const uint16_t *dc, int width, int thresh, const uint16_t *dithers)
{
#if HAVE_MMX
    intptr_t x;
    if (width & 3) {
        // The asm loop only handles whole groups of 4 pixels:
        // let the C implementation process the tail.
        x = width & ~3;
        ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
        width = x;
    }
    // The asm loop body always runs at least once, so it must not be
    // entered when there is nothing left to process.
    if (width <= 0)
        return;
    // All pointers passed to the asm point at the end of the row; x is a
    // negative byte offset that counts up to zero ("add; jl" loop).
    x = -width;
    __asm__ volatile(
        "movd          %4, %%mm5 \n"
        "pxor       %%mm7, %%mm7 \n"
        "pshufw $0, %%mm5, %%mm5 \n" // broadcast thresh to all 4 words
        "movq          %6, %%mm6 \n" // 127 in every word
        "movq          %5, %%mm4 \n" // dither values
        "1: \n"
        "movd     (%2,%0), %%mm0 \n" // 4 source pixels
        "movd     (%3,%0), %%mm1 \n" // 2 dc values
        "punpcklbw  %%mm7, %%mm0 \n"
        "punpcklwd  %%mm1, %%mm1 \n" // each dc value covers 2 pixels
        "psllw         $7, %%mm0 \n"
        "pxor       %%mm2, %%mm2 \n"
        "psubw      %%mm0, %%mm1 \n" // delta = dc - pix
        "psubw      %%mm1, %%mm2 \n"
        "pmaxsw     %%mm1, %%mm2 \n" // abs(delta) = max(delta, -delta)
        "pmulhuw    %%mm5, %%mm2 \n" // m = abs(delta) * thresh >> 16
        "psubw      %%mm6, %%mm2 \n"
        "pminsw     %%mm7, %%mm2 \n" // m = -max(0, 127-m)
        "pmullw     %%mm2, %%mm2 \n"
        "paddw      %%mm4, %%mm0 \n" // pix += dither
        "pmulhw     %%mm2, %%mm1 \n"
        "psllw         $2, %%mm1 \n" // m = m*m*delta >> 14
        "paddw      %%mm1, %%mm0 \n" // pix += m
        "psraw         $7, %%mm0 \n"
        "packuswb   %%mm0, %%mm0 \n"
        "movd       %%mm0, (%1,%0) \n" // dst = clip(pix>>7)
        "add           $4, %0 \n"
        "jl 1b \n"
        "emms \n"
        :"+r"(x)
        :"r"(dst+width), "r"(src+width), "r"(dc+width/2),
         "rm"(thresh), "m"(*dithers), "m"(*pw_7f)
        :"memory"
    );
#endif
}
73  
74 
/**
 * Filter one line of pixels with SSSE3 inline assembly, 8 pixels per loop
 * iteration.
 *
 * Any trailing (width & 7) pixels are delegated to ff_gradfun_filter_line_c().
 * When the build does not define HAVE_SSSE3 the function body is empty.
 *
 * @param dst     output pixels
 * @param src     input pixels
 * @param dc      16-bit reference values; each value is applied to two
 *                adjacent pixels (it is duplicated with punpcklwd)
 * @param width   number of pixels to process
 * @param thresh  multiplier applied to |dc - pix| (only the low 16 bits are
 *                broadcast and used)
 * @param dithers 16-bit dither offsets; 8 values are loaded once with movdqa,
 *                so the array must be 16-byte aligned
 */
void ff_gradfun_filter_line_ssse3(uint8_t *dst, const uint8_t *src, const uint16_t *dc, int width, int thresh, const uint16_t *dithers)
{
#if HAVE_SSSE3
    intptr_t x;
    if (width & 7) {
        // could be 10% faster if I somehow eliminated this
        // The asm loop only handles whole groups of 8 pixels:
        // let the C implementation process the tail.
        x = width & ~7;
        ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
        width = x;
    }
    // The asm loop body always runs at least once, so it must not be
    // entered when there is nothing left to process.
    if (width <= 0)
        return;
    // All pointers passed to the asm point at the end of the row; x is a
    // negative byte offset that counts up to zero ("add; jl" loop).
    x = -width;
    __asm__ volatile(
        "movd           %4, %%xmm5 \n"
        "pxor       %%xmm7, %%xmm7 \n"
        "pshuflw $0,%%xmm5, %%xmm5 \n"
        "movdqa         %6, %%xmm6 \n" // 127 in every word
        "punpcklqdq %%xmm5, %%xmm5 \n" // thresh now in all 8 words
        "movdqa         %5, %%xmm4 \n" // dither values
        "1: \n"
        "movq      (%2,%0), %%xmm0 \n" // 8 source pixels
        "movq      (%3,%0), %%xmm1 \n" // 4 dc values
        "punpcklbw  %%xmm7, %%xmm0 \n"
        "punpcklwd  %%xmm1, %%xmm1 \n" // each dc value covers 2 pixels
        "psllw          $7, %%xmm0 \n"
        "psubw      %%xmm0, %%xmm1 \n" // delta = dc - pix
        "pabsw      %%xmm1, %%xmm2 \n"
        "pmulhuw    %%xmm5, %%xmm2 \n" // m = abs(delta) * thresh >> 16
        "psubw      %%xmm6, %%xmm2 \n"
        "pminsw     %%xmm7, %%xmm2 \n" // m = -max(0, 127-m)
        "pmullw     %%xmm2, %%xmm2 \n"
        "psllw          $1, %%xmm2 \n"
        "paddw      %%xmm4, %%xmm0 \n" // pix += dither
        "pmulhrsw   %%xmm2, %%xmm1 \n" // m = m*m*delta >> 14
        "paddw      %%xmm1, %%xmm0 \n" // pix += m
        "psraw          $7, %%xmm0 \n"
        "packuswb   %%xmm0, %%xmm0 \n"
        "movq       %%xmm0, (%1,%0) \n" // dst = clip(pix>>7)
        "add            $8, %0 \n"
        "jl 1b \n"
        :"+&r"(x)
        :"r"(dst+width), "r"(src+width), "r"(dc+width/2),
         "rm"(thresh), "m"(*dithers), "m"(*pw_7f)
        :"memory"
    );
#endif // HAVE_SSSE3
}
120  
121 
/**
 * Update one line of the running blur sums with SSE2 inline assembly,
 * 8 16-bit entries per loop iteration.
 *
 * For every entry i the code computes
 *     v      = (sum of the 2x2 source pixels at columns 2i, 2i+1 of the rows
 *               src and src + src_linesize) + buf1[i]
 *     dc[i]  = v - buf[i]
 *     buf[i] = v
 * using 16-bit wrapping arithmetic.
 *
 * buf, buf1 and dc are accessed with aligned instructions and therefore must
 * be 16-byte aligned at offset `width`; the source rows are read with movdqa
 * only when both src and src_linesize are multiples of 16, movdqu otherwise.
 * When the build does not define HAVE_SSE the function body is empty.
 *
 * @param dc           output differences
 * @param buf          previous sums, overwritten with the new sums
 * @param buf1         sums added to the freshly computed 2x2 pixel sums
 * @param src          first of the two source rows
 * @param src_linesize byte distance between the two source rows
 * @param width        number of 16-bit entries to produce; the loop works in
 *                     steps of 8 entries
 */
void ff_gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, const uint16_t *buf1, const uint8_t *src, int src_linesize, int width)
{
#if HAVE_SSE
// x is a negative byte offset from the end of each row that counts up to
// zero; 2*width bytes cover `width` 16-bit entries as well as 2*width pixels.
#define BLURV(load)\
    intptr_t x = -2*width;\
    __asm__ volatile(\
        "movdqa %6, %%xmm7 \n"\
        "1: \n"\
        load"   (%4,%0), %%xmm0 \n"\
        load"   (%5,%0), %%xmm1 \n"\
        "movdqa %%xmm0, %%xmm2 \n"\
        "movdqa %%xmm1, %%xmm3 \n"\
        "psrlw  $8,     %%xmm0 \n"\
        "psrlw  $8,     %%xmm1 \n"\
        "pand   %%xmm7, %%xmm2 \n"\
        "pand   %%xmm7, %%xmm3 \n"\
        "paddw  %%xmm1, %%xmm0 \n"\
        "paddw  %%xmm3, %%xmm2 \n"\
        "paddw  %%xmm2, %%xmm0 \n"\
        "paddw  (%2,%0), %%xmm0 \n"\
        "movdqa (%1,%0), %%xmm1 \n"\
        "movdqa %%xmm0, (%1,%0) \n"\
        "psubw  %%xmm1, %%xmm0 \n"\
        "movdqa %%xmm0, (%3,%0) \n"\
        "add    $16, %0 \n"\
        "jl 1b \n"\
        :"+&r"(x)\
        :"r"(buf+width),\
         "r"(buf1+width),\
         "r"(dc+width),\
         "r"(src+width*2),\
         "r"(src+width*2+src_linesize),\
         "m"(*pw_ff)\
        :"memory"\
    );
    // The asm loop body always runs at least once, so it must not be
    // entered when there is nothing to process.
    if (width <= 0)
        return;
    // Aligned loads are only valid if both source rows start on a 16-byte
    // boundary, i.e. if neither src nor src_linesize has any low bit set.
    if (((intptr_t) src | src_linesize) & 15) {
        BLURV("movdqu");
    } else {
        BLURV("movdqa");
    }
#undef BLURV
#endif // HAVE_SSE
}