1 
/*


2 
* Copyright (c) 2008 Loren Merritt

3 
*

4 
* This file is part of FFmpeg.

5 
*

6 
* FFmpeg is free software; you can redistribute it and/or

7 
* modify it under the terms of the GNU Lesser General Public

8 
* License as published by the Free Software Foundation; either

9 
* version 2.1 of the License, or (at your option) any later version.

10 
*

11 
* FFmpeg is distributed in the hope that it will be useful,

12 
* but WITHOUT ANY WARRANTY; without even the implied warranty of

13 
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

14 
* Lesser General Public License for more details.

15 
*

16 
* You should have received a copy of the GNU Lesser General Public

17 
* License along with FFmpeg; if not, write to the Free Software

18 
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

19 
*/

20  
21 
/**

22 
* SSSE3 optimized version of (put|avg)_h264_chroma_mc8.

23 
* H264_CHROMA_MC8_TMPL must be defined to the desired function name

24 
* H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function

25 
* AVG_OP must be defined to empty for put and the identity for avg

26 
*/

27 
static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd) 
28 
{ 
29 
if(y==0 && x==0) { 
30 
/* no filter needed */

31 
H264_CHROMA_MC8_MV0(dst, src, stride, h); 
32 
return;

33 
} 
34  
35 
assert(x<8 && y<8 && x>=0 && y>=0); 
36  
37 
if(y==0  x==0) 
38 
{ 
39 
/* 1 dimensional filter only */

40 
__asm__ volatile(

41 
"movd %0, %%xmm7 \n\t"

42 
"movq %1, %%xmm6 \n\t"

43 
"pshuflw $0, %%xmm7, %%xmm7 \n\t"

44 
"movlhps %%xmm6, %%xmm6 \n\t"

45 
"movlhps %%xmm7, %%xmm7 \n\t"

46 
:: "r"(255*(x+y)+8), "m"(*(rnd?&ff_pw_4:&ff_pw_3)) 
47 
); 
48  
49 
if(x) {

50 
__asm__ volatile(

51 
"1: \n\t"

52 
"movq (%1), %%xmm0 \n\t"

53 
"movq 1(%1), %%xmm1 \n\t"

54 
"movq (%1,%3), %%xmm2 \n\t"

55 
"movq 1(%1,%3), %%xmm3 \n\t"

56 
"punpcklbw %%xmm1, %%xmm0 \n\t"

57 
"punpcklbw %%xmm3, %%xmm2 \n\t"

58 
"pmaddubsw %%xmm7, %%xmm0 \n\t"

59 
"pmaddubsw %%xmm7, %%xmm2 \n\t"

60 
AVG_OP("movq (%0), %%xmm4 \n\t")

61 
AVG_OP("movhps (%0,%3), %%xmm4 \n\t")

62 
"paddw %%xmm6, %%xmm0 \n\t"

63 
"paddw %%xmm6, %%xmm2 \n\t"

64 
"psrlw $3, %%xmm0 \n\t"

65 
"psrlw $3, %%xmm2 \n\t"

66 
"packuswb %%xmm2, %%xmm0 \n\t"

67 
AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")

68 
"movq %%xmm0, (%0) \n\t"

69 
"movhps %%xmm0, (%0,%3) \n\t"

70 
"sub $2, %2 \n\t"

71 
"lea (%1,%3,2), %1 \n\t"

72 
"lea (%0,%3,2), %0 \n\t"

73 
"jg 1b \n\t"

74 
:"+r"(dst), "+r"(src), "+r"(h) 
75 
:"r"((x86_reg)stride)

76 
); 
77 
} else {

78 
__asm__ volatile(

79 
"1: \n\t"

80 
"movq (%1), %%xmm0 \n\t"

81 
"movq (%1,%3), %%xmm1 \n\t"

82 
"movdqa %%xmm1, %%xmm2 \n\t"

83 
"movq (%1,%3,2), %%xmm3 \n\t"

84 
"punpcklbw %%xmm1, %%xmm0 \n\t"

85 
"punpcklbw %%xmm3, %%xmm2 \n\t"

86 
"pmaddubsw %%xmm7, %%xmm0 \n\t"

87 
"pmaddubsw %%xmm7, %%xmm2 \n\t"

88 
AVG_OP("movq (%0), %%xmm4 \n\t")

89 
AVG_OP("movhps (%0,%3), %%xmm4 \n\t")

90 
"paddw %%xmm6, %%xmm0 \n\t"

91 
"paddw %%xmm6, %%xmm2 \n\t"

92 
"psrlw $3, %%xmm0 \n\t"

93 
"psrlw $3, %%xmm2 \n\t"

94 
"packuswb %%xmm2, %%xmm0 \n\t"

95 
AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")

96 
"movq %%xmm0, (%0) \n\t"

97 
"movhps %%xmm0, (%0,%3) \n\t"

98 
"sub $2, %2 \n\t"

99 
"lea (%1,%3,2), %1 \n\t"

100 
"lea (%0,%3,2), %0 \n\t"

101 
"jg 1b \n\t"

102 
:"+r"(dst), "+r"(src), "+r"(h) 
103 
:"r"((x86_reg)stride)

104 
); 
105 
} 
106 
return;

107 
} 
108  
109 
/* general case, bilinear */

110 
__asm__ volatile(

111 
"movd %0, %%xmm7 \n\t"

112 
"movd %1, %%xmm6 \n\t"

113 
"movdqa %2, %%xmm5 \n\t"

114 
"pshuflw $0, %%xmm7, %%xmm7 \n\t"

115 
"pshuflw $0, %%xmm6, %%xmm6 \n\t"

116 
"movlhps %%xmm7, %%xmm7 \n\t"

117 
"movlhps %%xmm6, %%xmm6 \n\t"

118 
:: "r"((x*255+8)*(8y)), "r"((x*255+8)*y), "m"(*(rnd?&ff_pw_32:&ff_pw_28)) 
119 
); 
120  
121 
__asm__ volatile(

122 
"movq (%1), %%xmm0 \n\t"

123 
"movq 1(%1), %%xmm1 \n\t"

124 
"punpcklbw %%xmm1, %%xmm0 \n\t"

125 
"add %3, %1 \n\t"

126 
"1: \n\t"

127 
"movq (%1), %%xmm1 \n\t"

128 
"movq 1(%1), %%xmm2 \n\t"

129 
"movq (%1,%3), %%xmm3 \n\t"

130 
"movq 1(%1,%3), %%xmm4 \n\t"

131 
"lea (%1,%3,2), %1 \n\t"

132 
"punpcklbw %%xmm2, %%xmm1 \n\t"

133 
"punpcklbw %%xmm4, %%xmm3 \n\t"

134 
"movdqa %%xmm1, %%xmm2 \n\t"

135 
"movdqa %%xmm3, %%xmm4 \n\t"

136 
"pmaddubsw %%xmm7, %%xmm0 \n\t"

137 
"pmaddubsw %%xmm6, %%xmm1 \n\t"

138 
"pmaddubsw %%xmm7, %%xmm2 \n\t"

139 
"pmaddubsw %%xmm6, %%xmm3 \n\t"

140 
"paddw %%xmm5, %%xmm0 \n\t"

141 
"paddw %%xmm5, %%xmm2 \n\t"

142 
"paddw %%xmm0, %%xmm1 \n\t"

143 
"paddw %%xmm2, %%xmm3 \n\t"

144 
"movdqa %%xmm4, %%xmm0 \n\t"

145 
"psrlw $6, %%xmm1 \n\t"

146 
"psrlw $6, %%xmm3 \n\t"

147 
AVG_OP("movq (%0), %%xmm2 \n\t")

148 
AVG_OP("movhps (%0,%3), %%xmm2 \n\t")

149 
"packuswb %%xmm3, %%xmm1 \n\t"

150 
AVG_OP("pavgb %%xmm2, %%xmm1 \n\t")

151 
"movq %%xmm1, (%0)\n\t"

152 
"movhps %%xmm1, (%0,%3)\n\t"

153 
"sub $2, %2 \n\t"

154 
"lea (%0,%3,2), %0 \n\t"

155 
"jg 1b \n\t"

156 
:"+r"(dst), "+r"(src), "+r"(h) 
157 
:"r"((x86_reg)stride)

158 
); 
159 
} 
160  
161 
static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 
162 
{ 
163 
__asm__ volatile(

164 
"movd %0, %%mm7 \n\t"

165 
"movd %1, %%mm6 \n\t"

166 
"movq %2, %%mm5 \n\t"

167 
"pshufw $0, %%mm7, %%mm7 \n\t"

168 
"pshufw $0, %%mm6, %%mm6 \n\t"

169 
:: "r"((x*255+8)*(8y)), "r"((x*255+8)*y), "m"(ff_pw_32) 
170 
); 
171  
172 
__asm__ volatile(

173 
"movd (%1), %%mm0 \n\t"

174 
"punpcklbw 1(%1), %%mm0 \n\t"

175 
"add %3, %1 \n\t"

176 
"1: \n\t"

177 
"movd (%1), %%mm1 \n\t"

178 
"movd (%1,%3), %%mm3 \n\t"

179 
"punpcklbw 1(%1), %%mm1 \n\t"

180 
"punpcklbw 1(%1,%3), %%mm3 \n\t"

181 
"lea (%1,%3,2), %1 \n\t"

182 
"movq %%mm1, %%mm2 \n\t"

183 
"movq %%mm3, %%mm4 \n\t"

184 
"pmaddubsw %%mm7, %%mm0 \n\t"

185 
"pmaddubsw %%mm6, %%mm1 \n\t"

186 
"pmaddubsw %%mm7, %%mm2 \n\t"

187 
"pmaddubsw %%mm6, %%mm3 \n\t"

188 
"paddw %%mm5, %%mm0 \n\t"

189 
"paddw %%mm5, %%mm2 \n\t"

190 
"paddw %%mm0, %%mm1 \n\t"

191 
"paddw %%mm2, %%mm3 \n\t"

192 
"movq %%mm4, %%mm0 \n\t"

193 
"psrlw $6, %%mm1 \n\t"

194 
"psrlw $6, %%mm3 \n\t"

195 
"packuswb %%mm1, %%mm1 \n\t"

196 
"packuswb %%mm3, %%mm3 \n\t"

197 
AVG_OP("pavgb (%0), %%mm1 \n\t")

198 
AVG_OP("pavgb (%0,%3), %%mm3 \n\t")

199 
"movd %%mm1, (%0)\n\t"

200 
"movd %%mm3, (%0,%3)\n\t"

201 
"sub $2, %2 \n\t"

202 
"lea (%0,%3,2), %0 \n\t"

203 
"jg 1b \n\t"

204 
:"+r"(dst), "+r"(src), "+r"(h) 
205 
:"r"((x86_reg)stride)

206 
); 
207 
} 
208 