ffmpeg / libavcodec / i386 / dsputil_h264_template_mmx.c @ 7c4fd7eb
History  View  Annotate  Download (10.9 KB)
1 
/*


2 
* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,

3 
* Loren Merritt

4 
*

5 
* This file is part of FFmpeg.

6 
*

7 
* FFmpeg is free software; you can redistribute it and/or

8 
* modify it under the terms of the GNU Lesser General Public

9 
* License as published by the Free Software Foundation; either

10 
* version 2.1 of the License, or (at your option) any later version.

11 
*

12 
* FFmpeg is distributed in the hope that it will be useful,

13 
* but WITHOUT ANY WARRANTY; without even the implied warranty of

14 
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

15 
* Lesser General Public License for more details.

16 
*

17 
* You should have received a copy of the GNU Lesser General Public

18 
* License along with FFmpeg; if not, write to the Free Software

19 
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

20 
*/

21  
22 
/**

23 
* MMX optimized version of (put|avg)_h264_chroma_mc8.

24 
* H264_CHROMA_MC8_TMPL must be defined to the desired function name

25 
* H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg

26 
* H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function

27 
*/

28 
static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 
29 
{ 
30 
DECLARE_ALIGNED_8(uint64_t, AA); 
31 
DECLARE_ALIGNED_8(uint64_t, DD); 
32 
int i;

33  
34 
if(y==0 && x==0) { 
35 
/* no filter needed */

36 
H264_CHROMA_MC8_MV0(dst, src, stride, h); 
37 
return;

38 
} 
39  
40 
assert(x<8 && y<8 && x>=0 && y>=0); 
41  
42 
if(y==0  x==0) 
43 
{ 
44 
/* 1 dimensional filter only */

45 
const int dxy = x ? 1 : stride; 
46  
47 
asm volatile( 
48 
"movd %0, %%mm5\n\t"

49 
"movq %1, %%mm4\n\t"

50 
"punpcklwd %%mm5, %%mm5\n\t"

51 
"punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */ 
52 
"movq %%mm4, %%mm6\n\t"

53 
"pxor %%mm7, %%mm7\n\t"

54 
"psubw %%mm5, %%mm4\n\t" /* mm4 = A = 8x */ 
55 
"psrlw $1, %%mm6\n\t" /* mm6 = 4 */ 
56 
:: "rm"(x+y), "m"(ff_pw_8)); 
57  
58 
for(i=0; i<h; i++) { 
59 
asm volatile( 
60 
/* mm0 = src[0..7], mm1 = src[1..8] */

61 
"movq %0, %%mm0\n\t"

62 
"movq %1, %%mm2\n\t"

63 
:: "m"(src[0]), "m"(src[dxy])); 
64  
65 
asm volatile( 
66 
/* [mm0,mm1] = A * src[0..7] */

67 
/* [mm2,mm3] = B * src[1..8] */

68 
"movq %%mm0, %%mm1\n\t"

69 
"movq %%mm2, %%mm3\n\t"

70 
"punpcklbw %%mm7, %%mm0\n\t"

71 
"punpckhbw %%mm7, %%mm1\n\t"

72 
"punpcklbw %%mm7, %%mm2\n\t"

73 
"punpckhbw %%mm7, %%mm3\n\t"

74 
"pmullw %%mm4, %%mm0\n\t"

75 
"pmullw %%mm4, %%mm1\n\t"

76 
"pmullw %%mm5, %%mm2\n\t"

77 
"pmullw %%mm5, %%mm3\n\t"

78  
79 
/* dst[0..7] = (A * src[0..7] + B * src[1..8] + 4) >> 3 */

80 
"paddw %%mm6, %%mm0\n\t"

81 
"paddw %%mm6, %%mm1\n\t"

82 
"paddw %%mm2, %%mm0\n\t"

83 
"paddw %%mm3, %%mm1\n\t"

84 
"psrlw $3, %%mm0\n\t"

85 
"psrlw $3, %%mm1\n\t"

86 
"packuswb %%mm1, %%mm0\n\t"

87 
H264_CHROMA_OP(%0, %%mm0)

88 
"movq %%mm0, %0\n\t"

89 
: "=m" (dst[0])); 
90  
91 
src += stride; 
92 
dst += stride; 
93 
} 
94 
return;

95 
} 
96  
97 
/* general case, bilinear */

98 
asm volatile("movd %2, %%mm4\n\t" 
99 
"movd %3, %%mm6\n\t"

100 
"punpcklwd %%mm4, %%mm4\n\t"

101 
"punpcklwd %%mm6, %%mm6\n\t"

102 
"punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */ 
103 
"punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */ 
104 
"movq %%mm4, %%mm5\n\t"

105 
"pmullw %%mm6, %%mm4\n\t" /* mm4 = x * y */ 
106 
"psllw $3, %%mm5\n\t"

107 
"psllw $3, %%mm6\n\t"

108 
"movq %%mm5, %%mm7\n\t"

109 
"paddw %%mm6, %%mm7\n\t"

110 
"movq %%mm4, %1\n\t" /* DD = x * y */ 
111 
"psubw %%mm4, %%mm5\n\t" /* mm5 = B = 8x  xy */ 
112 
"psubw %%mm4, %%mm6\n\t" /* mm6 = C = 8y  xy */ 
113 
"paddw %4, %%mm4\n\t"

114 
"psubw %%mm7, %%mm4\n\t" /* mm4 = A = xy  (8x+8y) + 64 */ 
115 
"pxor %%mm7, %%mm7\n\t"

116 
"movq %%mm4, %0\n\t"

117 
: "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64)); 
118  
119 
asm volatile( 
120 
/* mm0 = src[0..7], mm1 = src[1..8] */

121 
"movq %0, %%mm0\n\t"

122 
"movq %1, %%mm1\n\t"

123 
: : "m" (src[0]), "m" (src[1])); 
124  
125 
for(i=0; i<h; i++) { 
126 
src += stride; 
127  
128 
asm volatile( 
129 
/* mm2 = A * src[0..3] + B * src[1..4] */

130 
/* mm3 = A * src[4..7] + B * src[5..8] */

131 
"movq %%mm0, %%mm2\n\t"

132 
"movq %%mm1, %%mm3\n\t"

133 
"punpckhbw %%mm7, %%mm0\n\t"

134 
"punpcklbw %%mm7, %%mm1\n\t"

135 
"punpcklbw %%mm7, %%mm2\n\t"

136 
"punpckhbw %%mm7, %%mm3\n\t"

137 
"pmullw %0, %%mm0\n\t"

138 
"pmullw %0, %%mm2\n\t"

139 
"pmullw %%mm5, %%mm1\n\t"

140 
"pmullw %%mm5, %%mm3\n\t"

141 
"paddw %%mm1, %%mm2\n\t"

142 
"paddw %%mm0, %%mm3\n\t"

143 
: : "m" (AA));

144  
145 
asm volatile( 
146 
/* [mm2,mm3] += C * src[0..7] */

147 
"movq %0, %%mm0\n\t"

148 
"movq %%mm0, %%mm1\n\t"

149 
"punpcklbw %%mm7, %%mm0\n\t"

150 
"punpckhbw %%mm7, %%mm1\n\t"

151 
"pmullw %%mm6, %%mm0\n\t"

152 
"pmullw %%mm6, %%mm1\n\t"

153 
"paddw %%mm0, %%mm2\n\t"

154 
"paddw %%mm1, %%mm3\n\t"

155 
: : "m" (src[0])); 
156  
157 
asm volatile( 
158 
/* [mm2,mm3] += D * src[1..8] */

159 
"movq %1, %%mm1\n\t"

160 
"movq %%mm1, %%mm0\n\t"

161 
"movq %%mm1, %%mm4\n\t"

162 
"punpcklbw %%mm7, %%mm0\n\t"

163 
"punpckhbw %%mm7, %%mm4\n\t"

164 
"pmullw %2, %%mm0\n\t"

165 
"pmullw %2, %%mm4\n\t"

166 
"paddw %%mm0, %%mm2\n\t"

167 
"paddw %%mm4, %%mm3\n\t"

168 
"movq %0, %%mm0\n\t"

169 
: : "m" (src[0]), "m" (src[1]), "m" (DD)); 
170  
171 
asm volatile( 
172 
/* dst[0..7] = ([mm2,mm3] + 32) >> 6 */

173 
"paddw %1, %%mm2\n\t"

174 
"paddw %1, %%mm3\n\t"

175 
"psrlw $6, %%mm2\n\t"

176 
"psrlw $6, %%mm3\n\t"

177 
"packuswb %%mm3, %%mm2\n\t"

178 
H264_CHROMA_OP(%0, %%mm2)

179 
"movq %%mm2, %0\n\t"

180 
: "=m" (dst[0]) : "m" (ff_pw_32)); 
181 
dst+= stride; 
182 
} 
183 
} 
184  
185 
static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 
186 
{ 
187 
DECLARE_ALIGNED_8(uint64_t, AA); 
188 
DECLARE_ALIGNED_8(uint64_t, DD); 
189 
int i;

190  
191 
/* no special case for mv=(0,0) in 4x*, since it's much less common than in 8x*.

192 
* could still save a few cycles, but maybe not worth the complexity. */

193  
194 
assert(x<8 && y<8 && x>=0 && y>=0); 
195  
196 
asm volatile("movd %2, %%mm4\n\t" 
197 
"movd %3, %%mm6\n\t"

198 
"punpcklwd %%mm4, %%mm4\n\t"

199 
"punpcklwd %%mm6, %%mm6\n\t"

200 
"punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */ 
201 
"punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */ 
202 
"movq %%mm4, %%mm5\n\t"

203 
"pmullw %%mm6, %%mm4\n\t" /* mm4 = x * y */ 
204 
"psllw $3, %%mm5\n\t"

205 
"psllw $3, %%mm6\n\t"

206 
"movq %%mm5, %%mm7\n\t"

207 
"paddw %%mm6, %%mm7\n\t"

208 
"movq %%mm4, %1\n\t" /* DD = x * y */ 
209 
"psubw %%mm4, %%mm5\n\t" /* mm5 = B = 8x  xy */ 
210 
"psubw %%mm4, %%mm6\n\t" /* mm6 = C = 8y  xy */ 
211 
"paddw %4, %%mm4\n\t"

212 
"psubw %%mm7, %%mm4\n\t" /* mm4 = A = xy  (8x+8y) + 64 */ 
213 
"pxor %%mm7, %%mm7\n\t"

214 
"movq %%mm4, %0\n\t"

215 
: "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64)); 
216  
217 
asm volatile( 
218 
/* mm0 = src[0..3], mm1 = src[1..4] */

219 
"movd %0, %%mm0\n\t"

220 
"movd %1, %%mm1\n\t"

221 
"punpcklbw %%mm7, %%mm0\n\t"

222 
"punpcklbw %%mm7, %%mm1\n\t"

223 
: : "m" (src[0]), "m" (src[1])); 
224  
225 
for(i=0; i<h; i++) { 
226 
asm volatile( 
227 
/* mm2 = A * src[0..3] + B * src[1..4] */

228 
"movq %%mm0, %%mm2\n\t"

229 
"pmullw %0, %%mm2\n\t"

230 
"pmullw %%mm5, %%mm1\n\t"

231 
"paddw %%mm1, %%mm2\n\t"

232 
: : "m" (AA));

233  
234 
src += stride; 
235 
asm volatile( 
236 
/* mm0 = src[0..3], mm1 = src[1..4] */

237 
"movd %0, %%mm0\n\t"

238 
"movd %1, %%mm1\n\t"

239 
"punpcklbw %%mm7, %%mm0\n\t"

240 
"punpcklbw %%mm7, %%mm1\n\t"

241 
: : "m" (src[0]), "m" (src[1])); 
242  
243 
asm volatile( 
244 
/* mm2 += C * src[0..3] + D * src[1..4] */

245 
"movq %%mm0, %%mm3\n\t"

246 
"movq %%mm1, %%mm4\n\t"

247 
"pmullw %%mm6, %%mm3\n\t"

248 
"pmullw %0, %%mm4\n\t"

249 
"paddw %%mm3, %%mm2\n\t"

250 
"paddw %%mm4, %%mm2\n\t"

251 
: : "m" (DD));

252  
253 
asm volatile( 
254 
/* dst[0..3] = pack((mm2 + 32) >> 6) */

255 
"paddw %1, %%mm2\n\t"

256 
"psrlw $6, %%mm2\n\t"

257 
"packuswb %%mm7, %%mm2\n\t"

258 
H264_CHROMA_OP4(%0, %%mm2, %%mm3)

259 
"movd %%mm2, %0\n\t"

260 
: "=m" (dst[0]) : "m" (ff_pw_32)); 
261 
dst += stride; 
262 
} 
263 
} 
264  
265 
#ifdef H264_CHROMA_MC2_TMPL
/*
 * 2-pixel-wide chroma interpolation over h rows.
 *
 * dst    destination rows, advanced by stride; 4 bytes are stored per row
 *        although only the first 2 are meaningful (see comment in the loop)
 * src    source rows (unaligned), advanced by stride
 * stride byte distance between consecutive rows of both src and dst
 * h      number of rows; the loop body runs before the counter is tested,
 *        so h must be at least 1
 * x, y   fractional offsets
 *
 * The four bilinear weights are packed two per 32-bit integer, low word
 * first:
 *   tmp = (x << 16) + (8 - x)          -> { 8-x, x }
 *   CD  = tmp * y                      -> { C = (8-x)y,     D = xy     }
 *   AB  = tmp * 8 - CD = tmp * (8 - y) -> { A = (8-x)(8-y), B = x(8-y) }
 * so that a single pmaddwd against src[0,1,1,2] yields both horizontal
 * two-tap sums for a row.
 *
 * NOTE(review): the loop advances the pointers with addl, i.e. as 32-bit
 * registers; that matches the i386 directory this file lives in, but
 * confirm before building for a 64-bit target.
 * NOTE(review): pshufw is not part of the base MMX instruction set, which
 * is presumably why this function is only built when the including file
 * defines H264_CHROMA_MC2_TMPL.
 */
static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    int tmp = ((1<<16)-1)*x + 8;
    int CD= tmp*y;
    int AB= (tmp<<3) - CD;
    asm volatile(
        /* mm5 = {A,B,A,B} */
        /* mm6 = {C,D,C,D} */
        "movd %0, %%mm5\n\t"
        "movd %1, %%mm6\n\t"
        "punpckldq %%mm5, %%mm5\n\t"
        "punpckldq %%mm6, %%mm6\n\t"
        "pxor %%mm7, %%mm7\n\t"
        /* mm0 = src[0,1,1,2] */
        "movd %2, %%mm0\n\t"
        "punpcklbw %%mm7, %%mm0\n\t"
        "pshufw $0x94, %%mm0, %%mm0\n\t"
        :: "r"(AB), "r"(CD), "m"(src[0]));


    asm volatile(
        "1:\n\t"
        "addl %4, %1\n\t"
        /* mm1 = A * src[0,1] + B * src[1,2] */
        "movq %%mm0, %%mm1\n\t"
        "pmaddwd %%mm5, %%mm1\n\t"
        /* mm0 = src[0,1,1,2] */
        "movd (%1), %%mm0\n\t"
        "punpcklbw %%mm7, %%mm0\n\t"
        "pshufw $0x94, %%mm0, %%mm0\n\t"
        /* mm1 += C * src[0,1] + D * src[1,2] */
        "movq %%mm0, %%mm2\n\t"
        "pmaddwd %%mm6, %%mm2\n\t"
        "paddw %%mm2, %%mm1\n\t"
        /* dst[0,1] = pack((mm1 + 32) >> 6) */
        "paddw %3, %%mm1\n\t"
        "psrlw $6, %%mm1\n\t"
        "packssdw %%mm7, %%mm1\n\t"
        "packuswb %%mm7, %%mm1\n\t"
        /* writes garbage to the right of dst.
         * ok because partitions are processed from left to right. */
        H264_CHROMA_OP4((%0), %%mm1, %%mm3)
        "movd %%mm1, (%0)\n\t"
        "addl %4, %0\n\t"
        "subl $1, %2\n\t"
        "jnz 1b\n\t"
        : "+r" (dst), "+r"(src), "+r"(h) : "m" (ff_pw_32), "r"(stride));

}
#endif

316 