ffmpeg / libavcodec / x86 / dsputil_h264_template_mmx.c @ 84dc2d8a
History  View  Annotate  Download (10.7 KB)
1 
/*
 * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
 *                    Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * MMX optimized version of (put|avg)_h264_chroma_mc8.
 * H264_CHROMA_MC8_TMPL must be defined to the desired function name
 * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg
 * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
 */

28 
static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg) 
29 
{ 
30 
DECLARE_ALIGNED(8, uint64_t, AA);

31 
DECLARE_ALIGNED(8, uint64_t, DD);

32 
int i;

33  
34 
if(y==0 && x==0) { 
35 
/* no filter needed */

36 
H264_CHROMA_MC8_MV0(dst, src, stride, h); 
37 
return;

38 
} 
39  
40 
assert(x<8 && y<8 && x>=0 && y>=0); 
41  
42 
if(y==0  x==0) 
43 
{ 
44 
/* 1 dimensional filter only */

45 
const int dxy = x ? 1 : stride; 
46  
47 
__asm__ volatile(

48 
"movd %0, %%mm5\n\t"

49 
"movq %1, %%mm4\n\t"

50 
"movq %2, %%mm6\n\t" /* mm6 = rnd >> 3 */ 
51 
"punpcklwd %%mm5, %%mm5\n\t"

52 
"punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */ 
53 
"pxor %%mm7, %%mm7\n\t"

54 
"psubw %%mm5, %%mm4\n\t" /* mm4 = A = 8x */ 
55 
:: "rm"(x+y), "m"(ff_pw_8), "m"(*(rnd_reg+1))); 
56  
57 
for(i=0; i<h; i++) { 
58 
__asm__ volatile(

59 
/* mm0 = src[0..7], mm1 = src[1..8] */

60 
"movq %0, %%mm0\n\t"

61 
"movq %1, %%mm2\n\t"

62 
:: "m"(src[0]), "m"(src[dxy])); 
63  
64 
__asm__ volatile(

65 
/* [mm0,mm1] = A * src[0..7] */

66 
/* [mm2,mm3] = B * src[1..8] */

67 
"movq %%mm0, %%mm1\n\t"

68 
"movq %%mm2, %%mm3\n\t"

69 
"punpcklbw %%mm7, %%mm0\n\t"

70 
"punpckhbw %%mm7, %%mm1\n\t"

71 
"punpcklbw %%mm7, %%mm2\n\t"

72 
"punpckhbw %%mm7, %%mm3\n\t"

73 
"pmullw %%mm4, %%mm0\n\t"

74 
"pmullw %%mm4, %%mm1\n\t"

75 
"pmullw %%mm5, %%mm2\n\t"

76 
"pmullw %%mm5, %%mm3\n\t"

77  
78 
/* dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3 */

79 
"paddw %%mm6, %%mm0\n\t"

80 
"paddw %%mm6, %%mm1\n\t"

81 
"paddw %%mm2, %%mm0\n\t"

82 
"paddw %%mm3, %%mm1\n\t"

83 
"psrlw $3, %%mm0\n\t"

84 
"psrlw $3, %%mm1\n\t"

85 
"packuswb %%mm1, %%mm0\n\t"

86 
H264_CHROMA_OP(%0, %%mm0)

87 
"movq %%mm0, %0\n\t"

88 
: "=m" (dst[0])); 
89  
90 
src += stride; 
91 
dst += stride; 
92 
} 
93 
return;

94 
} 
95  
96 
/* general case, bilinear */

97 
__asm__ volatile("movd %2, %%mm4\n\t" 
98 
"movd %3, %%mm6\n\t"

99 
"punpcklwd %%mm4, %%mm4\n\t"

100 
"punpcklwd %%mm6, %%mm6\n\t"

101 
"punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */ 
102 
"punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */ 
103 
"movq %%mm4, %%mm5\n\t"

104 
"pmullw %%mm6, %%mm4\n\t" /* mm4 = x * y */ 
105 
"psllw $3, %%mm5\n\t"

106 
"psllw $3, %%mm6\n\t"

107 
"movq %%mm5, %%mm7\n\t"

108 
"paddw %%mm6, %%mm7\n\t"

109 
"movq %%mm4, %1\n\t" /* DD = x * y */ 
110 
"psubw %%mm4, %%mm5\n\t" /* mm5 = B = 8x  xy */ 
111 
"psubw %%mm4, %%mm6\n\t" /* mm6 = C = 8y  xy */ 
112 
"paddw %4, %%mm4\n\t"

113 
"psubw %%mm7, %%mm4\n\t" /* mm4 = A = xy  (8x+8y) + 64 */ 
114 
"pxor %%mm7, %%mm7\n\t"

115 
"movq %%mm4, %0\n\t"

116 
: "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64)); 
117  
118 
__asm__ volatile(

119 
/* mm0 = src[0..7], mm1 = src[1..8] */

120 
"movq %0, %%mm0\n\t"

121 
"movq %1, %%mm1\n\t"

122 
: : "m" (src[0]), "m" (src[1])); 
123  
124 
for(i=0; i<h; i++) { 
125 
src += stride; 
126  
127 
__asm__ volatile(

128 
/* mm2 = A * src[0..3] + B * src[1..4] */

129 
/* mm3 = A * src[4..7] + B * src[5..8] */

130 
"movq %%mm0, %%mm2\n\t"

131 
"movq %%mm1, %%mm3\n\t"

132 
"punpckhbw %%mm7, %%mm0\n\t"

133 
"punpcklbw %%mm7, %%mm1\n\t"

134 
"punpcklbw %%mm7, %%mm2\n\t"

135 
"punpckhbw %%mm7, %%mm3\n\t"

136 
"pmullw %0, %%mm0\n\t"

137 
"pmullw %0, %%mm2\n\t"

138 
"pmullw %%mm5, %%mm1\n\t"

139 
"pmullw %%mm5, %%mm3\n\t"

140 
"paddw %%mm1, %%mm2\n\t"

141 
"paddw %%mm0, %%mm3\n\t"

142 
: : "m" (AA));

143  
144 
__asm__ volatile(

145 
/* [mm2,mm3] += C * src[0..7] */

146 
"movq %0, %%mm0\n\t"

147 
"movq %%mm0, %%mm1\n\t"

148 
"punpcklbw %%mm7, %%mm0\n\t"

149 
"punpckhbw %%mm7, %%mm1\n\t"

150 
"pmullw %%mm6, %%mm0\n\t"

151 
"pmullw %%mm6, %%mm1\n\t"

152 
"paddw %%mm0, %%mm2\n\t"

153 
"paddw %%mm1, %%mm3\n\t"

154 
: : "m" (src[0])); 
155  
156 
__asm__ volatile(

157 
/* [mm2,mm3] += D * src[1..8] */

158 
"movq %1, %%mm1\n\t"

159 
"movq %%mm1, %%mm0\n\t"

160 
"movq %%mm1, %%mm4\n\t"

161 
"punpcklbw %%mm7, %%mm0\n\t"

162 
"punpckhbw %%mm7, %%mm4\n\t"

163 
"pmullw %2, %%mm0\n\t"

164 
"pmullw %2, %%mm4\n\t"

165 
"paddw %%mm0, %%mm2\n\t"

166 
"paddw %%mm4, %%mm3\n\t"

167 
"movq %0, %%mm0\n\t"

168 
: : "m" (src[0]), "m" (src[1]), "m" (DD)); 
169  
170 
__asm__ volatile(

171 
/* dst[0..7] = ([mm2,mm3] + rnd) >> 6 */

172 
"paddw %1, %%mm2\n\t"

173 
"paddw %1, %%mm3\n\t"

174 
"psrlw $6, %%mm2\n\t"

175 
"psrlw $6, %%mm3\n\t"

176 
"packuswb %%mm3, %%mm2\n\t"

177 
H264_CHROMA_OP(%0, %%mm2)

178 
"movq %%mm2, %0\n\t"

179 
: "=m" (dst[0]) : "m" (*rnd_reg)); 
180 
dst+= stride; 
181 
} 
182 
} 
183  
184 
static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg) 
185 
{ 
186 
__asm__ volatile(

187 
"pxor %%mm7, %%mm7 \n\t"

188 
"movd %5, %%mm2 \n\t"

189 
"movd %6, %%mm3 \n\t"

190 
"movq "MANGLE(ff_pw_8)", %%mm4\n\t" 
191 
"movq "MANGLE(ff_pw_8)", %%mm5\n\t" 
192 
"punpcklwd %%mm2, %%mm2 \n\t"

193 
"punpcklwd %%mm3, %%mm3 \n\t"

194 
"punpcklwd %%mm2, %%mm2 \n\t"

195 
"punpcklwd %%mm3, %%mm3 \n\t"

196 
"psubw %%mm2, %%mm4 \n\t"

197 
"psubw %%mm3, %%mm5 \n\t"

198  
199 
"movd (%1), %%mm0 \n\t"

200 
"movd 1(%1), %%mm6 \n\t"

201 
"add %3, %1 \n\t"

202 
"punpcklbw %%mm7, %%mm0 \n\t"

203 
"punpcklbw %%mm7, %%mm6 \n\t"

204 
"pmullw %%mm4, %%mm0 \n\t"

205 
"pmullw %%mm2, %%mm6 \n\t"

206 
"paddw %%mm0, %%mm6 \n\t"

207  
208 
"1: \n\t"

209 
"movd (%1), %%mm0 \n\t"

210 
"movd 1(%1), %%mm1 \n\t"

211 
"add %3, %1 \n\t"

212 
"punpcklbw %%mm7, %%mm0 \n\t"

213 
"punpcklbw %%mm7, %%mm1 \n\t"

214 
"pmullw %%mm4, %%mm0 \n\t"

215 
"pmullw %%mm2, %%mm1 \n\t"

216 
"paddw %%mm0, %%mm1 \n\t"

217 
"movq %%mm1, %%mm0 \n\t"

218 
"pmullw %%mm5, %%mm6 \n\t"

219 
"pmullw %%mm3, %%mm1 \n\t"

220 
"paddw %4, %%mm6 \n\t"

221 
"paddw %%mm6, %%mm1 \n\t"

222 
"psrlw $6, %%mm1 \n\t"

223 
"packuswb %%mm1, %%mm1 \n\t"

224 
H264_CHROMA_OP4((%0), %%mm1, %%mm6)

225 
"movd %%mm1, (%0) \n\t"

226 
"add %3, %0 \n\t"

227 
"movd (%1), %%mm6 \n\t"

228 
"movd 1(%1), %%mm1 \n\t"

229 
"add %3, %1 \n\t"

230 
"punpcklbw %%mm7, %%mm6 \n\t"

231 
"punpcklbw %%mm7, %%mm1 \n\t"

232 
"pmullw %%mm4, %%mm6 \n\t"

233 
"pmullw %%mm2, %%mm1 \n\t"

234 
"paddw %%mm6, %%mm1 \n\t"

235 
"movq %%mm1, %%mm6 \n\t"

236 
"pmullw %%mm5, %%mm0 \n\t"

237 
"pmullw %%mm3, %%mm1 \n\t"

238 
"paddw %4, %%mm0 \n\t"

239 
"paddw %%mm0, %%mm1 \n\t"

240 
"psrlw $6, %%mm1 \n\t"

241 
"packuswb %%mm1, %%mm1 \n\t"

242 
H264_CHROMA_OP4((%0), %%mm1, %%mm0)

243 
"movd %%mm1, (%0) \n\t"

244 
"add %3, %0 \n\t"

245 
"sub $2, %2 \n\t"

246 
"jnz 1b \n\t"

247 
: "+r"(dst), "+r"(src), "+r"(h) 
248 
: "r"((x86_reg)stride), "m"(*rnd_reg), "m"(x), "m"(y) 
249 
); 
250 
} 
251  
252 
#ifdef H264_CHROMA_MC2_TMPL
/**
 * 2x-wide H.264 chroma motion compensation, bilinear 1/8-pel filter.
 *
 * Packs the four bilinear weights pairwise: with t = (2^16 - 1)*x + 8
 * (i.e. -x in the high 16 bits, 8-x... encoded so that one 32-bit
 * value holds {B = x, A = 8-x} and CD = t*y holds {D = x*y, C = (8-x)*y}),
 * each output pair is computed with a single pmaddwd per term.
 * Requires pshufw (MMXEXT/3DNow!).
 *
 * @param dst    destination, 2-byte aligned
 * @param src    source, may be unaligned
 * @param stride line stride for both src and dst
 * @param h      number of rows to process
 * @param x      horizontal fractional position, 0..7
 * @param y      vertical fractional position, 0..7
 */
static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    int tmp = ((1<<16)-1)*x + 8;   /* packed words: {-x, 8} -> scaled below */
    int CD= tmp*y;                 /* packed {D, C} = y * {x, 8-x} (mod 2^16) */
    int AB= (tmp<<3) - CD;         /* packed {B, A} = (8-y) * {x, 8-x} */
    __asm__ volatile(
        /* mm5 = {A,B,A,B} */
        /* mm6 = {C,D,C,D} */
        "movd %0, %%mm5\n\t"
        "movd %1, %%mm6\n\t"
        "punpckldq %%mm5, %%mm5\n\t"
        "punpckldq %%mm6, %%mm6\n\t"
        "pxor %%mm7, %%mm7\n\t"
        /* mm0 = src[0,1,1,2] */
        "movd %2, %%mm2\n\t"
        "punpcklbw %%mm7, %%mm2\n\t"
        "pshufw $0x94, %%mm2, %%mm2\n\t" /* duplicate middle pixel: 0,1,1,2 */
        :: "r"(AB), "r"(CD), "m"(src[0]));


    __asm__ volatile(
        "1:\n\t"
        "add %4, %1\n\t"
        /* mm1 = A * src[0,1] + B * src[1,2] */
        "movq %%mm2, %%mm1\n\t"
        "pmaddwd %%mm5, %%mm1\n\t"
        /* mm0 = src[0,1,1,2] of the next row */
        "movd (%1), %%mm0\n\t"
        "punpcklbw %%mm7, %%mm0\n\t"
        "pshufw $0x94, %%mm0, %%mm0\n\t"
        /* mm1 += C * src[0,1] + D * src[1,2] */
        "movq %%mm0, %%mm2\n\t"          /* keep for next iteration's A/B term */
        "pmaddwd %%mm6, %%mm0\n\t"
        "paddw %3, %%mm1\n\t"            /* + 32 */
        "paddw %%mm0, %%mm1\n\t"
        /* dst[0,1] = pack((mm1 + 32) >> 6) */
        "psrlw $6, %%mm1\n\t"
        "packssdw %%mm7, %%mm1\n\t"
        "packuswb %%mm7, %%mm1\n\t"
        H264_CHROMA_OP4((%0), %%mm1, %%mm3)
        "movd %%mm1, %%esi\n\t"          /* 2-byte store via esi/si */
        "movw %%si, (%0)\n\t"
        "add %4, %0\n\t"
        "sub $1, %2\n\t"
        "jnz 1b\n\t"
        : "+r" (dst), "+r"(src), "+r"(h)
        : "m" (ff_pw_32), "r"((x86_reg)stride)
        : "%esi");

}
#endif

304 