/*
 * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
 *                    Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

  
/**
 * MMX optimized version of (put|avg)_h264_chroma_mc8.
 * H264_CHROMA_MC8_TMPL must be defined to the desired function name
 * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg
 * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
 */
static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd) 
29 
{ 
30 
const uint64_t *rnd_reg;

31 
DECLARE_ALIGNED_8(uint64_t, AA); 
32 
DECLARE_ALIGNED_8(uint64_t, DD); 
33 
int i;

34  
 
if(y==0 && x==0) { 
36 
/* no filter needed */

37 
H264_CHROMA_MC8_MV0(dst, src, stride, h); 
38 
return;

39 
} 
}  
41 
assert(x<8 && y<8 && x>=0 && y>=0); 
42  
 
if(y==0  x==0) 
44 
{ 
{ 
/* 1 dimensional filter only */

46 
const int dxy = x ? 1 : stride; 
47  
48 
rnd_reg = rnd ? &ff_pw_4 : &ff_pw_3; 
49  
 
__asm__ volatile(

51 
"movd %0, %%mm5\n\t"

52 
"movq %1, %%mm4\n\t"

53 
"movq %2, %%mm6\n\t" /* mm6 = rnd */ 
54 
"punpcklwd %%mm5, %%mm5\n\t"

55 
"punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */ 
56 
"pxor %%mm7, %%mm7\n\t"

57 
"psubw %%mm5, %%mm4\n\t" /* mm4 = A = 8x */ 
58 
:: "rm"(x+y), "m"(ff_pw_8), "m"(*rnd_reg)); 
59  
60 
for(i=0; i<h; i++) { 
61 
__asm__ volatile(

62 
/* mm0 = src[0..7], mm1 = src[1..8] */

63 
"movq %0, %%mm0\n\t"

64 
"movq %1, %%mm2\n\t"

65 
:: "m"(src[0]), "m"(src[dxy])); 
66  
 
__asm__ volatile(

68 
/* [mm0,mm1] = A * src[0..7] */

69 
/* [mm2,mm3] = B * src[1..8] */

70 
"movq %%mm0, %%mm1\n\t"

71 
"movq %%mm2, %%mm3\n\t"

72 
"punpcklbw %%mm7, %%mm0\n\t"

73 
"punpckhbw %%mm7, %%mm1\n\t"

74 
"punpcklbw %%mm7, %%mm2\n\t"

75 
"punpckhbw %%mm7, %%mm3\n\t"

76 
"pmullw %%mm4, %%mm0\n\t"

77 
"pmullw %%mm4, %%mm1\n\t"

78 
"pmullw %%mm5, %%mm2\n\t"

79 
"pmullw %%mm5, %%mm3\n\t"

80  
81 


82 
"paddw %%mm6, %%mm0\n\t"

83 
"paddw %%mm6, %%mm0\n\t"

84 
"paddw %%mm6, %%mm1\n\t"

85 
"paddw %%mm2, %%mm0\n\t"

86 
"paddw %%mm3, %%mm1\n\t"

87 
"psrlw $3, %%mm0\n\t"

88 
"psrlw $3, %%mm1\n\t"

89 
"packuswb %%mm1, %%mm0\n\t"

90 
H264_CHROMA_OP(%0, %%mm0)

91 
"movq %%mm0, %0\n\t" 
92  
93 
src += stride; 
94 
dst += stride; 
95 
} 
} 
return;

97 
} 
}  
99 


100 
/* general case, bilinear */ 
101 
rnd_reg = rnd ? &ff_pw_32.a : &ff_pw_28.a; 
102 
"movd %3, %%mm6\n\t"

103 
"movd %3, %%mm6\n\t"

104 
"punpcklwd %%mm4, %%mm4\n\t"

105 
"punpcklwd %%mm6, %%mm6\n\t" 
106 
"punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */ 
107 
"movq %%mm4, %%mm5\n\t"

108 
"movq %%mm4, %%mm5\n\t" 
109 
"psllw $3, %%mm5\n\t"

110 
"psllw $3, %%mm5\n\t"

111 
"psllw $3, %%mm6\n\t"

112 
"movq %%mm5, %%mm7\n\t"

113 
"paddw %%mm6, %%mm7\n\t" 
114 
"movq %%mm4, %1\n\t" /* DD = x * y */ 
115 
"psubw %%mm4, %%mm5\n\t" /* mm5 = B = 8x  xy */ 
116 
"paddw %4, %%mm4\n\t"

117 
"paddw %4, %%mm4\n\t" 
118 
"pxor %%mm7, %%mm7\n\t"

119 
"pxor %%mm7, %%mm7\n\t"

120 
"movq %%mm4, %0\n\t" 
121  
122 
__asm__ volatile(

 
/* mm0 = src[0..7], mm1 = src[1..8] */

124 
"movq %0, %%mm0\n\t"

125 
"movq %1, %%mm1\n\t"

126 
: : "m" (src[0]), "m" (src[1])); 
127  
128 
for(i=0; i<h; i++) { 
129 
 
130  
131 
__asm__ volatile(

132 
/* mm2 = A * src[0..3] + B * src[1..4] */

 
/* mm3 = A * src[4..7] + B * src[5..8] */

134 
"movq %%mm0, %%mm2\n\t"

135 
"movq %%mm1, %%mm3\n\t"

136 
"punpckhbw %%mm7, %%mm0\n\t"

137 
"punpcklbw %%mm7, %%mm1\n\t"

138 
"punpcklbw %%mm7, %%mm2\n\t"

139 
"punpckhbw %%mm7, %%mm3\n\t"

140 
"pmullw %0, %%mm0\n\t"

141 
"pmullw %0, %%mm2\n\t"

142 
"pmullw %%mm5, %%mm1\n\t"

143 
"pmullw %%mm5, %%mm3\n\t"

144 
"paddw %%mm1, %%mm2\n\t"

145 
"paddw %%mm0, %%mm3\n\t"

146 
: : "m" (AA));

147  
148 
"paddw %%mm0, %%mm3\n\t"

149 
: : "m" (AA));

150 


151 
__asm__ volatile(

152 
/* [mm2,mm3] += C * src[0..7] */

153 
"movq %0, %%mm0\n\t"

154 
"movq %%mm0, %%mm1\n\t"

155 
"punpcklbw %%mm7, %%mm0\n\t"

156 
"punpckhbw %%mm7, %%mm1\n\t"

157 
"pmullw %%mm6, %%mm0\n\t"

158 
: : "m" (src[0])); 
159  
160 
__asm__ volatile(

161 
/* [mm2,mm3] += D * src[1..8] */

162 
"movq %1, %%mm1\n\t"

163 
"movq %%mm1, %%mm0\n\t"

164 
"movq %%mm1, %%mm4\n\t"

165 
"punpcklbw %%mm7, %%mm0\n\t"

166 
"punpckhbw %%mm7, %%mm4\n\t"

167 
"pmullw %2, %%mm0\n\t"

168 
"pmullw %2, %%mm4\n\t"

169 
"paddw %%mm0, %%mm2\n\t"

170 
"paddw %%mm4, %%mm3\n\t"

171 
"movq %0, %%mm0\n\t"

172 
: : "m" (src[0]), "m" (src[1]), "m" (DD)); 
173  
174 
__asm__ volatile(

175 
/* dst[0..7] = ([mm2,mm3] + 32) >> 6 */

176 
"paddw %1, %%mm2\n\t"

177 
"paddw %1, %%mm3\n\t"

178 
"psrlw $6, %%mm2\n\t"

179 
"psrlw $6, %%mm3\n\t"

180 
"packuswb %%mm3, %%mm2\n\t"

181 
H264_CHROMA_OP(%0, %%mm2)

182 
"movq %%mm2, %0\n\t"

183 
: "=m" (dst[0]) : "m" (*rnd_reg)); 
184 
dst+= stride; 
185 
} 
186 
} 
187  
188 
static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 
189 
{ 
190 
__asm__ volatile(

191 
"pxor %%mm7, %%mm7 \n\t"

192 
"movd %5, %%mm2 \n\t"

193 
"movd %6, %%mm3 \n\t"

194 
"movq "MANGLE(ff_pw_8)", %%mm4\n\t" 
195 
"movq "MANGLE(ff_pw_8)", %%mm5\n\t" 
196 
"punpcklwd %%mm2, %%mm2 \n\t"

197 
"punpcklwd %%mm3, %%mm3 \n\t"

198 
"punpcklwd %%mm2, %%mm2 \n\t"

199 
"punpcklwd %%mm3, %%mm3 \n\t"

200 
"psubw %%mm2, %%mm4 \n\t"

201 
"psubw %%mm3, %%mm5 \n\t"

202  
203 
"movd (%1), %%mm0 \n\t"

204 
"movd 1(%1), %%mm6 \n\t"

205 
"add %3, %1 \n\t"

206 
"punpcklbw %%mm7, %%mm0 \n\t"

207 
"punpcklbw %%mm7, %%mm6 \n\t"

208 
"pmullw %%mm4, %%mm0 \n\t"

209 
"pmullw %%mm2, %%mm6 \n\t"

210 
"paddw %%mm0, %%mm6 \n\t"

211  
212 
"1: \n\t"

213 
"movd (%1), %%mm0 \n\t"

214 
"movd 1(%1), %%mm1 \n\t"

215 
"add %3, %1 \n\t"

216 
"punpcklbw %%mm7, %%mm0 \n\t"

217 
"punpcklbw %%mm7, %%mm1 \n\t"

218 
"pmullw %%mm4, %%mm0 \n\t"

219 
"pmullw %%mm2, %%mm1 \n\t"

220 
"paddw %%mm0, %%mm1 \n\t"

221 
"movq %%mm1, %%mm0 \n\t"

222 
"pmullw %%mm5, %%mm6 \n\t"

223 
"pmullw %%mm3, %%mm1 \n\t"

224 
"paddw %4, %%mm6 \n\t"

225 
"paddw %%mm6, %%mm1 \n\t"

226 
"psrlw $6, %%mm1 \n\t"

227 
"packuswb %%mm1, %%mm1 \n\t"

228 
H264_CHROMA_OP4((%0), %%mm1, %%mm6)

229 
"movd %%mm1, (%0) \n\t"

230 
"add %3, %0 \n\t"

231 
"movd (%1), %%mm6 \n\t"

232 
"movd 1(%1), %%mm1 \n\t"

233 
"add %3, %1 \n\t"

234 
"punpcklbw %%mm7, %%mm6 \n\t"

235 
"punpcklbw %%mm7, %%mm1 \n\t"

236 
"pmullw %%mm4, %%mm6 \n\t"

237 
"pmullw %%mm2, %%mm1 \n\t"

238 
"paddw %%mm6, %%mm1 \n\t"

239 
"movq %%mm1, %%mm6 \n\t"

240 
"pmullw %%mm5, %%mm0 \n\t"

241 
"pmullw %%mm3, %%mm1 \n\t"

242 
"paddw %4, %%mm0 \n\t"

243 
"paddw %%mm0, %%mm1 \n\t"

244 
"psrlw $6, %%mm1 \n\t"

245 
"packuswb %%mm1, %%mm1 \n\t"

246 
H264_CHROMA_OP4((%0), %%mm1, %%mm0)

247 
"movd %%mm1, (%0) \n\t"

248 
"add %3, %0 \n\t"

249 
"sub $2, %2 \n\t"

250 
"jnz 1b \n\t"

251 
: "+r"(dst), "+r"(src), "+r"(h) 
252 
: "r"((x86_reg)stride), "m"(ff_pw_32), "m"(x), "m"(y) 
253 
); 
254 
} 
255  
256 
#ifdef H264_CHROMA_MC2_TMPL

257 
static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 
258 
{ 
259 
int tmp = ((1<<16)1)*x + 8; 
260 
int CD= tmp*y;

261 
int AB= (tmp<<3)  CD; 
262 
__asm__ volatile(

263 
/* mm5 = {A,B,A,B} */

264 
/* mm6 = {C,D,C,D} */

265 
"movd %0, %%mm5\n\t"

266 
"movd %1, %%mm6\n\t"

267 
"punpckldq %%mm5, %%mm5\n\t"

268 
"punpckldq %%mm6, %%mm6\n\t"

269 
"pxor %%mm7, %%mm7\n\t"

270 
/* mm0 = src[0,1,1,2] */

271 
"movd %2, %%mm2\n\t"

272 
"punpcklbw %%mm7, %%mm2\n\t"

273 
"pshufw $0x94, %%mm2, %%mm2\n\t"

274 
:: "r"(AB), "r"(CD), "m"(src[0])); 
275  
276  
277 
__asm__ volatile(

278 
"1:\n\t"

279 
"add %4, %1\n\t"

280 
/* mm1 = A * src[0,1] + B * src[1,2] */

281 
"movq %%mm2, %%mm1\n\t"

282 
"pmaddwd %%mm5, %%mm1\n\t"

283 
/* mm0 = src[0,1,1,2] */

284 
"movd (%1), %%mm0\n\t"

285 
"punpcklbw %%mm7, %%mm0\n\t"

286 
"pshufw $0x94, %%mm0, %%mm0\n\t"

287 
/* mm1 += C * src[0,1] + D * src[1,2] */

288 
"movq %%mm0, %%mm2\n\t"

289 
"pmaddwd %%mm6, %%mm0\n\t"

290 
"paddw %3, %%mm1\n\t"

291 
"paddw %%mm0, %%mm1\n\t"

292 
/* dst[0,1] = pack((mm1 + 32) >> 6) */

293 
"psrlw $6, %%mm1\n\t"

294 
"packssdw %%mm7, %%mm1\n\t"

295 
"packuswb %%mm7, %%mm1\n\t"

296 
H264_CHROMA_OP4((%0), %%mm1, %%mm3)

297 
"movd %%mm1, %%esi\n\t"

298 
"movw %%si, (%0)\n\t"

299 
"add %4, %0\n\t"

300 
"sub $1, %2\n\t"

301 
"jnz 1b\n\t"

302 
: "+r" (dst), "+r"(src), "+r"(h) 
303 
: "m" (ff_pw_32), "r"((x86_reg)stride) 
304 
: "%esi");

305  
306 
} 
307 
#endif

308 