/*
 * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
 *                    Loren Merritt
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * MMX optimized version of (put|avg)_h264_chroma_mc8.
 * H264_CHROMA_MC8_TMPL must be defined to the desired function name
 * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg
 * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
 */

static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
||

{
||

uint64_t AA __align8;
||

uint64_t DD __align8;
||

30 | ```
int i;
``` |
||

31 | |||

if(y==0 && x==0) {

33 | ```
/* no filter needed */
``` |
||

H264_CHROMA_MC8_MV0(dst, src, stride, h);
||

35 | ```
return;
``` |
||

}
||

37 | |||

assert(x<8 && y<8 && x>=0 && y>=0);

39 | |||

if(y==0)

{
||

42 | ```
/* horizontal filter only */
``` |
||

asm volatile("movd %0, %%mm5\n\t"
||

44 | ```
"punpcklwd %%mm5, %%mm5\n\t"
``` |
||

"punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */
||

46 | ```
"movq %1, %%mm4\n\t"
``` |
||

47 | ```
"pxor %%mm7, %%mm7\n\t"
``` |
||

"psubw %%mm5, %%mm4\n\t" /* mm4 = A = 8-x */
||

: : "rm" (x), "m" (ff_pw_8));
||

50 | |||

for(i=0; i<h; i++) {
||

asm volatile(
||

53 | ```
/* mm0 = src[0..7], mm1 = src[1..8] */
``` |
||

54 | ```
"movq %0, %%mm0\n\t"
``` |
||

55 | ```
"movq %1, %%mm1\n\t"
``` |
||

: : "m" (src[0]), "m" (src[1]));
||

57 | |||

asm volatile(
||

59 | ```
/* [mm2,mm3] = A * src[0..7] */
``` |
||

60 | ```
"movq %%mm0, %%mm2\n\t"
``` |
||

61 | ```
"punpcklbw %%mm7, %%mm2\n\t"
``` |
||

62 | ```
"pmullw %%mm4, %%mm2\n\t"
``` |
||

63 | ```
"movq %%mm0, %%mm3\n\t"
``` |
||

64 | ```
"punpckhbw %%mm7, %%mm3\n\t"
``` |
||

65 | ```
"pmullw %%mm4, %%mm3\n\t"
``` |
||

66 | |||

67 | ```
/* [mm2,mm3] += B * src[1..8] */
``` |
||

68 | ```
"movq %%mm1, %%mm0\n\t"
``` |
||

69 | ```
"punpcklbw %%mm7, %%mm0\n\t"
``` |
||

70 | ```
"pmullw %%mm5, %%mm0\n\t"
``` |
||

71 | ```
"punpckhbw %%mm7, %%mm1\n\t"
``` |
||

72 | ```
"pmullw %%mm5, %%mm1\n\t"
``` |
||

73 | ```
"paddw %%mm0, %%mm2\n\t"
``` |
||

74 | ```
"paddw %%mm1, %%mm3\n\t"
``` |
||

75 | |||

76 | ```
/* dst[0..7] = pack(([mm2,mm3] + 32) >> 6) */
``` |
||

77 | ```
"paddw %1, %%mm2\n\t"
``` |
||

78 | ```
"paddw %1, %%mm3\n\t"
``` |
||

79 | ```
"psrlw $3, %%mm2\n\t"
``` |
||

80 | ```
"psrlw $3, %%mm3\n\t"
``` |
||

81 | ```
"packuswb %%mm3, %%mm2\n\t"
``` |
||

82 | ```
H264_CHROMA_OP(%0, %%mm2)
``` |
||

83 | ```
"movq %%mm2, %0\n\t"
``` |
||

: "=m" (dst[0]) : "m" (ff_pw_4));
||

85 | |||

src += stride;
||

dst += stride;
||

}
||

89 | ```
return;
``` |
||

}
||

91 | |||

if(x==0)
||

{
||

94 | ```
/* vertical filter only */
``` |
||

asm volatile("movd %0, %%mm6\n\t"
||

96 | ```
"punpcklwd %%mm6, %%mm6\n\t"
``` |
||

"punpckldq %%mm6, %%mm6\n\t" /* mm6 = C = y */
||

98 | ```
"movq %1, %%mm4\n\t"
``` |
||

99 | ```
"pxor %%mm7, %%mm7\n\t"
``` |
||

"psubw %%mm6, %%mm4\n\t" /* mm4 = A = 8-y */
||

: : "rm" (y), "m" (ff_pw_8));
||

102 | |||

asm volatile(
||

104 | ```
/* mm0 = src[0..7] */
``` |
||

105 | ```
"movq %0, %%mm0\n\t"
``` |
||

: : "m" (src[0]));
||

107 | |||

for(i=0; i<h; i++) {
||

asm volatile(
||

110 | ```
/* [mm2,mm3] = A * src[0..7] */
``` |
||

111 | ```
"movq %mm0, %mm2\n\t"
``` |
||

112 | ```
"punpcklbw %mm7, %mm2\n\t"
``` |
||

113 | ```
"pmullw %mm4, %mm2\n\t"
``` |
||

114 | ```
"movq %mm0, %mm3\n\t"
``` |
||

115 | ```
"punpckhbw %mm7, %mm3\n\t"
``` |
||

116 | ```
"pmullw %mm4, %mm3\n\t");
``` |
||

117 | |||

src += stride;
||

asm volatile(
||

120 | ```
/* mm0 = src[0..7] */
``` |
||

121 | ```
"movq %0, %%mm0\n\t"
``` |
||

: : "m" (src[0]));
||

123 | |||

asm volatile(
||

125 | ```
/* [mm2,mm3] += C * src[0..7] */
``` |
||

126 | ```
"movq %mm0, %mm1\n\t"
``` |
||

127 | ```
"punpcklbw %mm7, %mm1\n\t"
``` |
||

128 | ```
"pmullw %mm6, %mm1\n\t"
``` |
||

129 | ```
"paddw %mm1, %mm2\n\t"
``` |
||

130 | ```
"movq %mm0, %mm5\n\t"
``` |
||

131 | ```
"punpckhbw %mm7, %mm5\n\t"
``` |
||

132 | ```
"pmullw %mm6, %mm5\n\t"
``` |
||

133 | ```
"paddw %mm5, %mm3\n\t");
``` |
||

134 | |||

asm volatile(
||

136 | ```
/* dst[0..7] = pack(([mm2,mm3] + 32) >> 6) */
``` |
||

137 | ```
"paddw %1, %%mm2\n\t"
``` |
||

138 | ```
"paddw %1, %%mm3\n\t"
``` |
||

139 | ```
"psrlw $3, %%mm2\n\t"
``` |
||

140 | ```
"psrlw $3, %%mm3\n\t"
``` |
||

141 | ```
"packuswb %%mm3, %%mm2\n\t"
``` |
||

142 | ```
H264_CHROMA_OP(%0, %%mm2)
``` |
||

143 | ```
"movq %%mm2, %0\n\t"
``` |
||

: "=m" (dst[0]) : "m" (ff_pw_4));
||

145 | |||

dst += stride;
||

}
||

148 | ```
return;
``` |
||

}
||

150 | |||

151 | ```
/* general case, bilinear */
``` |
||

asm volatile("movd %2, %%mm4\n\t"
||

153 | ```
"movd %3, %%mm6\n\t"
``` |
||

154 | 3072f0cb | Zoltán Hidvégi | ```
"punpcklwd %%mm4, %%mm4\n\t"
``` |

155 | ```
"punpcklwd %%mm6, %%mm6\n\t"
``` |
||

"punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */
||

"punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */
||

158 | ```
"movq %%mm4, %%mm5\n\t"
``` |
||

"pmullw %%mm6, %%mm4\n\t" /* mm4 = x * y */
||

160 | ```
"psllw $3, %%mm5\n\t"
``` |
||

161 | ```
"psllw $3, %%mm6\n\t"
``` |
||

162 | ```
"movq %%mm5, %%mm7\n\t"
``` |
||

163 | ```
"paddw %%mm6, %%mm7\n\t"
``` |
||

"movq %%mm4, %1\n\t" /* DD = x * y */

"psubw %%mm4, %%mm5\n\t" /* mm5 = B = 8x - xy */

"psubw %%mm4, %%mm6\n\t" /* mm6 = C = 8y - xy */
||

167 | a6624e21 | Loren Merritt | ```
"paddw %4, %%mm4\n\t"
``` |

"psubw %%mm7, %%mm4\n\t" /* mm4 = A = xy - (8x+8y) + 64 */

169 | ```
"pxor %%mm7, %%mm7\n\t"
``` |
||

170 | a6624e21 | Loren Merritt | ```
"movq %%mm4, %0\n\t"
``` |

171 | : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64)); |
||

172 | 3072f0cb | Zoltán Hidvégi | |

173 | asm volatile( |
||

174 | ```
/* mm0 = src[0..7], mm1 = src[1..8] */
``` |
||

175 | a6624e21 | Loren Merritt | ```
"movq %0, %%mm0\n\t"
``` |

176 | ```
"movq %1, %%mm1\n\t"
``` |
||

177 | : : "m" (src[0]), "m" (src[1])); |
||

178 | 3072f0cb | Zoltán Hidvégi | |

179 | for(i=0; i<h; i++) { |
||

180 | asm volatile( |
||

181 | ```
/* [mm2,mm3] = A * src[0..7] */
``` |
||

182 | ```
"movq %%mm0, %%mm2\n\t"
``` |
||

183 | ```
"punpcklbw %%mm7, %%mm2\n\t"
``` |
||

184 | ```
"pmullw %0, %%mm2\n\t"
``` |
||

185 | ```
"movq %%mm0, %%mm3\n\t"
``` |
||

186 | ```
"punpckhbw %%mm7, %%mm3\n\t"
``` |
||

187 | ```
"pmullw %0, %%mm3\n\t"
``` |
||

188 | |||

189 | ```
/* [mm2,mm3] += B * src[1..8] */
``` |
||

190 | ```
"movq %%mm1, %%mm0\n\t"
``` |
||

191 | ```
"punpcklbw %%mm7, %%mm0\n\t"
``` |
||

192 | ```
"pmullw %%mm5, %%mm0\n\t"
``` |
||

193 | ```
"punpckhbw %%mm7, %%mm1\n\t"
``` |
||

194 | ```
"pmullw %%mm5, %%mm1\n\t"
``` |
||

195 | ```
"paddw %%mm0, %%mm2\n\t"
``` |
||

196 | ```
"paddw %%mm1, %%mm3\n\t"
``` |
||

197 | ```
: : "m" (AA));
``` |
||

198 | |||

199 | src += stride; |
||

200 | asm volatile( |
||

201 | ```
/* mm0 = src[0..7], mm1 = src[1..8] */
``` |
||

202 | a6624e21 | Loren Merritt | ```
"movq %0, %%mm0\n\t"
``` |

203 | ```
"movq %1, %%mm1\n\t"
``` |
||

204 | : : "m" (src[0]), "m" (src[1])); |
||

205 | 3072f0cb | Zoltán Hidvégi | |

206 | asm volatile( |
||

207 | ```
/* [mm2,mm3] += C * src[0..7] */
``` |
||

208 | ```
"movq %mm0, %mm4\n\t"
``` |
||

209 | ```
"punpcklbw %mm7, %mm4\n\t"
``` |
||

210 | ```
"pmullw %mm6, %mm4\n\t"
``` |
||

211 | ```
"paddw %mm4, %mm2\n\t"
``` |
||

212 | ```
"movq %mm0, %mm4\n\t"
``` |
||

213 | ```
"punpckhbw %mm7, %mm4\n\t"
``` |
||

214 | ```
"pmullw %mm6, %mm4\n\t"
``` |
||

215 | ```
"paddw %mm4, %mm3\n\t");
``` |
||

216 | |||

217 | asm volatile( |
||

218 | ```
/* [mm2,mm3] += D * src[1..8] */
``` |
||

219 | ```
"movq %%mm1, %%mm4\n\t"
``` |
||

220 | ```
"punpcklbw %%mm7, %%mm4\n\t"
``` |
||

221 | ```
"pmullw %0, %%mm4\n\t"
``` |
||

222 | ```
"paddw %%mm4, %%mm2\n\t"
``` |
||

223 | ```
"movq %%mm1, %%mm4\n\t"
``` |
||

224 | ```
"punpckhbw %%mm7, %%mm4\n\t"
``` |
||

225 | ```
"pmullw %0, %%mm4\n\t"
``` |
||

226 | ```
"paddw %%mm4, %%mm3\n\t"
``` |
||

227 | ```
: : "m" (DD));
``` |
||

228 | |||

229 | asm volatile( |
||

230 | ```
/* dst[0..7] = pack(([mm2,mm3] + 32) >> 6) */
``` |
||

231 | ```
"paddw %1, %%mm2\n\t"
``` |
||

232 | ```
"paddw %1, %%mm3\n\t"
``` |
||

233 | ```
"psrlw $6, %%mm2\n\t"
``` |
||

234 | ```
"psrlw $6, %%mm3\n\t"
``` |
||

235 | ```
"packuswb %%mm3, %%mm2\n\t"
``` |
||

236 | ```
H264_CHROMA_OP(%0, %%mm2)
``` |
||

237 | ```
"movq %%mm2, %0\n\t"
``` |
||

238 | d2bb7db1 | Loren Merritt | : "=m" (dst[0]) : "m" (ff_pw_32)); |

239 | 3072f0cb | Zoltán Hidvégi | dst+= stride; |

240 | } |
||

241 | } |
||

242 | a6624e21 | Loren Merritt | |

243 | static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) |
||

244 | { |
||

245 | uint64_t AA __align8; |
||

246 | uint64_t DD __align8; |
||

247 | ```
int i;
``` |
||

248 | |||

249 | ```
/* no special case for mv=(0,0) in 4x*, since it's much less common than in 8x*.
``` |
||

250 | ```
* could still save a few cycles, but maybe not worth the complexity. */
``` |
||

251 | |||

252 | assert(x<8 && y<8 && x>=0 && y>=0); |
||

253 | |||

254 | asm volatile("movd %2, %%mm4\n\t" |
||

255 | ```
"movd %3, %%mm6\n\t"
``` |
||

256 | ```
"punpcklwd %%mm4, %%mm4\n\t"
``` |
||

257 | ```
"punpcklwd %%mm6, %%mm6\n\t"
``` |
||

258 | "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */ |
||

259 | "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */ |
||

260 | ```
"movq %%mm4, %%mm5\n\t"
``` |
||

261 | "pmullw %%mm6, %%mm4\n\t" /* mm4 = x * y */ |
||

262 | ```
"psllw $3, %%mm5\n\t"
``` |
||

263 | ```
"psllw $3, %%mm6\n\t"
``` |
||

264 | ```
"movq %%mm5, %%mm7\n\t"
``` |
||

265 | ```
"paddw %%mm6, %%mm7\n\t"
``` |
||

266 | "movq %%mm4, %1\n\t" /* DD = x * y */ |
||

267 | "psubw %%mm4, %%mm5\n\t" /* mm5 = B = 8x - xy */ |
||

268 | "psubw %%mm4, %%mm6\n\t" /* mm6 = C = 8y - xy */ |
||

269 | ```
"paddw %4, %%mm4\n\t"
``` |
||

270 | "psubw %%mm7, %%mm4\n\t" /* mm4 = A = xy - (8x+8y) + 64 */ |
||

271 | ```
"pxor %%mm7, %%mm7\n\t"
``` |
||

272 | ```
"movq %%mm4, %0\n\t"
``` |
||

273 | : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64)); |
||

274 | |||

275 | asm volatile( |
||

276 | ```
/* mm0 = src[0..3], mm1 = src[1..4] */
``` |
||

277 | ```
"movd %0, %%mm0\n\t"
``` |
||

278 | ```
"movd %1, %%mm1\n\t"
``` |
||

279 | ```
"punpcklbw %%mm7, %%mm0\n\t"
``` |
||

280 | ```
"punpcklbw %%mm7, %%mm1\n\t"
``` |
||

281 | : : "m" (src[0]), "m" (src[1])); |
||

282 | |||

283 | for(i=0; i<h; i++) { |
||

284 | asm volatile( |
||

285 | ```
/* mm2 = A * src[0..3] + B * src[1..4] */
``` |
||

286 | ```
"movq %%mm0, %%mm2\n\t"
``` |
||

287 | ```
"pmullw %0, %%mm2\n\t"
``` |
||

288 | ```
"pmullw %%mm5, %%mm1\n\t"
``` |
||

289 | ```
"paddw %%mm1, %%mm2\n\t"
``` |
||

290 | ```
: : "m" (AA));
``` |
||

291 | |||

292 | src += stride; |
||

293 | asm volatile( |
||

294 | ```
/* mm0 = src[0..3], mm1 = src[1..4] */
``` |
||

295 | ```
"movd %0, %%mm0\n\t"
``` |
||

296 | ```
"movd %1, %%mm1\n\t"
``` |
||

297 | ```
"punpcklbw %%mm7, %%mm0\n\t"
``` |
||

298 | ```
"punpcklbw %%mm7, %%mm1\n\t"
``` |
||

299 | : : "m" (src[0]), "m" (src[1])); |
||

300 | |||

301 | asm volatile( |
||

302 | ```
/* mm2 += C * src[0..3] + D * src[1..4] */
``` |
||

303 | ```
"movq %%mm0, %%mm3\n\t"
``` |
||

304 | ```
"movq %%mm1, %%mm4\n\t"
``` |
||

305 | ```
"pmullw %%mm6, %%mm3\n\t"
``` |
||

306 | ```
"pmullw %0, %%mm4\n\t"
``` |
||

307 | ```
"paddw %%mm3, %%mm2\n\t"
``` |
||

308 | ```
"paddw %%mm4, %%mm2\n\t"
``` |
||

309 | ```
: : "m" (DD));
``` |
||

310 | |||

311 | asm volatile( |
||

312 | ```
/* dst[0..3] = pack((mm2 + 32) >> 6) */
``` |
||

313 | ```
"paddw %1, %%mm2\n\t"
``` |
||

314 | ```
"psrlw $6, %%mm2\n\t"
``` |
||

315 | ```
"packuswb %%mm7, %%mm2\n\t"
``` |
||

316 | ```
H264_CHROMA_OP4(%0, %%mm2, %%mm3)
``` |
||

317 | ```
"movd %%mm2, %0\n\t"
``` |
||

318 | : "=m" (dst[0]) : "m" (ff_pw_32)); |
||

319 | dst += stride; |
||

320 | } |
||

321 | } |