## ffmpeg / libavcodec / x86 / h264_weight.asm @ 98c6053c

History | View | Annotate | Download (8.33 KB)

1 | a33a2562 | Ronald S. Bultje | ;***************************************************************************** |
---|---|---|---|

2 | ;* SSE2-optimized weighted prediction code |
||

3 | ;***************************************************************************** |
||

4 | ;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt |
||

5 | ;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com> |
||

6 | ;* |
||

7 | ;* This file is part of FFmpeg. |
||

8 | ;* |
||

9 | ;* FFmpeg is free software; you can redistribute it and/or |
||

10 | ;* modify it under the terms of the GNU Lesser General Public |
||

11 | ;* License as published by the Free Software Foundation; either |
||

12 | ;* version 2.1 of the License, or (at your option) any later version. |
||

13 | ;* |
||

14 | ;* FFmpeg is distributed in the hope that it will be useful, |
||

15 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||

16 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||

17 | ;* Lesser General Public License for more details. |
||

18 | ;* |
||

19 | ;* You should have received a copy of the GNU Lesser General Public |
||

20 | ;* License along with FFmpeg; if not, write to the Free Software |
||

21 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||

22 | ;****************************************************************************** |
||

23 | |||

24 | %include "x86inc.asm" |
||

25 | |||

26 | SECTION .text |
||

27 | |||

28 | ;----------------------------------------------------------------------------- |
||

29 | ; biweight pred: |
||

30 | ; |
||

31 | ; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride, |
||

32 | ; int log2_denom, int weightd, int weights, |
||

33 | ; int offset); |
||

34 | ; and |
||

35 | ; void h264_weight_16x16_sse2(uint8_t *dst, int stride, |
||

36 | ; int log2_denom, int weight, |
||

37 | ; int offset); |
||

38 | ;----------------------------------------------------------------------------- |
||

39 | |||

;-----------------------------------------------------------------------------
; WEIGHT_SETUP
;
; Loads the per-call constants for the unidirectional weight functions.
; Register roles follow the h264_weight_* prototype in the file header
; (r2 = log2_denom, r3 = weight, r4 = offset).
;
; On exit:
;   m3 = low 16 bits of r3d replicated into every word lane
;   m5 = low 16 bits of (((2*r4 + 1) << r2) >> 1), replicated into every word
;   m6 = r2d, used as the shift count by WEIGHT_OP
;   m7 = 0, used to zero-extend bytes to words
; Clobbers: r4 (becomes 2*r4 + 1), flags.
;-----------------------------------------------------------------------------
%macro WEIGHT_SETUP 0
    pxor       m7, m7                  ; zero register for byte->word unpacking
    movd       m6, r2d                 ; shift count
    movd       m3, r3d                 ; multiplier
    add        r4, r4
    inc        r4                      ; r4 = 2*r4 + 1
    movd       m5, r4d
    pslld      m5, m6
    psrld      m5, 1                   ; m5 = ((2*r4 + 1) << r2) >> 1
%if mmsize == 16
    ; XMM: replicate word 0 over the low qword, then copy it to the high qword
    pshuflw    m3, m3, 0
    punpcklqdq m3, m3
    pshuflw    m5, m5, 0
    punpcklqdq m5, m5
%else
    ; MMX: a single pshufw replicates word 0 into all four lanes
    pshufw     m3, m3, 0
    pshufw     m5, m5, 0
%endif
%endmacro
||

59 | |||

;-----------------------------------------------------------------------------
; WEIGHT_OP off_a, off_b
;
; Weights two half-register groups of pixels read from [r0+off_a] and
; [r0+off_b] and packs both results back to bytes in m0 (group A in the low
; half, group B in the high half).
;
; Per pixel: saturate_u8((pixel * m3 + m5) >> m6), where the addition uses
; signed 16-bit saturation and the shift is arithmetic.
;
; Expects m3/m5/m6/m7 as prepared by WEIGHT_SETUP.  Clobbers m0, m1.
;-----------------------------------------------------------------------------
%macro WEIGHT_OP 2
    ; group A -> m0
    movh       m0, [r0+%1]
    punpcklbw  m0, m7                  ; bytes -> words (m7 is zero)
    pmullw     m0, m3
    paddsw     m0, m5
    psraw      m0, m6

    ; group B -> m1
    movh       m1, [r0+%2]
    punpcklbw  m1, m7
    pmullw     m1, m3
    paddsw     m1, m5
    psraw      m1, m6

    packuswb   m0, m1                  ; clamp to 0..255 and merge A|B
%endmacro
||

73 | |||

;-----------------------------------------------------------------------------
; WEIGHT_FUNC_DBL_MM height
;
; Emits h264_weight_16x<height>_mmx2.  Arguments follow the h264_weight_*
; prototype in the file header: r0 = dst, r1 = stride, r2 = log2_denom,
; r3 = weight, r4 = offset.
;
; Each 16-pixel row is processed as two 8-byte MMX stores.  Only the
; height == 16 instance contains the row loop; other heights run the same
; setup, load their own row count into r2 and tail-jump into that loop.
;-----------------------------------------------------------------------------
%macro WEIGHT_FUNC_DBL_MM 1
cglobal h264_weight_16x%1_mmx2, 5, 5, 0
    WEIGHT_SETUP
    mov        r2, %1                  ; r2 = rows remaining (log2_denom already copied to m6)
%if %1 == 16
.nextrow:                              ; trailing colon avoids NASM's orphan-label warning
    WEIGHT_OP  0,  4                   ; pixels 0..7
    mova       [r0  ], m0
    WEIGHT_OP  8, 12                   ; pixels 8..15
    mova       [r0+8], m0
    add        r0, r1                  ; next row
    dec        r2
    jnz        .nextrow
    REP_RET
%else
    jmp mangle(ff_h264_weight_16x16_mmx2.nextrow)
%endif
%endmacro

INIT_MMX
WEIGHT_FUNC_DBL_MM 16
WEIGHT_FUNC_DBL_MM  8
||

96 | |||

;-----------------------------------------------------------------------------
; WEIGHT_FUNC_MM width, height, xmm_regs, suffix
;
; Emits h264_weight_<width>x<height>_<suffix> for the case where one row is
; exactly one SIMD register wide (8 pixels for MMX, 16 for SSE2).
; Arguments follow the h264_weight_* prototype in the file header:
; r0 = dst, r1 = stride, r2 = log2_denom, r3 = weight, r4 = offset.
;
; The function takes five arguments and uses only r0-r4, so cglobal is
; declared with 5 args / 5 regs, matching the other h264_weight_* macros in
; this file.  All instances of this macro share one prologue shape, so the
; tail-jump into the 16-row loop still meets a matching epilogue.
;
; Only the height == 16 instance contains the row loop; other heights load
; their own row count and tail-jump into it.
;-----------------------------------------------------------------------------
%macro WEIGHT_FUNC_MM 4
cglobal h264_weight_%1x%2_%4, 5, 5, %3
    WEIGHT_SETUP
    mov        r2, %2                  ; r2 = rows remaining
%if %2 == 16
.nextrow:                              ; trailing colon avoids NASM's orphan-label warning
    WEIGHT_OP  0, mmsize/2             ; low and high half of the row
    mova       [r0], m0
    add        r0, r1                  ; next row
    dec        r2
    jnz        .nextrow
    REP_RET
%else
    jmp mangle(ff_h264_weight_%1x16_%4.nextrow)
%endif
%endmacro

INIT_MMX
WEIGHT_FUNC_MM  8, 16,  0, mmx2
WEIGHT_FUNC_MM  8,  8,  0, mmx2
WEIGHT_FUNC_MM  8,  4,  0, mmx2
INIT_XMM
WEIGHT_FUNC_MM 16, 16,  8, sse2
WEIGHT_FUNC_MM 16,  8,  8, sse2
||

121 | |||

;-----------------------------------------------------------------------------
; WEIGHT_FUNC_HALF_MM width, height, loop_height, xmm_regs, suffix
;
; Emits h264_weight_<width>x<height>_<suffix> for the case where one row is
; half a SIMD register wide (4 pixels for MMX, 8 for SSE2), so two rows are
; weighted per iteration.  Arguments follow the h264_weight_* prototype in
; the file header: r0 = dst, r1 = stride, r2 = log2_denom, r3 = weight,
; r4 = offset.
;
; The row loop is emitted only by the instance whose height equals mmsize;
; every other height tail-jumps into the loop of the
; <width>x<loop_height> instance with its own iteration count in r2.
;-----------------------------------------------------------------------------
%macro WEIGHT_FUNC_HALF_MM 5
cglobal h264_weight_%1x%2_%5, 5, 5, %4
    WEIGHT_SETUP
    mov        r2, %2/2                ; r2 = row pairs remaining
    lea        r3, [r1*2]              ; r3 = 2*stride (weight already copied to m3)
%if %2 == mmsize
.nextrow:                              ; trailing colon avoids NASM's orphan-label warning
    WEIGHT_OP  0, r1                   ; low half = row at r0, high half = row at r0+stride
    movh       [r0], m0
%if mmsize == 16
    movhps     [r0+r1], m0             ; store the high 8 bytes directly
%else
    psrlq      m0, 32                  ; bring the high 4 bytes down for movh
    movh       [r0+r1], m0
%endif
    add        r0, r3                  ; advance two rows
    dec        r2
    jnz        .nextrow
    REP_RET
%else
    jmp mangle(ff_h264_weight_%1x%3_%5.nextrow)
%endif
%endmacro

INIT_MMX
WEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
WEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
WEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
INIT_XMM
WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
WEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
WEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
||

154 | |||

;-----------------------------------------------------------------------------
; BIWEIGHT_SETUP
;
; Loads the per-call constants for the MMX2/SSE2 bidirectional weight
; functions.  Register roles follow the h264_biweight_* prototype in the file
; header (r3 = log2_denom, r4 = weightd, r5 = weights, r6 = offset).
;
; On exit:
;   m3 = low 16 bits of r4d replicated into every word lane (dst weight)
;   m4 = low 16 bits of r5d replicated into every word lane (src weight)
;   m5 = low 16 bits of ((((r6 + 1) | 1) << (r3 + 1)) >> 1), replicated
;   m6 = r3 + 1, used as the shift count by BIWEIGHT_STEPB
;   m7 = 0, used to zero-extend bytes to words
; Clobbers: r3 (incremented), r6 (becomes (r6 + 1) | 1), flags.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SETUP 0
    pxor       m7, m7                  ; zero register for byte->word unpacking
    movd       m3, r4d                 ; dst weight
    movd       m4, r5d                 ; src weight
    add        r6, 1
    or         r6, 1                   ; r6 = (offset + 1) | 1
    movd       m5, r6d
    add        r3, 1                   ; r3 = log2_denom + 1
    movd       m6, r3d
    pslld      m5, m6
    psrld      m5, 1                   ; m5 = (r6 << r3) >> 1
%if mmsize == 16
    ; XMM: replicate word 0 over the low qword, then copy it to the high qword
    pshuflw    m3, m3, 0
    punpcklqdq m3, m3
    pshuflw    m4, m4, 0
    punpcklqdq m4, m4
    pshuflw    m5, m5, 0
    punpcklqdq m5, m5
%else
    ; MMX: a single pshufw replicates word 0 into all four lanes
    pshufw     m3, m3, 0
    pshufw     m4, m4, 0
    pshufw     m5, m5, 0
%endif
%endmacro
||

179 | |||

;-----------------------------------------------------------------------------
; BIWEIGHT_STEPA acc, tmp, offset
;
; Computes the weighted sum of one half-register group of pixels:
;   m<acc> = [r0+offset] * m3  +  [r1+offset] * m4
; with bytes widened to words and the addition done with signed saturation.
; <acc> and <tmp> are register numbers; m<tmp> is clobbered.
; Expects m3/m4/m7 as prepared by BIWEIGHT_SETUP.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_STEPA 3
    ; pixels from r0, scaled by m3
    movh       m%1, [r0+%3]
    punpcklbw  m%1, m7
    pmullw     m%1, m3

    ; pixels from r1, scaled by m4
    movh       m%2, [r1+%3]
    punpcklbw  m%2, m7
    pmullw     m%2, m4

    paddsw     m%1, m%2
%endmacro
||

189 | |||

;-----------------------------------------------------------------------------
; BIWEIGHT_STEPB
;
; Finishes two weighted sums held as words in m0 and m1: adds m5 to each with
; signed saturation, shifts right arithmetically by m6, then packs both back
; to bytes in m0 with unsigned saturation (m0 low half, m1 high half).
;-----------------------------------------------------------------------------
%macro BIWEIGHT_STEPB 0
    paddsw     m0, m5
    psraw      m0, m6

    paddsw     m1, m5
    psraw      m1, m6

    packuswb   m0, m1                  ; clamp to 0..255 and merge
%endmacro
||

197 | |||

;-----------------------------------------------------------------------------
; BIWEIGHT_FUNC_DBL_MM height
;
; Emits h264_biweight_16x<height>_mmx2.  Arguments follow the h264_biweight_*
; prototype in the file header: r0 = dst, r1 = src, r2 = stride,
; r3 = log2_denom, r4 = weightd, r5 = weights, r6 = offset.
;
; Each 16-pixel row is processed as two 8-byte MMX stores, each built from
; two 4-pixel BIWEIGHT_STEPA sums.  Only the height == 16 instance contains
; the row loop; other heights run the same setup, load their own row count
; into r3 and tail-jump into that loop.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_FUNC_DBL_MM 1
cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
    BIWEIGHT_SETUP
    mov        r3, %1                  ; r3 = rows remaining (shift count already in m6)
%if %1 == 16
.nextrow:                              ; trailing colon avoids NASM's orphan-label warning
    BIWEIGHT_STEPA 0, 1, 0             ; pixels 0..3  -> m0
    BIWEIGHT_STEPA 1, 2, 4             ; pixels 4..7  -> m1
    BIWEIGHT_STEPB
    mova       [r0], m0
    BIWEIGHT_STEPA 0, 1, 8             ; pixels 8..11  -> m0
    BIWEIGHT_STEPA 1, 2, 12            ; pixels 12..15 -> m1
    BIWEIGHT_STEPB
    mova       [r0+8], m0
    add        r0, r2                  ; next row in both buffers
    add        r1, r2
    dec        r3
    jnz        .nextrow
    REP_RET
%else
    jmp mangle(ff_h264_biweight_16x16_mmx2.nextrow)
%endif
%endmacro

INIT_MMX
BIWEIGHT_FUNC_DBL_MM 16
BIWEIGHT_FUNC_DBL_MM  8
||

225 | |||

;-----------------------------------------------------------------------------
; BIWEIGHT_FUNC_MM width, height, xmm_regs, suffix
;
; Emits h264_biweight_<width>x<height>_<suffix> for the case where one row is
; exactly one SIMD register wide (8 pixels for MMX, 16 for SSE2).
; Arguments follow the h264_biweight_* prototype in the file header:
; r0 = dst, r1 = src, r2 = stride, r3 = log2_denom, r4 = weightd,
; r5 = weights, r6 = offset.
;
; Only the height == 16 instance contains the row loop; other heights load
; their own row count into r3 and tail-jump into it.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_FUNC_MM 4
cglobal h264_biweight_%1x%2_%4, 7, 7, %3
    BIWEIGHT_SETUP
    mov        r3, %2                  ; r3 = rows remaining
%if %2 == 16
.nextrow:                              ; trailing colon avoids NASM's orphan-label warning
    BIWEIGHT_STEPA 0, 1, 0             ; low half of the row  -> m0
    BIWEIGHT_STEPA 1, 2, mmsize/2      ; high half of the row -> m1
    BIWEIGHT_STEPB
    mova       [r0], m0
    add        r0, r2                  ; next row in both buffers
    add        r1, r2
    dec        r3
    jnz        .nextrow
    REP_RET
%else
    jmp mangle(ff_h264_biweight_%1x16_%4.nextrow)
%endif
%endmacro

INIT_MMX
BIWEIGHT_FUNC_MM  8, 16,  0, mmx2
BIWEIGHT_FUNC_MM  8,  8,  0, mmx2
BIWEIGHT_FUNC_MM  8,  4,  0, mmx2
INIT_XMM
BIWEIGHT_FUNC_MM 16, 16,  8, sse2
BIWEIGHT_FUNC_MM 16,  8,  8, sse2
||

253 | |||

;-----------------------------------------------------------------------------
; BIWEIGHT_FUNC_HALF_MM width, height, loop_height, xmm_regs, suffix
;
; Emits h264_biweight_<width>x<height>_<suffix> for the case where one row is
; half a SIMD register wide (4 pixels for MMX, 8 for SSE2), so two rows are
; weighted per iteration.  Arguments follow the h264_biweight_* prototype in
; the file header: r0 = dst, r1 = src, r2 = stride, r3 = log2_denom,
; r4 = weightd, r5 = weights, r6 = offset.
;
; The row loop is emitted only by the instance whose height equals mmsize;
; every other height tail-jumps into the loop of the
; <width>x<loop_height> instance with its own iteration count in r3.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_FUNC_HALF_MM 5
cglobal h264_biweight_%1x%2_%5, 7, 7, %4
    BIWEIGHT_SETUP
    mov        r3, %2/2                ; r3 = row pairs remaining
    lea        r4, [r2*2]              ; r4 = 2*stride (weightd already copied to m3)
%if %2 == mmsize
.nextrow:                              ; trailing colon avoids NASM's orphan-label warning
    BIWEIGHT_STEPA 0, 1, 0             ; row at +0      -> m0
    BIWEIGHT_STEPA 1, 2, r2            ; row at +stride -> m1
    BIWEIGHT_STEPB
    movh       [r0], m0
%if mmsize == 16
    movhps     [r0+r2], m0             ; store the high 8 bytes directly
%else
    psrlq      m0, 32                  ; bring the high 4 bytes down for movh
    movh       [r0+r2], m0
%endif
    add        r0, r4                  ; advance two rows in both buffers
    add        r1, r4
    dec        r3
    jnz        .nextrow
    REP_RET
%else
    jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow)
%endif
%endmacro

INIT_MMX
BIWEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
BIWEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
BIWEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
INIT_XMM
BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
BIWEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
BIWEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
||

289 | |||

;-----------------------------------------------------------------------------
; BIWEIGHT_SSSE3_SETUP
;
; Loads the per-call constants for the SSSE3 bidirectional weight functions.
; Register roles follow the h264_biweight_* prototype in the file header
; (r3 = log2_denom, r4 = weightd, r5 = weights, r6 = offset).
;
; On exit:
;   m4 = the byte pair (low byte of r4d, low byte of r5d) replicated into
;        every word lane, i.e. the coefficient operand for pmaddubsw
;   m5 = low 16 bits of ((((r6 + 1) | 1) << (r3 + 1)) >> 1), replicated
;   m6 = r3 + 1, used as the shift count by BIWEIGHT_SSSE3_OP
; Clobbers: m0, r3 (incremented), r6 (becomes (r6 + 1) | 1), flags.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSSE3_SETUP 0
    ; interleaved weight bytes
    movd       m4, r4d
    movd       m0, r5d
    punpcklbw  m4, m0                  ; word 0 = weightd byte | weights byte << 8
    pshuflw    m4, m4, 0
    punpcklqdq m4, m4

    ; rounding/offset term and shift count
    add        r6, 1
    or         r6, 1                   ; r6 = (offset + 1) | 1
    movd       m5, r6d
    add        r3, 1                   ; r3 = log2_denom + 1
    movd       m6, r3d
    pslld      m5, m6
    psrld      m5, 1                   ; m5 = (r6 << r3) >> 1
    pshuflw    m5, m5, 0
    punpcklqdq m5, m5
%endmacro
||

306 | |||

;-----------------------------------------------------------------------------
; BIWEIGHT_SSSE3_OP
;
; Weights two registers of interleaved byte pairs (m0 and m2).  pmaddubsw
; multiplies each unsigned byte of the data by the corresponding signed byte
; of m4 and sums each adjacent pair into a word.  m5 is then added with
; signed saturation, the result is shifted right arithmetically by m6, and
; both registers are packed to bytes in m0 with unsigned saturation
; (m0 low half, m2 high half).
; Expects m4/m5/m6 as prepared by BIWEIGHT_SSSE3_SETUP.  Clobbers m2.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSSE3_OP 0
    pmaddubsw  m0, m4
    paddsw     m0, m5
    psraw      m0, m6

    pmaddubsw  m2, m4
    paddsw     m2, m5
    psraw      m2, m6

    packuswb   m0, m2                  ; clamp to 0..255 and merge
%endmacro
||

316 | |||

;-----------------------------------------------------------------------------
; BIWEIGHT_SSSE3_16 height
;
; Emits h264_biweight_16x<height>_ssse3.  Arguments follow the
; h264_biweight_* prototype in the file header: r0 = dst, r1 = src,
; r2 = stride, r3 = log2_denom, r4 = weightd, r5 = weights, r6 = offset.
;
; Each iteration interleaves one 16-pixel row of dst and src byte-wise and
; hands the two resulting registers to BIWEIGHT_SSSE3_OP.  Only the
; height == 16 instance contains the row loop; other heights load their own
; row count into r3 and tail-jump into it.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSSE3_16 1
cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
    BIWEIGHT_SSSE3_SETUP
    mov        r3, %1                  ; r3 = rows remaining (shift count already in m6)

%if %1 == 16
.nextrow:                              ; trailing colon avoids NASM's orphan-label warning
    movh       m0, [r0]                ; dst pixels 0..7
    movh       m2, [r0+8]              ; dst pixels 8..15
    movh       m3, [r1+8]              ; src pixels 8..15
    punpcklbw  m0, [r1]                ; dst/src pairs 0..7 (full-width memory operand on src)
    punpcklbw  m2, m3                  ; dst/src pairs 8..15
    BIWEIGHT_SSSE3_OP
    mova       [r0], m0
    add        r0, r2                  ; next row in both buffers
    add        r1, r2
    dec        r3
    jnz        .nextrow
    REP_RET
%else
    jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow)
%endif
%endmacro

INIT_XMM
BIWEIGHT_SSSE3_16 16
BIWEIGHT_SSSE3_16  8
||

344 | |||

;-----------------------------------------------------------------------------
; BIWEIGHT_SSSE3_8 height
;
; Emits h264_biweight_8x<height>_ssse3.  Arguments follow the
; h264_biweight_* prototype in the file header: r0 = dst, r1 = src,
; r2 = stride, r3 = log2_denom, r4 = weightd, r5 = weights, r6 = offset.
;
; Two 8-pixel rows are weighted per iteration: each row's dst and src bytes
; are interleaved into one register and both are passed to
; BIWEIGHT_SSSE3_OP.  Only the height == 16 instance contains the loop; other
; heights load their own iteration count into r3 and tail-jump into it.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSSE3_8 1
cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
    BIWEIGHT_SSSE3_SETUP
    mov        r3, %1/2                ; r3 = row pairs remaining
    lea        r4, [r2*2]              ; r4 = 2*stride (weightd already copied to m4)

%if %1 == 16
.nextrow:                              ; trailing colon avoids NASM's orphan-label warning
    movh       m0, [r0]                ; dst, first row
    movh       m1, [r1]                ; src, first row
    movh       m2, [r0+r2]             ; dst, second row
    movh       m3, [r1+r2]             ; src, second row
    punpcklbw  m0, m1                  ; dst/src pairs, first row
    punpcklbw  m2, m3                  ; dst/src pairs, second row
    BIWEIGHT_SSSE3_OP
    movh       [r0], m0                ; low 8 bytes  = first row
    movhps     [r0+r2], m0             ; high 8 bytes = second row
    add        r0, r4                  ; advance two rows in both buffers
    add        r1, r4
    dec        r3
    jnz        .nextrow
    REP_RET
%else
    jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow)
%endif
%endmacro

INIT_XMM
BIWEIGHT_SSSE3_8 16
BIWEIGHT_SSSE3_8  8
BIWEIGHT_SSSE3_8  4