;******************************************************************************
;* MMX/SSSE3-optimized functions for H264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;*               2005-2008 Loren Merritt
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

; Emit one table entry per argument. An entry is the argument replicated into
; four consecutive words (8 bytes), i.e. one MMX register worth of data.
%macro RND_ENTRIES 1-*
%rep %0
    times 4 dw %1
%rotate 1
%endrep
%endmacro

; RV40 rounding constants. The code below addresses both tables as
; [table + rnd_bias*8] with rnd_bias = ((my & 6) * 4 + mx) >> 1, which gives
; 16 entries of 8 bytes each per table.

; Rounding term for the 2-D (bilinear) filter, added before the >> 6.
rnd_rv40_2d_tbl:
    RND_ENTRIES  0, 16, 32, 16
    RND_ENTRIES 32, 28, 32, 28
    RND_ENTRIES  0, 32, 16, 32
    RND_ENTRIES 32, 28, 32, 28

; Rounding term for the 1-D filter, added before the >> 3.
rnd_rv40_1d_tbl:
    RND_ENTRIES  0,  2,  4,  2
    RND_ENTRIES  4,  3,  4,  3
    RND_ENTRIES  0,  4,  2,  4
    RND_ENTRIES  4,  3,  4,  3

; Packed-word constants defined outside this file.
cextern pw_3
cextern pw_4
cextern pw_8
cextern pw_28
cextern pw_32
cextern pw_64

SECTION .text

;-----------------------------------------------------------------------------
; mv0_pixels_mc8
; Copy path for mx == 0 && my == 0: moves an 8-byte-wide block from src to
; dst, four rows per loop iteration. Each row is passed through CHROMAMC_AVG
; against the current dst contents before being stored.
;
; In (as set up by the functions that invoke this macro):
;   r0  = dst, r1 = src, r2 = stride, r3d = number of rows
; Clobbers: r4 (holds 2*stride), mm0, mm1, flags; r0/r1/r3d are advanced.
; The loop only terminates when r3d reaches exactly zero, so the row count
; has to be a non-zero multiple of 4.
;-----------------------------------------------------------------------------
%macro mv0_pixels_mc8 0
    lea           r4, [r2*2 ]           ; r4 = distance between row pairs
.next4rows:
    ; rows 0 and 1
    movq         mm0, [r1   ]
    movq         mm1, [r1+r2]
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    movq     [r0   ], mm0
    movq     [r0+r2], mm1
    add           r0, r4
    add           r1, r4
    ; rows 2 and 3
    movq         mm0, [r1   ]
    movq         mm1, [r1+r2]
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    add           r1, r4
    movq     [r0   ], mm0
    movq     [r0+r2], mm1
    add           r0, r4
    sub          r3d, 4
    jne .next4rows
%endmacro

;-----------------------------------------------------------------------------
; chroma_mc8_mmx_func %1, %2, %3
;   %1 = operation prefix used in the symbol name (put / avg)
;   %2 = codec (h264, vc1 or rv40); selects the rounding constants
;        rnd_1d_%2 / rnd_2d_%2, and for rv40 the table-indexed rounding
;   %3 = symbol suffix
;
; Emits %1_%2_chroma_mc8_%3, an 8-pixel-wide chroma interpolation:
;   mx == 0 && my == 0 : plain copy (mv0_pixels_mc8)
;   mx == 0 XOR my == 0: 1-D filter  (A*a + B*b + rnd1d) >> 3
;   otherwise          : 2-D filter  (A*a + B*b + C*c + D*d + rnd2d) >> 6
; Every result row goes through CHROMAMC_AVG against dst before the store.
;
; Register use: r0 dst, r1 src, r2 stride, r3d h, r4d mx, r5d my, r6 scratch.
; rv40 additionally uses r10 (table index) and, under PIC, r11 (table base)
; on x86-64. On x86-32 the index lives in r0, so dst is reloaded from its
; stack slot (r0m) into r5 and addressed through dest_reg.
;-----------------------------------------------------------------------------
%macro chroma_mc8_mmx_func 3
; put/avg_h264_chroma_mc8_mmx_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
;                              int stride, int h, int mx, int my)
cglobal %1_%2_chroma_mc8_%3, 6, 7, 0
%ifdef ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
%ifidn %2, rv40
%ifdef PIC
%define rnd_1d_rv40 r11
%define rnd_2d_rv40 r11
%else ; no-PIC
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
    ; rnd_bias = ((my & 6) * 4 + mx) >> 1, the entry index into the tables
%ifdef ARCH_X86_64
    mov          r10, r5
    and          r10, 6         ; &~1 for mx/my=[0,7]
    lea          r10, [r10*4+r4]
    sar         r10d, 1
%define rnd_bias r10
%define dest_reg r0
%else ; x86-32
    mov           r0, r5
    and           r0, 6         ; &~1 for mx/my=[0,7]
    lea           r0, [r0*4+r4]
    sar          r0d, 1
%define rnd_bias r0
%define dest_reg r5
%endif
%else ; vc1, h264
%define rnd_bias  0
%define dest_reg r0
%endif

    ; mov leaves the flags alone, so the je/jne below still see the
    ; result of the preceding test.
    test         r5d, r5d
    mov           r6, 1
    je .my_is_zero
    test         r4d, r4d
    mov           r6, r2        ; dxy = x ? 1 : stride
    jne .both_non_zero
.my_is_zero:
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    or           r4d, r5d       ; x + y (one of the two is zero here)

%ifidn %2, rv40
%ifdef PIC
    lea          r11, [rnd_rv40_1d_tbl]
%endif
%ifndef ARCH_X86_64
    mov           r5, r0m       ; r0 holds rnd_bias: reload dst into dest_reg
%endif
%endif

    movd          m5, r4d
    movq          m4, [pw_8]
    movq          m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
    punpcklwd     m5, m5
    punpckldq     m5, m5        ; mm5 = B = x
    pxor          m7, m7
    psubw         m4, m5        ; mm4 = A = 8-x

.next1drow:
    movq          m0, [r1   ]   ; mm0 = src[0..7]
    movq          m2, [r1+r6]   ; mm2 = src[dxy..dxy+7], dxy = 1 or stride

    movq          m1, m0
    movq          m3, m2
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    pmullw        m0, m4        ; [mm0,mm1] = A * src[0..7]
    pmullw        m1, m4
    pmullw        m2, m5        ; [mm2,mm3] = B * src[dxy..dxy+7]
    pmullw        m3, m5

    paddw         m0, m6
    paddw         m1, m6
    paddw         m0, m2
    paddw         m1, m3
    psrlw         m0, 3
    psrlw         m1, 3
    packuswb      m0, m1
    CHROMAMC_AVG  m0, [dest_reg]
    movq  [dest_reg], m0        ; dst[0..7] = (A * src[0..7] + B * src[dxy..dxy+7] + (rnd >> 3)) >> 3

    add     dest_reg, r2
    add           r1, r2
    dec          r3d
    jne .next1drow
    REP_RET

.both_non_zero: ; general case, bilinear
    movd          m4, r4d         ; x
    movd          m6, r5d         ; y
%ifidn %2, rv40
%ifdef PIC
    lea          r11, [rnd_rv40_2d_tbl]
%endif
%ifndef ARCH_X86_64
    mov           r5, r0m         ; r0 holds rnd_bias: reload dst into dest_reg
%endif
%endif
    mov           r6, rsp         ; backup stack pointer
    and          rsp, ~(mmsize-1) ; align stack
    sub          rsp, 16          ; AA and DD

    punpcklwd     m4, m4
    punpcklwd     m6, m6
    punpckldq     m4, m4          ; mm4 = x words
    punpckldq     m6, m6          ; mm6 = y words
    movq          m5, m4
    pmullw        m4, m6          ; mm4 = x * y
    psllw         m5, 3
    psllw         m6, 3
    movq          m7, m5
    paddw         m7, m6
    movq     [rsp+8], m4          ; DD = x * y
    psubw         m5, m4          ; mm5 = B = 8x - xy
    psubw         m6, m4          ; mm6 = C = 8y - xy
    paddw         m4, [pw_64]
    psubw         m4, m7          ; mm4 = A = xy - (8x+8y) + 64
    pxor          m7, m7
    movq     [rsp  ], m4          ; AA = A

    movq          m0, [r1  ]      ; mm0 = src[0..7]
    movq          m1, [r1+1]      ; mm1 = src[1..8]
.next2drow:
    ; On entry mm0/mm1 hold the current row; r1 moves on to the next row.
    add           r1, r2

    movq          m2, m0
    movq          m3, m1
    punpckhbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    pmullw        m0, [rsp]
    pmullw        m2, [rsp]
    pmullw        m1, m5
    pmullw        m3, m5
    paddw         m2, m1          ; mm2 = A * src[0..3] + B * src[1..4]
    paddw         m3, m0          ; mm3 = A * src[4..7] + B * src[5..8]

    movq          m0, [r1]
    movq          m1, m0
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    pmullw        m0, m6
    pmullw        m1, m6
    paddw         m2, m0
    paddw         m3, m1          ; [mm2,mm3] += C * src[0..7]

    movq          m1, [r1+1]      ; stays in mm1 as next iteration's src[1..8]
    movq          m0, m1
    movq          m4, m1
    punpcklbw     m0, m7
    punpckhbw     m4, m7
    pmullw        m0, [rsp+8]
    pmullw        m4, [rsp+8]
    paddw         m2, m0
    paddw         m3, m4          ; [mm2,mm3] += D * src[1..8]
    movq          m0, [r1]        ; reload next iteration's src[0..7]

    paddw         m2, [rnd_2d_%2+rnd_bias*8]
    paddw         m3, [rnd_2d_%2+rnd_bias*8]
    psrlw         m2, 6
    psrlw         m3, 6
    packuswb      m2, m3
    CHROMAMC_AVG  m2, [dest_reg]
    movq  [dest_reg], m2          ; dst[0..7] = ([mm2,mm3] + rnd) >> 6

    add     dest_reg, r2
    dec          r3d
    jne .next2drow
    mov          rsp, r6          ; restore stack pointer
    RET
%endmacro

;-----------------------------------------------------------------------------
; chroma_mc4_mmx_func %1, %2, %3
;   %1 = operation prefix (put / avg), %2 = codec (h264, vc1 or rv40),
;   %3 = symbol suffix
;
; Emits %1_%2_chroma_mc4_%3, a 4-pixel-wide bilinear chroma interpolation.
; The filter is applied separably: each source row is first filtered
; horizontally, (8-x)*src[i] + x*src[i+1], and two consecutive horizontal
; results are then combined vertically with weights (8-y) and y, rounded
; with rnd_2d_%2 and shifted right by 6.
;
; Register use: r0 dst, r1 src, r2 stride, r3d h, r4d mx, r5d my.
; rv40 turns r5 into the rounding-table index and, under PIC, uses r11 as
; the table base.
; Two rows are produced per iteration and the loop ends only when r3d hits
; exactly zero, so h has to be a non-zero multiple of 2.
;-----------------------------------------------------------------------------
%macro chroma_mc4_mmx_func 3
cglobal %1_%2_chroma_mc4_%3, 6, 6, 0
%ifdef ARCH_X86_64
    movsxd        r2, r2d
%endif
    pxor          m7, m7
    movd          m2, r4d         ; x
    movd          m3, r5d         ; y
    movq          m4, [pw_8]
    movq          m5, [pw_8]
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    punpcklwd     m2, m2          ; mm2 = x in all four words
    punpcklwd     m3, m3          ; mm3 = y in all four words
    psubw         m4, m2          ; mm4 = 8-x
    psubw         m5, m3          ; mm5 = 8-y

%ifidn %2, rv40
%ifdef PIC
    lea          r11, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r11
%else
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
    ; my is already in mm3, so r5 can become the table index
    and           r5, 6         ; &~1 for mx/my=[0,7]
    lea           r5, [r5*4+r4]
    sar          r5d, 1
%define rnd_bias r5
%else ; vc1, h264
%define rnd_bias 0
%endif

    ; horizontal pass on the first row -> mm6
    movd          m0, [r1  ]
    movd          m6, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m6, m7
    pmullw        m0, m4
    pmullw        m6, m2
    paddw         m6, m0

.next2rows:
    ; horizontal pass on the next row -> mm1, copy kept in mm0
    movd          m0, [r1  ]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    pmullw        m0, m4
    pmullw        m1, m2
    paddw         m1, m0
    movq          m0, m1

    ; vertical pass: previous row (mm6) * (8-y) + this row (mm1) * y
    pmullw        m6, m5
    pmullw        m1, m3
    paddw         m6, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m6
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m6, [r0]
    movd        [r0], m1
    add           r0, r2

    ; same again with the roles of mm0 and mm6 swapped
    movd          m6, [r1  ]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m6, m7
    punpcklbw     m1, m7
    pmullw        m6, m4
    pmullw        m1, m2
    paddw         m1, m6
    movq          m6, m1          ; kept for the next iteration
    pmullw        m0, m5
    pmullw        m1, m3
    paddw         m0, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m0
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m0, [r0]
    movd        [r0], m1
    add           r0, r2
    sub          r3d, 2
    jnz .next2rows
    REP_RET
%endmacro

;-----------------------------------------------------------------------------
; chroma_mc2_mmx_func %1, %2, %3
;   %1 = operation prefix (put / avg), %2 = codec (selects rnd_2d_%2),
;   %3 = symbol suffix
;
; Emits %1_%2_chroma_mc2_%3, a 2-pixel-wide bilinear chroma interpolation.
; The four weights are packed pairwise into 32-bit values so that one
; pmaddwd per source row yields both output pixels:
;   A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y
;
; Register use: r0 dst, r1 src, r2 stride, r3d h, r4d mx, r5d my, r6 scratch.
; r5d is reused inside the loop to move the result out of the MMX register.
; One row is produced per iteration; h has to be non-zero.
;-----------------------------------------------------------------------------
%macro chroma_mc2_mmx_func 3
cglobal %1_%2_chroma_mc2_%3, 6, 7, 0
%ifdef ARCH_X86_64
    movsxd        r2, r2d
%endif

    mov          r6d, r4d
    shl          r4d, 16
    sub          r4d, r6d
    add          r4d, 8           ; x<<16 | (8-x)
    imul         r5d, r4d         ; x*y<<16 | y*(8-x)
    shl          r4d, 3
    sub          r4d, r5d         ; x*(8-y)<<16 | (8-x)*(8-y)

    movd          m5, r4d
    movd          m6, r5d
    punpckldq     m5, m5          ; mm5 = {A,B,A,B}
    punpckldq     m6, m6          ; mm6 = {C,D,C,D}
    pxor          m7, m7
    movd          m2, [r1]
    punpcklbw     m2, m7
    pshufw        m2, m2, 0x94    ; mm2 = src[0,1,1,2]

.nextrow:
    add           r1, r2
    movq          m1, m2
    pmaddwd       m1, m5          ; mm1 = A * src[0,1] + B * src[1,2]
    movd          m0, [r1]
    punpcklbw     m0, m7
    pshufw        m0, m0, 0x94    ; mm0 = src[0,1,1,2]
    movq          m2, m0          ; becomes the upper row of the next iteration
    pmaddwd       m0, m6
    paddw         m1, [rnd_2d_%2]
    paddw         m1, m0          ; mm1 += C * src[0,1] + D * src[1,2]
    psrlw         m1, 6
    packssdw      m1, m7
    packuswb      m1, m7
    CHROMAMC_AVG4 m1, m3, [r0]
    movd         r5d, m1
    mov         [r0], r5w         ; only the two result bytes are stored
    add           r0, r2
    sub          r3d, 1
    jnz .nextrow
    REP_RET
%endmacro

; Fixed rounding constants per codec: rnd_1d_* is added before the >> 3 of the
; 1-D filters, rnd_2d_* before the >> 6 of the 2-D filters. The rv40 names are
; defined inside the function macros because they resolve to a table.
%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1  pw_3
%define rnd_2d_vc1  pw_28

; Implementations that CHROMAMC_AVG / CHROMAMC_AVG4 are pointed at below.

; Expands to nothing; accepts either argument form.
%macro NOTHING 2-3
%endmacro

; %1 = PAVG(%1, %2), with %2 used directly as the PAVG source operand.
%macro DIRECT_AVG 2
    PAVG          %1, %2
%endmacro

; Loads %3 into the scratch register %2 with movd, then %1 = PAVG(%1, %2).
%macro COPY_AVG 3
    movd          %2, %3
    PAVG          %1, %2
%endmacro

INIT_MMX

; "put" variants: both averaging hooks expand to nothing.
%define CHROMAMC_AVG  NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, mmx_rnd
chroma_mc8_mmx_func put, vc1,  mmx_nornd
chroma_mc8_mmx_func put, rv40, mmx
chroma_mc4_mmx_func put, h264, mmx
chroma_mc4_mmx_func put, rv40, mmx
chroma_mc2_mmx_func put, h264, mmx2

; "avg" variants, averaging with pavgb.
%define CHROMAMC_AVG  DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
%define PAVG          pavgb
chroma_mc8_mmx_func avg, h264, mmx2_rnd
chroma_mc8_mmx_func avg, vc1,  mmx2_nornd
chroma_mc8_mmx_func avg, rv40, mmx2
chroma_mc4_mmx_func avg, h264, mmx2
chroma_mc4_mmx_func avg, rv40, mmx2
chroma_mc2_mmx_func avg, h264, mmx2

; "avg" variants, averaging with pavgusb.
%define PAVG          pavgusb
chroma_mc8_mmx_func avg, h264, 3dnow_rnd
chroma_mc8_mmx_func avg, vc1,  3dnow_nornd
chroma_mc8_mmx_func avg, rv40, 3dnow
chroma_mc4_mmx_func avg, h264, 3dnow
chroma_mc4_mmx_func avg, rv40, 3dnow

;-----------------------------------------------------------------------------
; chroma_mc8_ssse3_func %1, %2, %3
;   %1 = operation (put / avg); "avg" additionally loads two dst rows
;   %2 = codec, selects rnd_1d_%2 / rnd_2d_%2
;   %3 = symbol suffix
;
; Emits %1_%2_chroma_mc8_%3, an 8-pixel-wide chroma interpolation built on
; pmaddubsw. Source bytes of two neighbouring pixels are interleaved and
; multiplied by a packed pair of byte weights, producing two rows per loop
; iteration:
;   mx == 0 && my == 0 : plain copy (mv0_pixels_mc8)
;   my == 0            : horizontal 1-D filter, weights (8-x, x), >> 3
;   mx == 0            : vertical   1-D filter, weights (8-y, y), >> 3
;   otherwise          : 2-D filter, weights ((8-x)(8-y), x(8-y)) on the
;                        upper row and ((8-x)y, xy) on the lower row, >> 6
;
; Register use: r0 dst, r1 src, r2 stride, r3d h, r4d mx, r5d my, r6 scratch.
; The rounding constants are read with movdqa, i.e. as full 16-byte vectors.
;-----------------------------------------------------------------------------
%macro chroma_mc8_ssse3_func 3
cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
%ifdef ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
    test         r5d, r5d
    je .my_is_zero
    test         r4d, r4d
    je .mx_is_zero

    ; general case, bilinear
    mov          r6d, r4d
    shl          r4d, 8
    sub           r4, r6
    add           r4, 8           ; x*255+8 = x<<8 | (8-x)
    mov           r6, 8
    sub          r6d, r5d
    imul          r6, r4          ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d         ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)

    movd          m7, r6d
    movd          m6, r4d
    movdqa        m5, [rnd_2d_%2]
    pshuflw       m7, m7, 0
    pshuflw       m6, m6, 0
    movlhps       m7, m7          ; m7 = upper-row weight pair in every word
    movlhps       m6, m6          ; m6 = lower-row weight pair in every word

    movq          m0, [r1     ]
    movq          m1, [r1   +1]
    punpcklbw     m0, m1          ; m0 = first row, src[i]/src[i+1] interleaved
    add           r1, r2
.next2rows:
    movq          m1, [r1     ]
    movq          m2, [r1   +1]
    movq          m3, [r1+r2  ]
    movq          m4, [r1+r2+1]
    lea           r1, [r1+r2*2]
    punpcklbw     m1, m2
    punpcklbw     m3, m4
    movdqa        m2, m1
    movdqa        m4, m3
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    movdqa        m0, m4          ; last row read is the next iteration's upper row
    psrlw         m1, 6
    psrlw         m3, 6
%ifidn %1, avg
    movq          m2, [r0   ]
    movhps        m2, [r0+r2]
%endif
    packuswb      m1, m3
    CHROMAMC_AVG  m1, m2
    movq     [r0   ], m1
    movhps   [r0+r2], m1
    sub          r3d, 2
    lea           r0, [r0+r2*2]   ; lea keeps the flags of the sub for jg
    jg .next2rows
    REP_RET

.my_is_zero:
    mov          r5d, r4d
    shl          r4d, 8
    add           r4, 8
    sub           r4, r5          ; 255*x+8 = x<<8 | (8-x)
    movd          m7, r4d
    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

.next2xrows:
    movq          m0, [r1     ]
    movq          m1, [r1   +1]
    movq          m2, [r1+r2  ]
    movq          m3, [r1+r2+1]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0   ]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq     [r0   ], m0
    movhps   [r0+r2], m0
    sub          r3d, 2
    lea           r0, [r0+r2*2]   ; lea keeps the flags of the sub for jg
    lea           r1, [r1+r2*2]
    jg .next2xrows
    REP_RET

.mx_is_zero:
    mov          r4d, r5d
    shl          r5d, 8
    add           r5, 8
    sub           r5, r4          ; 255*y+8 = y<<8 | (8-y)
    movd          m7, r5d
    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

.next2yrows:
    movq          m0, [r1     ]
    movq          m1, [r1+r2  ]
    movdqa        m2, m1
    movq          m3, [r1+r2*2]
    punpcklbw     m0, m1          ; rows 0/1 interleaved
    punpcklbw     m2, m3          ; rows 1/2 interleaved
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0   ]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq     [r0   ], m0
    movhps   [r0+r2], m0
    sub          r3d, 2
    lea           r0, [r0+r2*2]   ; lea keeps the flags of the sub for jg
    lea           r1, [r1+r2*2]
    jg .next2yrows
    REP_RET
%endmacro

;-----------------------------------------------------------------------------
; chroma_mc4_ssse3_func %1, %2, %3
;   %1 = operation prefix (put / avg), %2 = codec (selects rnd_2d_%2),
;   %3 = symbol suffix
;
; Emits %1_%2_chroma_mc4_%3, a 4-pixel-wide bilinear chroma interpolation
; using pmaddubsw on 64-bit registers. There is no special-casing of
; mx == 0 or my == 0: the general weight formula is used for every input.
; Two rows are produced per loop iteration.
;
; Register use: r0 dst, r1 src, r2 stride, r3d h, r4d mx, r5d my, r6 scratch.
;
; NOTE(review): CHROMAMC_AVG is given [r0] / [r0+r2] as a memory operand
; although only 4 bytes per row are stored afterwards; with DIRECT_AVG this
; looks like an 8-byte read of dst - confirm that this is acceptable.
;-----------------------------------------------------------------------------
%macro chroma_mc4_ssse3_func 3
cglobal %1_%2_chroma_mc4_%3, 6, 7, 0
%ifdef ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov           r6, r4
    shl          r4d, 8
    sub          r4d, r6d
    add          r4d, 8           ; x*255+8 = x<<8 | (8-x)
    mov           r6, 8
    sub          r6d, r5d
    imul         r6d, r4d         ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d         ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)

    movd          m7, r6d
    movd          m6, r4d
    movq          m5, [rnd_2d_%2] ; same rounding source as the mc8 variant
    pshufw        m7, m7, 0       ; m7 = upper-row weight pair in every word
    pshufw        m6, m6, 0       ; m6 = lower-row weight pair in every word

    movd          m0, [r1     ]
    punpcklbw     m0, [r1   +1]   ; m0 = first row, src[i]/src[i+1] interleaved
    add           r1, r2
.next2rows:
    movd          m1, [r1     ]
    movd          m3, [r1+r2  ]
    punpcklbw     m1, [r1   +1]
    punpcklbw     m3, [r1+r2+1]
    lea           r1, [r1+r2*2]
    movq          m2, m1
    movq          m4, m3
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    movq          m0, m4          ; last row read is the next iteration's upper row
    psrlw         m1, 6
    psrlw         m3, 6
    packuswb      m1, m1
    packuswb      m3, m3
    CHROMAMC_AVG  m1, [r0   ]
    CHROMAMC_AVG  m3, [r0+r2]
    movd     [r0   ], m1
    movd     [r0+r2], m3
    sub          r3d, 2
    lea           r0, [r0+r2*2]   ; lea keeps the flags of the sub for jg
    jg .next2rows
    REP_RET
%endmacro

; "put" variants: the averaging hook expands to nothing.
%define CHROMAMC_AVG NOTHING
INIT_XMM
chroma_mc8_ssse3_func put, h264, ssse3_rnd
chroma_mc8_ssse3_func put, vc1,  ssse3_nornd
INIT_MMX
chroma_mc4_ssse3_func put, h264, ssse3

; "avg" variants, averaging with pavgb.
%define CHROMAMC_AVG DIRECT_AVG
%define PAVG         pavgb
INIT_XMM
chroma_mc8_ssse3_func avg, h264, ssse3_rnd
chroma_mc8_ssse3_func avg, vc1,  ssse3_nornd
INIT_MMX
chroma_mc4_ssse3_func avg, h264, ssse3