## ffmpeg / libavcodec / x86 / h264_chromamc.asm @ 2912e87a

History | View | Annotate | Download (17.4 KB)

1 |
;****************************************************************************** |
---|---|

2 |
;* MMX/SSSE3-optimized functions for H264 chroma MC |

3 |
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>, |

4 |
;* 2005-2008 Loren Merritt |

5 |
;* |

6 |
;* This file is part of Libav. |

7 |
;* |

8 |
;* Libav is free software; you can redistribute it and/or |

9 |
;* modify it under the terms of the GNU Lesser General Public |

10 |
;* License as published by the Free Software Foundation; either |

11 |
;* version 2.1 of the License, or (at your option) any later version. |

12 |
;* |

13 |
;* Libav is distributed in the hope that it will be useful, |

14 |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |

15 |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |

16 |
;* Lesser General Public License for more details. |

17 |
;* |

18 |
;* You should have received a copy of the GNU Lesser General Public |

19 |
;* License along with Libav; if not, write to the Free Software |

20 |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |

21 |
;****************************************************************************** |

22 | |

23 |
%include "x86inc.asm" |

24 |
%include "x86util.asm" |

25 | |

26 |
SECTION_RODATA |

27 | |

; RV40 rounding-constant tables.
; Each entry is four identical words (8 bytes) so it can be added directly to
; an MMX register.  The chroma functions below address an entry as
; [table + rnd_bias*8], where rnd_bias = ((my & 6) * 4 + mx) >> 1, which is
; why each table holds 16 entries.

; Rounding terms for the 2-D (bilinear, >> 6) filter.
rnd_rv40_2d_tbl: times 4 dw  0
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw  0
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
; Rounding terms for the 1-D (>> 3) filter.
rnd_rv40_1d_tbl: times 4 dw  0
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  0
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3

60 | |

61 |
cextern pw_3 |

62 |
cextern pw_4 |

63 |
cextern pw_8 |

64 |
cextern pw_28 |

65 |
cextern pw_32 |

66 |
cextern pw_64 |

67 | |

68 |
SECTION .text |

69 | |

; mv0_pixels_mc8: 8-byte-wide row copy used when mx == 0 and my == 0.
; Each loaded row is passed through CHROMAMC_AVG together with the matching
; destination row before being stored (CHROMAMC_AVG is NOTHING for "put").
;   in:       r0 = dst, r1 = src, r2 = stride, r3d = number of rows
;   clobbers: r0, r1, r3d, r4, mm0, mm1, flags
; Four rows are handled per iteration and the loop only exits when the row
; counter reaches exactly zero, so the row count must be a non-zero multiple
; of 4.
%macro mv0_pixels_mc8 0
    lea       r4, [r2*2]                ; r4 = 2 * stride
.next4rows:
    ; rows 0 and 1
    movq      mm0, [r1]
    movq      mm1, [r1+r2]
    CHROMAMC_AVG mm0, [r0]
    CHROMAMC_AVG mm1, [r0+r2]
    movq      [r0], mm0
    movq      [r0+r2], mm1
    add       r0, r4
    add       r1, r4
    ; rows 2 and 3
    movq      mm0, [r1]
    movq      mm1, [r1+r2]
    CHROMAMC_AVG mm0, [r0]
    CHROMAMC_AVG mm1, [r0+r2]
    add       r1, r4
    movq      [r0], mm0
    movq      [r0+r2], mm1
    add       r0, r4
    sub       r3d, 4
    jne       .next4rows
%endmacro

92 | |

; chroma_mc8_mmx_func OP, CODEC, SUFFIX
;   %1 = put / avg   (only used to build the symbol name; the actual averaging
;                     is selected through the CHROMAMC_AVG define)
;   %2 = h264 / vc1 / rv40   (selects rnd_1d_%2 / rnd_2d_%2 rounding constants)
;   %3 = suffix appended to the exported symbol name
; Emits an 8-pixel-wide chroma motion-compensation function with three paths:
;   mx == 0 && my == 0 : plain copy (mv0_pixels_mc8)
;   exactly one is 0   : 1-D filter, (A*p0 + B*p1 + rnd) >> 3
;   both non-zero      : bilinear,   (A*a + B*b + C*c + D*d + rnd) >> 6
%macro chroma_mc8_mmx_func 3
; put/avg_h264_chroma_mc8_mmx_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
;                              int stride, int h, int mx, int my)
cglobal %1_%2_chroma_mc8_%3, 6, 7, 0
%ifdef ARCH_X86_64
    movsxd    r2, r2d
%endif
    mov       r6d, r5d
    or        r6d, r4d
    jne       .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
%ifidn %2, rv40
%ifdef PIC
%define rnd_1d_rv40 r11
%define rnd_2d_rv40 r11
%else ; no-PIC
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
    ; rnd_bias = ((my & 6) * 4 + mx) >> 1 : index into the rv40 rounding tables
%ifdef ARCH_X86_64
    mov       r10, r5
    and       r10, 6                    ; &~1 for mx/my=[0,7]
    lea       r10, [r10*4+r4]
    sar       r10d, 1
%define rnd_bias r10
%define dest_reg r0
%else ; x86-32
    ; r0 is reused for the table index here; dst is reloaded into r5 from its
    ; stack argument slot (r0m) once my is no longer needed.
    mov       r0, r5
    and       r0, 6                     ; &~1 for mx/my=[0,7]
    lea       r0, [r0*4+r4]
    sar       r0d, 1
%define rnd_bias r0
%define dest_reg r5
%endif
%else ; vc1, h264
%define rnd_bias 0
%define dest_reg r0
%endif

    test      r5d, r5d
    mov       r6, 1                     ; mov leaves the flags from test intact
    je        .my_is_zero
    test      r4d, r4d
    mov       r6, r2                    ; dxy = x ? 1 : stride
    jne       .both_non_zero
.my_is_zero:
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    or        r4d, r5d                  ; x + y

%ifidn %2, rv40
%ifdef PIC
    lea       r11, [rnd_rv40_1d_tbl]
%endif
%ifndef ARCH_X86_64
    mov       r5, r0m
%endif
%endif

    movd      m5, r4d
    movq      m4, [pw_8]
    movq      m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
    punpcklwd m5, m5
    punpckldq m5, m5                    ; mm5 = B = x
    pxor      m7, m7
    psubw     m4, m5                    ; mm4 = A = 8-x

.next1drow:
    movq      m0, [r1]                  ; mm0 = src[0..7]
    movq      m2, [r1+r6]               ; mm2 = src[dxy..dxy+7] (right or lower neighbour)

    movq      m1, m0
    movq      m3, m2
    punpcklbw m0, m7
    punpckhbw m1, m7
    punpcklbw m2, m7
    punpckhbw m3, m7
    pmullw    m0, m4                    ; [mm0,mm1] = A * src[0..7]
    pmullw    m1, m4
    pmullw    m2, m5                    ; [mm2,mm3] = B * neighbour[0..7]
    pmullw    m3, m5

    paddw     m0, m6
    paddw     m1, m6
    paddw     m0, m2
    paddw     m1, m3
    psrlw     m0, 3
    psrlw     m1, 3
    packuswb  m0, m1
    CHROMAMC_AVG m0, [dest_reg]
    movq      [dest_reg], m0            ; dst[0..7] = (A * src + B * neighbour + (rnd >> 3)) >> 3

    add       dest_reg, r2
    add       r1, r2
    dec       r3d
    jne       .next1drow
    REP_RET

.both_non_zero: ; general case, bilinear
    movd      m4, r4d                   ; x
    movd      m6, r5d                   ; y
%ifidn %2, rv40
%ifdef PIC
    lea       r11, [rnd_rv40_2d_tbl]
%endif
%ifndef ARCH_X86_64
    mov       r5, r0m
%endif
%endif
    mov       r6, rsp                   ; backup stack pointer
    and       rsp, ~(mmsize-1)          ; align stack
    sub       rsp, 16                   ; AA and DD

    punpcklwd m4, m4
    punpcklwd m6, m6
    punpckldq m4, m4                    ; mm4 = x words
    punpckldq m6, m6                    ; mm6 = y words
    movq      m5, m4
    pmullw    m4, m6                    ; mm4 = x * y
    psllw     m5, 3
    psllw     m6, 3
    movq      m7, m5
    paddw     m7, m6
    movq      [rsp+8], m4               ; DD = x * y
    psubw     m5, m4                    ; mm5 = B = 8x - xy
    psubw     m6, m4                    ; mm6 = C = 8y - xy
    paddw     m4, [pw_64]
    psubw     m4, m7                    ; mm4 = A = xy - (8x+8y) + 64
    pxor      m7, m7
    movq      [rsp], m4                 ; AA = A

    movq      m0, [r1]                  ; mm0 = src[0..7]
    movq      m1, [r1+1]                ; mm1 = src[1..8]
.next2drow:
    add       r1, r2

    movq      m2, m0
    movq      m3, m1
    punpckhbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpckhbw m3, m7
    pmullw    m0, [rsp]
    pmullw    m2, [rsp]
    pmullw    m1, m5
    pmullw    m3, m5
    paddw     m2, m1                    ; mm2 = A * src[0..3] + B * src[1..4]
    paddw     m3, m0                    ; mm3 = A * src[4..7] + B * src[5..8]

    movq      m0, [r1]
    movq      m1, m0
    punpcklbw m0, m7
    punpckhbw m1, m7
    pmullw    m0, m6
    pmullw    m1, m6
    paddw     m2, m0
    paddw     m3, m1                    ; [mm2,mm3] += C * src[0..7]

    movq      m1, [r1+1]
    movq      m0, m1
    movq      m4, m1
    punpcklbw m0, m7
    punpckhbw m4, m7
    pmullw    m0, [rsp+8]
    pmullw    m4, [rsp+8]
    paddw     m2, m0
    paddw     m3, m4                    ; [mm2,mm3] += D * src[1..8]
    movq      m0, [r1]                  ; reload current row; it is the upper row next iteration

    paddw     m2, [rnd_2d_%2+rnd_bias*8]
    paddw     m3, [rnd_2d_%2+rnd_bias*8]
    psrlw     m2, 6
    psrlw     m3, 6
    packuswb  m2, m3
    CHROMAMC_AVG m2, [dest_reg]
    movq      [dest_reg], m2            ; dst[0..7] = ([mm2,mm3] + rnd) >> 6

    add       dest_reg, r2
    dec       r3d
    jne       .next2drow
    mov       rsp, r6                   ; restore stack pointer
    RET
%endmacro

279 | |

; chroma_mc4_mmx_func OP, CODEC, SUFFIX
;   %1 = put / avg (symbol name only; averaging is selected via CHROMAMC_AVG4)
;   %2 = h264 / rv40 (selects the rnd_2d_%2 rounding constant)
;   %3 = suffix appended to the exported symbol name
; Emits a 4-pixel-wide bilinear chroma MC function.  Arguments follow the same
; order as the mc8 variant: r0 = dst, r1 = src, r2 = stride, r3d = h,
; r4d = mx, r5d = my.  Every row is filtered horizontally once and the result
; is kept in a register so it can serve as the upper row of the next output
; line.  Two output rows are produced per loop iteration, and the loop only
; exits when the row counter reaches exactly zero.
%macro chroma_mc4_mmx_func 3
cglobal %1_%2_chroma_mc4_%3, 6, 6, 0
%ifdef ARCH_X86_64
    movsxd    r2, r2d
%endif
    pxor      m7, m7
    movd      m2, r4d                   ; x
    movd      m3, r5d                   ; y
    movq      m4, [pw_8]
    movq      m5, [pw_8]
    punpcklwd m2, m2
    punpcklwd m3, m3
    punpcklwd m2, m2                    ; mm2 = x in all four words
    punpcklwd m3, m3                    ; mm3 = y in all four words
    psubw     m4, m2                    ; mm4 = 8-x
    psubw     m5, m3                    ; mm5 = 8-y

%ifidn %2, rv40
%ifdef PIC
    lea       r11, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r11
%else
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
    ; rnd_bias = ((my & 6) * 4 + mx) >> 1 : index into the rv40 rounding table
    and       r5, 6                     ; &~1 for mx/my=[0,7]
    lea       r5, [r5*4+r4]
    sar       r5d, 1
%define rnd_bias r5
%else ; vc1, h264
%define rnd_bias 0
%endif

    ; horizontally filter the first source row into mm6
    movd      m0, [r1]
    movd      m6, [r1+1]
    add       r1, r2
    punpcklbw m0, m7
    punpcklbw m6, m7
    pmullw    m0, m4
    pmullw    m6, m2
    paddw     m6, m0                    ; mm6 = (8-x)*src[0..3] + x*src[1..4]

.next2rows:
    ; first output row: upper = mm6, lower = freshly filtered row (saved in mm0)
    movd      m0, [r1]
    movd      m1, [r1+1]
    add       r1, r2
    punpcklbw m0, m7
    punpcklbw m1, m7
    pmullw    m0, m4
    pmullw    m1, m2
    paddw     m1, m0
    movq      m0, m1                    ; keep filtered row for the next output row

    pmullw    m6, m5                    ; upper * (8-y)
    pmullw    m1, m3                    ; lower * y
    paddw     m6, [rnd_2d_%2+rnd_bias*8]
    paddw     m1, m6
    psrlw     m1, 6
    packuswb  m1, m1
    CHROMAMC_AVG4 m1, m6, [r0]
    movd      [r0], m1
    add       r0, r2

    ; second output row: upper = mm0, lower = freshly filtered row (saved in mm6)
    movd      m6, [r1]
    movd      m1, [r1+1]
    add       r1, r2
    punpcklbw m6, m7
    punpcklbw m1, m7
    pmullw    m6, m4
    pmullw    m1, m2
    paddw     m1, m6
    movq      m6, m1                    ; keep filtered row for the next iteration
    pmullw    m0, m5                    ; upper * (8-y)
    pmullw    m1, m3                    ; lower * y
    paddw     m0, [rnd_2d_%2+rnd_bias*8]
    paddw     m1, m0
    psrlw     m1, 6
    packuswb  m1, m1
    CHROMAMC_AVG4 m1, m0, [r0]
    movd      [r0], m1
    add       r0, r2
    sub       r3d, 2
    jnz       .next2rows
    REP_RET
%endmacro

364 | |

; chroma_mc2_mmx_func OP, CODEC, SUFFIX
;   %1 = put / avg (symbol name only; averaging is selected via CHROMAMC_AVG4)
;   %2 = codec, selects rnd_2d_%2 (used without a rnd_bias offset here)
;   %3 = suffix appended to the exported symbol name
; Emits a 2-pixel-wide bilinear chroma MC function.  Arguments follow the same
; order as the mc8 variant: r0 = dst, r1 = src, r2 = stride, r3d = h,
; r4d = mx, r5d = my.  The four bilinear weights are packed as word pairs in
; general-purpose registers so each output pixel needs one pmaddwd per source
; row.  One row is produced per iteration.
%macro chroma_mc2_mmx_func 3
cglobal %1_%2_chroma_mc2_%3, 6, 7, 0
%ifdef ARCH_X86_64
    movsxd    r2, r2d
%endif

    mov       r6d, r4d
    shl       r4d, 16
    sub       r4d, r6d
    add       r4d, 8                    ; x<<16 | (8-x)
    imul      r5d, r4d                  ; x*y<<16 | y*(8-x)
    shl       r4d, 3
    sub       r4d, r5d                  ; x*(8-y)<<16 | (8-x)*(8-y)

    movd      m5, r4d
    movd      m6, r5d
    punpckldq m5, m5                    ; mm5 = {A,B,A,B}
    punpckldq m6, m6                    ; mm6 = {C,D,C,D}
    pxor      m7, m7
    movd      m2, [r1]
    punpcklbw m2, m7
    pshufw    m2, m2, 0x94              ; mm2 = src[0,1,1,2]

.nextrow:
    add       r1, r2
    movq      m1, m2
    pmaddwd   m1, m5                    ; mm1 = A * src[0,1] + B * src[1,2]
    movd      m0, [r1]
    punpcklbw m0, m7
    pshufw    m0, m0, 0x94              ; mm0 = src[0,1,1,2]
    movq      m2, m0                    ; keep this row as the upper row of the next iteration
    pmaddwd   m0, m6
    paddw     m1, [rnd_2d_%2]
    paddw     m1, m0                    ; mm1 += C * src[0,1] + D * src[1,2]
    psrlw     m1, 6
    packssdw  m1, m7
    packuswb  m1, m7
    CHROMAMC_AVG4 m1, m3, [r0]
    movd      r5d, m1
    mov       [r0], r5w                 ; only the two result bytes are stored
    add       r0, r2
    sub       r3d, 1
    jnz       .nextrow
    REP_RET
%endmacro

410 | |

; Rounding constants per codec, resolved through rnd_1d_%2 / rnd_2d_%2 in the
; function macros.  The rv40 variants are defined inside those macros instead,
; because they point at a table (or at r11 under PIC).
%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1  pw_3
%define rnd_2d_vc1  pw_28

415 | |

; Averaging helpers selected through CHROMAMC_AVG / CHROMAMC_AVG4.

; NOTHING: accepts 2 or 3 arguments and emits no code ("put" variants).
%macro NOTHING 2-3
%endmacro

; DIRECT_AVG dst_reg, src: PAVG %1 with %2 (register or memory operand).
%macro DIRECT_AVG 2
    PAVG      %1, %2
%endmacro

; COPY_AVG dst_reg, tmp_reg, mem: load %3 into %2 with movd, then PAVG %1, %2.
; Clobbers %2.
%macro COPY_AVG 3
    movd      %2, %3
    PAVG      %1, %2
%endmacro

425 | |

INIT_MMX
; "put" variants: no averaging with the destination.
%define CHROMAMC_AVG  NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, mmx_rnd
chroma_mc8_mmx_func put, vc1,  mmx_nornd
chroma_mc8_mmx_func put, rv40, mmx
chroma_mc4_mmx_func put, h264, mmx
chroma_mc4_mmx_func put, rv40, mmx
chroma_mc2_mmx_func put, h264, mmx2

; "avg" variants using pavgb (mmx2 symbol suffixes).
%define CHROMAMC_AVG  DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
%define PAVG          pavgb
chroma_mc8_mmx_func avg, h264, mmx2_rnd
chroma_mc8_mmx_func avg, vc1,  mmx2_nornd
chroma_mc8_mmx_func avg, rv40, mmx2
chroma_mc4_mmx_func avg, h264, mmx2
chroma_mc4_mmx_func avg, rv40, mmx2
chroma_mc2_mmx_func avg, h264, mmx2

; "avg" variants using pavgusb (3dnow symbol suffixes).
%define PAVG          pavgusb
chroma_mc8_mmx_func avg, h264, 3dnow_rnd
chroma_mc8_mmx_func avg, vc1,  3dnow_nornd
chroma_mc8_mmx_func avg, rv40, 3dnow
chroma_mc4_mmx_func avg, h264, 3dnow
chroma_mc4_mmx_func avg, rv40, 3dnow

452 | |

; chroma_mc8_ssse3_func OP, CODEC, SUFFIX
;   %1 = put / avg (also gates the destination loads needed for averaging)
;   %2 = codec, selects rnd_1d_%2 / rnd_2d_%2 (no rnd_bias table indexing here)
;   %3 = suffix appended to the exported symbol name
; Emits an 8-pixel-wide chroma MC function built on pmaddubsw.  Arguments
; follow the same order as the MMX mc8 variant: r0 = dst, r1 = src,
; r2 = stride, r3d = h, r4d = mx, r5d = my.  Neighbouring pixels are
; interleaved byte-wise and multiplied by a packed {weight0, weight1} byte
; pair.  Paths:
;   mx == 0 && my == 0 : plain copy (mv0_pixels_mc8)
;   my == 0            : horizontal 1-D filter, two rows per iteration
;   mx == 0            : vertical 1-D filter, two rows per iteration
;   otherwise          : bilinear, two rows per iteration
%macro chroma_mc8_ssse3_func 3
cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
%ifdef ARCH_X86_64
    movsxd    r2, r2d
%endif
    mov       r6d, r5d
    or        r6d, r4d
    jne       .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
    test      r5d, r5d
    je        .my_is_zero
    test      r4d, r4d
    je        .mx_is_zero

    ; general case, bilinear
    mov       r6d, r4d
    shl       r4d, 8
    sub       r4, r6
    add       r4, 8                     ; x*255+8 = x<<8 | (8-x)
    mov       r6, 8
    sub       r6d, r5d
    imul      r6, r4                    ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul      r4d, r5d                  ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)

    movd      m7, r6d
    movd      m6, r4d
    movdqa    m5, [rnd_2d_%2]
    pshuflw   m7, m7, 0
    pshuflw   m6, m6, 0
    movlhps   m7, m7                    ; m7 = upper-row weight pair in every word
    movlhps   m6, m6                    ; m6 = lower-row weight pair in every word

    movq      m0, [r1]
    movq      m1, [r1+1]
    punpcklbw m0, m1                    ; m0 = first row, src[i] / src[i+1] interleaved
    add       r1, r2
.next2rows:
    movq      m1, [r1]
    movq      m2, [r1+1]
    movq      m3, [r1+r2]
    movq      m4, [r1+r2+1]
    lea       r1, [r1+r2*2]
    punpcklbw m1, m2
    punpcklbw m3, m4
    movdqa    m2, m1
    movdqa    m4, m3
    pmaddubsw m0, m7
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    pmaddubsw m3, m6
    paddw     m0, m5
    paddw     m2, m5
    paddw     m1, m0
    paddw     m3, m2
    movdqa    m0, m4                    ; last interleaved row becomes the upper row next time
    psrlw     m1, 6
    psrlw     m3, 6
%ifidn %1, avg
    movq      m2, [r0]
    movhps    m2, [r0+r2]
%endif
    packuswb  m1, m3
    CHROMAMC_AVG m1, m2
    movq      [r0], m1
    movhps    [r0+r2], m1
    sub       r3d, 2
    lea       r0, [r0+r2*2]             ; lea keeps the flags from sub for jg
    jg        .next2rows
    REP_RET

.my_is_zero:
    mov       r5d, r4d
    shl       r4d, 8
    add       r4, 8
    sub       r4, r5                    ; 255*x+8 = x<<8 | (8-x)
    movd      m7, r4d
    movdqa    m6, [rnd_1d_%2]
    pshuflw   m7, m7, 0
    movlhps   m7, m7

.next2xrows:
    movq      m0, [r1]
    movq      m1, [r1+1]
    movq      m2, [r1+r2]
    movq      m3, [r1+r2+1]
    punpcklbw m0, m1
    punpcklbw m2, m3
    pmaddubsw m0, m7
    pmaddubsw m2, m7
%ifidn %1, avg
    movq      m4, [r0]
    movhps    m4, [r0+r2]
%endif
    paddw     m0, m6
    paddw     m2, m6
    psrlw     m0, 3
    psrlw     m2, 3
    packuswb  m0, m2
    CHROMAMC_AVG m0, m4
    movq      [r0], m0
    movhps    [r0+r2], m0
    sub       r3d, 2
    lea       r0, [r0+r2*2]
    lea       r1, [r1+r2*2]
    jg        .next2xrows
    REP_RET

.mx_is_zero:
    mov       r4d, r5d
    shl       r5d, 8
    add       r5, 8
    sub       r5, r4                    ; 255*y+8 = y<<8 | (8-y)
    movd      m7, r5d
    movdqa    m6, [rnd_1d_%2]
    pshuflw   m7, m7, 0
    movlhps   m7, m7

.next2yrows:
    movq      m0, [r1]
    movq      m1, [r1+r2]
    movdqa    m2, m1
    movq      m3, [r1+r2*2]
    punpcklbw m0, m1                    ; rows 0 and 1 interleaved
    punpcklbw m2, m3                    ; rows 1 and 2 interleaved
    pmaddubsw m0, m7
    pmaddubsw m2, m7
%ifidn %1, avg
    movq      m4, [r0]
    movhps    m4, [r0+r2]
%endif
    paddw     m0, m6
    paddw     m2, m6
    psrlw     m0, 3
    psrlw     m2, 3
    packuswb  m0, m2
    CHROMAMC_AVG m0, m4
    movq      [r0], m0
    movhps    [r0+r2], m0
    sub       r3d, 2
    lea       r0, [r0+r2*2]
    lea       r1, [r1+r2*2]
    jg        .next2yrows
    REP_RET
%endmacro

601 | |

; chroma_mc4_ssse3_func OP, CODEC, SUFFIX
;   %1 = put / avg (symbol name only; averaging is selected via CHROMAMC_AVG)
;   %2 = codec (symbol name only; the rounding constant is hard-coded to pw_32)
;   %3 = suffix appended to the exported symbol name
; Emits a 4-pixel-wide bilinear chroma MC function using pmaddubsw on MMX
; registers.  Arguments follow the same order as the mc8 variants: r0 = dst,
; r1 = src, r2 = stride, r3d = h, r4d = mx, r5d = my.  Two rows are produced
; per iteration; the interleaved last source row is carried over in m0.
%macro chroma_mc4_ssse3_func 3
cglobal %1_%2_chroma_mc4_%3, 6, 7, 0
%ifdef ARCH_X86_64
    movsxd    r2, r2d
%endif
    mov       r6, r4
    shl       r4d, 8
    sub       r4d, r6d
    add       r4d, 8                    ; x*255+8 = x<<8 | (8-x)
    mov       r6, 8
    sub       r6d, r5d
    imul      r6d, r4d                  ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul      r4d, r5d                  ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)

    movd      m7, r6d
    movd      m6, r4d
    movq      m5, [pw_32]
    pshufw    m7, m7, 0                 ; m7 = upper-row weight pair in every word
    pshufw    m6, m6, 0                 ; m6 = lower-row weight pair in every word

    movd      m0, [r1]
    punpcklbw m0, [r1+1]                ; m0 = first row, src[i] / src[i+1] interleaved
    add       r1, r2
.next2rows:
    movd      m1, [r1]
    movd      m3, [r1+r2]
    punpcklbw m1, [r1+1]
    punpcklbw m3, [r1+r2+1]
    lea       r1, [r1+r2*2]
    movq      m2, m1
    movq      m4, m3
    pmaddubsw m0, m7
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    pmaddubsw m3, m6
    paddw     m0, m5
    paddw     m2, m5
    paddw     m1, m0
    paddw     m3, m2
    movq      m0, m4                    ; last interleaved row becomes the upper row next time
    psrlw     m1, 6
    psrlw     m3, 6
    packuswb  m1, m1
    packuswb  m3, m3
    CHROMAMC_AVG m1, [r0]
    CHROMAMC_AVG m3, [r0+r2]
    movd      [r0], m1
    movd      [r0+r2], m3
    sub       r3d, 2
    lea       r0, [r0+r2*2]             ; lea keeps the flags from sub for jg
    jg        .next2rows
    REP_RET
%endmacro

655 | |

; SSSE3 "put" variants: mc8 is built with XMM registers, mc4 with MMX registers.
%define CHROMAMC_AVG NOTHING
INIT_XMM
chroma_mc8_ssse3_func put, h264, ssse3_rnd
chroma_mc8_ssse3_func put, vc1,  ssse3_nornd
INIT_MMX
chroma_mc4_ssse3_func put, h264, ssse3

; SSSE3 "avg" variants, averaging with pavgb.
%define CHROMAMC_AVG DIRECT_AVG
%define PAVG         pavgb
INIT_XMM
chroma_mc8_ssse3_func avg, h264, ssse3_rnd
chroma_mc8_ssse3_func avg, vc1,  ssse3_nornd
INIT_MMX
chroma_mc4_ssse3_func avg, h264, ssse3