;; File: libavcodec/x86/h264_deblock_10bit.asm (revision 888fa31e)

;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

26 | |||

27 | %include "x86inc.asm" |
||

28 | %include "x86util.asm" |
||

29 | |||

30 | SECTION_RODATA |
||

31 | |||

32 | pw_pixel_max: times 8 dw ((1 << 10)-1) |
||

33 | |||

34 | SECTION .text |
||

35 | |||

36 | cextern pw_2 |
||

37 | 5705b020 | Jason Garrett-Glaser | cextern pw_3 |

38 | 9f3d6ca4 | Jason Garrett-Glaser | cextern pw_4 |

39 | |||

40 | ; out: %4 = |%1-%2|-%3 |
||

41 | ; clobbers: %5 |
||

42 | %macro ABS_SUB 5 |
||

43 | psubusw %5, %2, %1 |
||

44 | psubusw %4, %1, %2 |
||

45 | por %4, %5 |
||

46 | psubw %4, %3 |
||

47 | %endmacro |
||

48 | |||

49 | ; out: %4 = |%1-%2|<%3 |
||

50 | %macro DIFF_LT 5 |
||

51 | psubusw %4, %2, %1 |
||

52 | psubusw %5, %1, %2 |
||

53 | por %5, %4 ; |%1-%2| |
||

54 | pxor %4, %4 |
||

55 | psubw %5, %3 ; |%1-%2|-%3 |
||

56 | pcmpgtw %4, %5 ; 0 > |%1-%2|-%3 |
||

57 | %endmacro |
||

58 | |||

59 | %macro LOAD_AB 4 |
||

60 | movd %1, %3 |
||

61 | movd %2, %4 |
||

62 | SPLATW %1, %1 |
||

63 | SPLATW %2, %2 |
||

64 | %endmacro |
||

65 | |||

66 | ; in: %2=tc reg |
||

67 | ; out: %1=splatted tc |
||

68 | %macro LOAD_TC 2 |
||

69 | movd %1, [%2] |
||

70 | punpcklbw %1, %1 |
||

71 | %if mmsize == 8 |
||

72 | pshufw %1, %1, 0 |
||

73 | %else |
||

74 | pshuflw %1, %1, 01010000b |
||

75 | pshufd %1, %1, 01010000b |
||

76 | %endif |
||

77 | psraw %1, 6 |
||

78 | %endmacro |
||

79 | |||

80 | ; in: %1=p1, %2=p0, %3=q0, %4=q1 |
||

81 | ; %5=alpha, %6=beta, %7-%9=tmp |
||

82 | ; out: %7=mask |
||

83 | %macro LOAD_MASK 9 |
||

84 | ABS_SUB %2, %3, %5, %8, %7 ; |p0-q0| - alpha |
||

85 | ABS_SUB %1, %2, %6, %9, %7 ; |p1-p0| - beta |
||

86 | pand %8, %9 |
||

87 | ABS_SUB %3, %4, %6, %9, %7 ; |q1-q0| - beta |
||

88 | pxor %7, %7 |
||

89 | pand %8, %9 |
||

90 | pcmpgtw %7, %8 |
||

91 | %endmacro |
||

92 | |||

93 | ; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp |
||

94 | ; out: %1=p0', m2=q0' |
||

95 | %macro DEBLOCK_P0_Q0 7 |
||

96 | psubw %3, %4 |
||

97 | pxor %7, %7 |
||

98 | paddw %3, [pw_4] |
||

99 | psubw %7, %5 |
||

100 | psubw %6, %2, %1 |
||

101 | psllw %6, 2 |
||

102 | paddw %3, %6 |
||

103 | psraw %3, 3 |
||

104 | mova %6, [pw_pixel_max] |
||

105 | CLIPW %3, %7, %5 |
||

106 | pxor %7, %7 |
||

107 | paddw %1, %3 |
||

108 | psubw %2, %3 |
||

109 | CLIPW %1, %7, %6 |
||

110 | CLIPW %2, %7, %6 |
||

111 | %endmacro |
||

112 | |||

113 | ; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp |
||

114 | %macro LUMA_Q1 6 |
||

115 | pavgw %6, %3, %4 ; (p0+q0+1)>>1 |
||

116 | paddw %1, %6 |
||

117 | pxor %6, %6 |
||

118 | psraw %1, 1 |
||

119 | psubw %6, %5 |
||

120 | psubw %1, %2 |
||

121 | CLIPW %1, %6, %5 |
||

122 | paddw %1, %2 |
||

123 | %endmacro |
||

124 | |||

125 | %macro LUMA_DEBLOCK_ONE 3 |
||

126 | DIFF_LT m5, %1, bm, m4, m6 |
||

127 | pxor m6, m6 |
||

128 | mova %3, m4 |
||

129 | pcmpgtw m6, tcm |
||

130 | pand m4, tcm |
||

131 | pandn m6, m7 |
||

132 | pand m4, m6 |
||

133 | LUMA_Q1 m5, %2, m1, m2, m4, m6 |
||

134 | %endmacro |
||

135 | |||

136 | %macro LUMA_H_STORE 2 |
||

137 | %if mmsize == 8 |
||

138 | movq [r0-4], m0 |
||

139 | movq [r0+r1-4], m1 |
||

140 | movq [r0+r1*2-4], m2 |
||

141 | movq [r0+%2-4], m3 |
||

142 | %else |
||

143 | movq [r0-4], m0 |
||

144 | movhps [r0+r1-4], m0 |
||

145 | movq [r0+r1*2-4], m1 |
||

146 | movhps [%1-4], m1 |
||

147 | movq [%1+r1-4], m2 |
||

148 | movhps [%1+r1*2-4], m2 |
||

149 | movq [%1+%2-4], m3 |
||

150 | movhps [%1+r1*4-4], m3 |
||

151 | %endif |
||

152 | %endmacro |
||

153 | |||

154 | %macro DEBLOCK_LUMA 1 |
||

155 | ;----------------------------------------------------------------------------- |
||

156 | ; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |
||

157 | ;----------------------------------------------------------------------------- |
||

158 | cglobal deblock_v_luma_10_%1, 5,5,8*(mmsize/16) |
||

159 | %assign pad 5*mmsize+12-(stack_offset&15) |
||

160 | %define tcm [rsp] |
||

161 | %define ms1 [rsp+mmsize] |
||

162 | %define ms2 [rsp+mmsize*2] |
||

163 | %define am [rsp+mmsize*3] |
||

164 | %define bm [rsp+mmsize*4] |
||

165 | SUB rsp, pad |
||

166 | shl r2d, 2 |
||

167 | shl r3d, 2 |
||

168 | LOAD_AB m4, m5, r2, r3 |
||

169 | mov r3, 32/mmsize |
||

170 | mov r2, r0 |
||

171 | sub r0, r1 |
||

172 | mova am, m4 |
||

173 | sub r0, r1 |
||

174 | mova bm, m5 |
||

175 | sub r0, r1 |
||

176 | .loop: |
||

177 | mova m0, [r0+r1] |
||

178 | mova m1, [r0+r1*2] |
||

179 | mova m2, [r2] |
||

180 | mova m3, [r2+r1] |
||

181 | |||

182 | LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6 |
||

183 | LOAD_TC m6, r4 |
||

184 | mova tcm, m6 |
||

185 | |||

186 | mova m5, [r0] |
||

187 | LUMA_DEBLOCK_ONE m1, m0, ms1 |
||

188 | mova [r0+r1], m5 |
||

189 | |||

190 | mova m5, [r2+r1*2] |
||

191 | LUMA_DEBLOCK_ONE m2, m3, ms2 |
||

192 | mova [r2+r1], m5 |
||

193 | |||

194 | pxor m5, m5 |
||

195 | mova m6, tcm |
||

196 | pcmpgtw m5, tcm |
||

197 | psubw m6, ms1 |
||

198 | pandn m5, m7 |
||

199 | psubw m6, ms2 |
||

200 | pand m5, m6 |
||

201 | DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6 |
||

202 | mova [r0+r1*2], m1 |
||

203 | mova [r2], m2 |
||

204 | |||

205 | add r0, mmsize |
||

206 | add r2, mmsize |
||

207 | add r4, mmsize/8 |
||

208 | dec r3 |
||

209 | jg .loop |
||

210 | ADD rsp, pad |
||

211 | RET |
||

212 | |||

213 | cglobal deblock_h_luma_10_%1, 5,6,8*(mmsize/16) |
||

214 | %assign pad 7*mmsize+12-(stack_offset&15) |
||

215 | %define tcm [rsp] |
||

216 | %define ms1 [rsp+mmsize] |
||

217 | %define ms2 [rsp+mmsize*2] |
||

218 | %define p1m [rsp+mmsize*3] |
||

219 | %define p2m [rsp+mmsize*4] |
||

220 | %define am [rsp+mmsize*5] |
||

221 | %define bm [rsp+mmsize*6] |
||

222 | SUB rsp, pad |
||

223 | shl r2d, 2 |
||

224 | shl r3d, 2 |
||

225 | LOAD_AB m4, m5, r2, r3 |
||

226 | mov r3, r1 |
||

227 | mova am, m4 |
||

228 | add r3, r1 |
||

229 | mov r5, 32/mmsize |
||

230 | mova bm, m5 |
||

231 | add r3, r1 |
||

232 | %if mmsize == 16 |
||

233 | mov r2, r0 |
||

234 | add r2, r3 |
||

235 | %endif |
||

236 | .loop: |
||

237 | %if mmsize == 8 |
||

238 | movq m2, [r0-8] ; y q2 q1 q0 |
||

239 | movq m7, [r0+0] |
||

240 | movq m5, [r0+r1-8] |
||

241 | movq m3, [r0+r1+0] |
||

242 | movq m0, [r0+r1*2-8] |
||

243 | movq m6, [r0+r1*2+0] |
||

244 | movq m1, [r0+r3-8] |
||

245 | TRANSPOSE4x4W 2, 5, 0, 1, 4 |
||

246 | SWAP 2, 7 |
||

247 | movq m7, [r0+r3] |
||

248 | TRANSPOSE4x4W 2, 3, 6, 7, 4 |
||

249 | %else |
||

250 | movu m5, [r0-8] ; y q2 q1 q0 p0 p1 p2 x |
||

251 | movu m0, [r0+r1-8] |
||

252 | movu m2, [r0+r1*2-8] |
||

253 | movu m3, [r2-8] |
||

254 | TRANSPOSE4x4W 5, 0, 2, 3, 6 |
||

255 | mova tcm, m3 |
||

256 | |||

257 | movu m4, [r2+r1-8] |
||

258 | movu m1, [r2+r1*2-8] |
||

259 | movu m3, [r2+r3-8] |
||

260 | movu m7, [r2+r1*4-8] |
||

261 | TRANSPOSE4x4W 4, 1, 3, 7, 6 |
||

262 | |||

263 | mova m6, tcm |
||

264 | punpcklqdq m6, m7 |
||

265 | punpckhqdq m5, m4 |
||

266 | SBUTTERFLY qdq, 0, 1, 7 |
||

267 | SBUTTERFLY qdq, 2, 3, 7 |
||

268 | %endif |
||

269 | |||

270 | mova p2m, m6 |
||

271 | LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6 |
||

272 | LOAD_TC m6, r4 |
||

273 | mova tcm, m6 |
||

274 | |||

275 | LUMA_DEBLOCK_ONE m1, m0, ms1 |
||

276 | mova p1m, m5 |
||

277 | |||

278 | mova m5, p2m |
||

279 | LUMA_DEBLOCK_ONE m2, m3, ms2 |
||

280 | mova p2m, m5 |
||

281 | |||

282 | pxor m5, m5 |
||

283 | mova m6, tcm |
||

284 | pcmpgtw m5, tcm |
||

285 | psubw m6, ms1 |
||

286 | pandn m5, m7 |
||

287 | psubw m6, ms2 |
||

288 | pand m5, m6 |
||

289 | DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6 |
||

290 | mova m0, p1m |
||

291 | mova m3, p2m |
||

292 | TRANSPOSE4x4W 0, 1, 2, 3, 4 |
||

293 | LUMA_H_STORE r2, r3 |
||

294 | |||

295 | add r4, mmsize/8 |
||

296 | lea r0, [r0+r1*(mmsize/2)] |
||

297 | lea r2, [r2+r1*(mmsize/2)] |
||

298 | dec r5 |
||

299 | jg .loop |
||

300 | ADD rsp, pad |
||

301 | RET |
||

302 | %endmacro |
||

303 | |||

304 | INIT_XMM |
||

305 | %ifdef ARCH_X86_64 |
||

306 | ; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2 |
||

307 | ; m12=alpha, m13=beta |
||

308 | ; out: m0=p1', m3=q1', m1=p0', m2=q0' |
||

309 | ; clobbers: m4, m5, m6, m7, m10, m11, m14 |
||

310 | %macro DEBLOCK_LUMA_INTER_SSE2 0 |
||

311 | LOAD_MASK m0, m1, m2, m3, m12, m13, m7, m4, m6 |
||

312 | LOAD_TC m6, r4 |
||

313 | DIFF_LT m8, m1, m13, m10, m4 |
||

314 | DIFF_LT m9, m2, m13, m11, m4 |
||

315 | pand m6, m7 |
||

316 | |||

317 | mova m14, m6 |
||

318 | pxor m4, m4 |
||

319 | pcmpgtw m6, m4 |
||

320 | pand m6, m14 |
||

321 | |||

322 | mova m5, m10 |
||

323 | pand m5, m6 |
||

324 | LUMA_Q1 m8, m0, m1, m2, m5, m4 |
||

325 | |||

326 | mova m5, m11 |
||

327 | pand m5, m6 |
||

328 | LUMA_Q1 m9, m3, m1, m2, m5, m4 |
||

329 | |||

330 | pxor m4, m4 |
||

331 | psubw m6, m10 |
||

332 | pcmpgtw m4, m14 |
||

333 | pandn m4, m7 |
||

334 | psubw m6, m11 |
||

335 | pand m4, m6 |
||

336 | DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6 |
||

337 | |||

338 | SWAP 0, 8 |
||

339 | SWAP 3, 9 |
||

340 | %endmacro |
||

341 | |||

342 | %macro DEBLOCK_LUMA_64 1 |
||

343 | cglobal deblock_v_luma_10_%1, 5,5,15 |
||

344 | %define p2 m8 |
||

345 | %define p1 m0 |
||

346 | %define p0 m1 |
||

347 | %define q0 m2 |
||

348 | %define q1 m3 |
||

349 | %define q2 m9 |
||

350 | %define mask0 m7 |
||

351 | %define mask1 m10 |
||

352 | %define mask2 m11 |
||

353 | shl r2d, 2 |
||

354 | shl r3d, 2 |
||

355 | LOAD_AB m12, m13, r2, r3 |
||

356 | mov r2, r0 |
||

357 | sub r0, r1 |
||

358 | sub r0, r1 |
||

359 | sub r0, r1 |
||

360 | mov r3, 2 |
||

361 | .loop: |
||

362 | mova p2, [r0] |
||

363 | mova p1, [r0+r1] |
||

364 | mova p0, [r0+r1*2] |
||

365 | mova q0, [r2] |
||

366 | mova q1, [r2+r1] |
||

367 | mova q2, [r2+r1*2] |
||

368 | DEBLOCK_LUMA_INTER_SSE2 |
||

369 | mova [r0+r1], p1 |
||

370 | mova [r0+r1*2], p0 |
||

371 | mova [r2], q0 |
||

372 | mova [r2+r1], q1 |
||

373 | add r0, mmsize |
||

374 | add r2, mmsize |
||

375 | add r4, 2 |
||

376 | dec r3 |
||

377 | jg .loop |
||

378 | REP_RET |
||

379 | |||

380 | cglobal deblock_h_luma_10_%1, 5,7,15 |
||

381 | shl r2d, 2 |
||

382 | shl r3d, 2 |
||

383 | LOAD_AB m12, m13, r2, r3 |
||

384 | mov r2, r1 |
||

385 | add r2, r1 |
||

386 | add r2, r1 |
||

387 | mov r5, r0 |
||

388 | add r5, r2 |
||

389 | mov r6, 2 |
||

390 | .loop: |
||

391 | movu m8, [r0-8] ; y q2 q1 q0 p0 p1 p2 x |
||

392 | movu m0, [r0+r1-8] |
||

393 | movu m2, [r0+r1*2-8] |
||

394 | movu m9, [r5-8] |
||

395 | movu m5, [r5+r1-8] |
||

396 | movu m1, [r5+r1*2-8] |
||

397 | movu m3, [r5+r2-8] |
||

398 | movu m7, [r5+r1*4-8] |
||

399 | |||

400 | TRANSPOSE4x4W 8, 0, 2, 9, 10 |
||

401 | TRANSPOSE4x4W 5, 1, 3, 7, 10 |
||

402 | |||

403 | punpckhqdq m8, m5 |
||

404 | SBUTTERFLY qdq, 0, 1, 10 |
||

405 | SBUTTERFLY qdq, 2, 3, 10 |
||

406 | punpcklqdq m9, m7 |
||

407 | |||

408 | DEBLOCK_LUMA_INTER_SSE2 |
||

409 | |||

410 | TRANSPOSE4x4W 0, 1, 2, 3, 4 |
||

411 | LUMA_H_STORE r5, r2 |
||

412 | add r4, 2 |
||

413 | lea r0, [r0+r1*8] |
||

414 | lea r5, [r5+r1*8] |
||

415 | dec r6 |
||

416 | jg .loop |
||

417 | REP_RET |
||

418 | %endmacro |
||

419 | |||

420 | INIT_XMM |
||

421 | DEBLOCK_LUMA_64 sse2 |
||

422 | INIT_AVX |
||

423 | DEBLOCK_LUMA_64 avx |
||

424 | %endif |
||

425 | |||

426 | %macro SWAPMOVA 2 |
||

427 | %ifid %1 |
||

428 | SWAP %1, %2 |
||

429 | %else |
||

430 | mova %1, %2 |
||

431 | %endif |
||

432 | %endmacro |
||

433 | |||

434 | ; in: t0-t2: tmp registers |
||

435 | ; %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0 |
||

436 | ; %8=mask1p %9=2 %10=p0' %11=p1' %12=p2' |
||

437 | %macro LUMA_INTRA_P012 12 ; p0..p3 in memory |
||

438 | %ifdef ARCH_X86_64 |
||

439 | paddw t0, %3, %2 |
||

440 | mova t2, %4 |
||

441 | paddw t2, %3 |
||

442 | %else |
||

443 | mova t0, %3 |
||

444 | mova t2, %4 |
||

445 | paddw t0, %2 |
||

446 | paddw t2, %3 |
||

447 | %endif |
||

448 | paddw t0, %1 |
||

449 | paddw t2, t2 |
||

450 | paddw t0, %5 |
||

451 | paddw t2, %9 |
||

452 | paddw t0, %9 ; (p2 + p1 + p0 + q0 + 2) |
||

453 | paddw t2, t0 ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4) |
||

454 | |||

455 | psrlw t2, 3 |
||

456 | psrlw t1, t0, 2 |
||

457 | psubw t2, %3 |
||

458 | psubw t1, %2 |
||

459 | pand t2, %8 |
||

460 | pand t1, %8 |
||

461 | paddw t2, %3 |
||

462 | paddw t1, %2 |
||

463 | SWAPMOVA %11, t1 |
||

464 | |||

465 | psubw t1, t0, %3 |
||

466 | paddw t0, t0 |
||

467 | psubw t1, %5 |
||

468 | psubw t0, %3 |
||

469 | paddw t1, %6 |
||

470 | paddw t1, %2 |
||

471 | paddw t0, %6 |
||

472 | psrlw t1, 2 ; (2*p1 + p0 + q1 + 2)/4 |
||

473 | psrlw t0, 3 ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3 |
||

474 | |||

475 | pxor t0, t1 |
||

476 | pxor t1, %1 |
||

477 | pand t0, %8 |
||

478 | pand t1, %7 |
||

479 | pxor t0, t1 |
||

480 | pxor t0, %1 |
||

481 | SWAPMOVA %10, t0 |
||

482 | SWAPMOVA %12, t2 |
||

483 | %endmacro |
||

484 | |||

485 | %macro LUMA_INTRA_INIT 1 |
||

486 | %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15) |
||

487 | %define t0 m4 |
||

488 | %define t1 m5 |
||

489 | %define t2 m6 |
||

490 | %define t3 m7 |
||

491 | %assign i 4 |
||

492 | %rep %1 |
||

493 | CAT_XDEFINE t, i, [rsp+mmsize*(i-4)] |
||

494 | %assign i i+1 |
||

495 | %endrep |
||

496 | SUB rsp, pad |
||

497 | %endmacro |
||

498 | |||

499 | ; in: %1-%3=tmp, %4=p2, %5=q2 |
||

500 | %macro LUMA_INTRA_INTER 5 |
||

501 | LOAD_AB t0, t1, r2d, r3d |
||

502 | mova %1, t0 |
||

503 | LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3 |
||

504 | %ifdef ARCH_X86_64 |
||

505 | mova %2, t0 ; mask0 |
||

506 | psrlw t3, %1, 2 |
||

507 | %else |
||

508 | mova t3, %1 |
||

509 | mova %2, t0 ; mask0 |
||

510 | psrlw t3, 2 |
||

511 | %endif |
||

512 | paddw t3, [pw_2] ; alpha/4+2 |
||

513 | DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2 |
||

514 | pand t2, %2 |
||

515 | mova t3, %5 ; q2 |
||

516 | mova %1, t2 ; mask1 |
||

517 | DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta |
||

518 | pand t2, %1 |
||

519 | mova t3, %4 ; p2 |
||

520 | mova %3, t2 ; mask1q |
||

521 | DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta |
||

522 | pand t2, %1 |
||

523 | mova %1, t2 ; mask1p |
||

524 | %endmacro |
||

525 | |||

526 | %macro LUMA_H_INTRA_LOAD 0 |
||

527 | %if mmsize == 8 |
||

528 | movu t0, [r0-8] |
||

529 | movu t1, [r0+r1-8] |
||

530 | movu m0, [r0+r1*2-8] |
||

531 | movu m1, [r0+r4-8] |
||

532 | TRANSPOSE4x4W 4, 5, 0, 1, 2 |
||

533 | mova t4, t0 ; p3 |
||

534 | mova t5, t1 ; p2 |
||

535 | |||

536 | movu m2, [r0] |
||

537 | movu m3, [r0+r1] |
||

538 | movu t0, [r0+r1*2] |
||

539 | movu t1, [r0+r4] |
||

540 | TRANSPOSE4x4W 2, 3, 4, 5, 6 |
||

541 | mova t6, t0 ; q2 |
||

542 | mova t7, t1 ; q3 |
||

543 | %else |
||

544 | movu t0, [r0-8] |
||

545 | movu t1, [r0+r1-8] |
||

546 | movu m0, [r0+r1*2-8] |
||

547 | movu m1, [r0+r5-8] |
||

548 | movu m2, [r4-8] |
||

549 | movu m3, [r4+r1-8] |
||

550 | movu t2, [r4+r1*2-8] |
||

551 | movu t3, [r4+r5-8] |
||

552 | TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5 |
||

553 | mova t4, t0 ; p3 |
||

554 | mova t5, t1 ; p2 |
||

555 | mova t6, t2 ; q2 |
||

556 | mova t7, t3 ; q3 |
||

557 | %endif |
||

558 | %endmacro |
||

559 | |||

560 | ; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp |
||

561 | %macro LUMA_H_INTRA_STORE 9 |
||

562 | %if mmsize == 8 |
||

563 | TRANSPOSE4x4W %1, %2, %3, %4, %9 |
||

564 | movq [r0-8], m%1 |
||

565 | movq [r0+r1-8], m%2 |
||

566 | movq [r0+r1*2-8], m%3 |
||

567 | movq [r0+r4-8], m%4 |
||

568 | movq m%1, %8 |
||

569 | TRANSPOSE4x4W %5, %6, %7, %1, %9 |
||

570 | movq [r0], m%5 |
||

571 | movq [r0+r1], m%6 |
||

572 | movq [r0+r1*2], m%7 |
||

573 | movq [r0+r4], m%1 |
||

574 | %else |
||

575 | TRANSPOSE2x4x4W %1, %2, %3, %4, %9 |
||

576 | movq [r0-8], m%1 |
||

577 | movq [r0+r1-8], m%2 |
||

578 | movq [r0+r1*2-8], m%3 |
||

579 | movq [r0+r5-8], m%4 |
||

580 | movhps [r4-8], m%1 |
||

581 | movhps [r4+r1-8], m%2 |
||

582 | movhps [r4+r1*2-8], m%3 |
||

583 | movhps [r4+r5-8], m%4 |
||

584 | %ifnum %8 |
||

585 | SWAP %1, %8 |
||

586 | %else |
||

587 | mova m%1, %8 |
||

588 | %endif |
||

589 | TRANSPOSE2x4x4W %5, %6, %7, %1, %9 |
||

590 | movq [r0], m%5 |
||

591 | movq [r0+r1], m%6 |
||

592 | movq [r0+r1*2], m%7 |
||

593 | movq [r0+r5], m%1 |
||

594 | movhps [r4], m%5 |
||

595 | movhps [r4+r1], m%6 |
||

596 | movhps [r4+r1*2], m%7 |
||

597 | movhps [r4+r5], m%1 |
||

598 | %endif |
||

599 | %endmacro |
||

600 | |||

601 | %ifdef ARCH_X86_64 |
||

602 | ;----------------------------------------------------------------------------- |
||

603 | ; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) |
||

604 | ;----------------------------------------------------------------------------- |
||

605 | %macro DEBLOCK_LUMA_INTRA_64 1 |
||

606 | cglobal deblock_v_luma_intra_10_%1, 4,7,16 |
||

607 | %define t0 m1 |
||

608 | %define t1 m2 |
||

609 | %define t2 m4 |
||

610 | %define p2 m8 |
||

611 | %define p1 m9 |
||

612 | %define p0 m10 |
||

613 | %define q0 m11 |
||

614 | %define q1 m12 |
||

615 | %define q2 m13 |
||

616 | %define aa m5 |
||

617 | %define bb m14 |
||

618 | lea r4, [r1*4] |
||

619 | lea r5, [r1*3] ; 3*stride |
||

620 | neg r4 |
||

621 | add r4, r0 ; pix-4*stride |
||

622 | mov r6, 2 |
||

623 | mova m0, [pw_2] |
||

624 | shl r2d, 2 |
||

625 | shl r3d, 2 |
||

626 | LOAD_AB aa, bb, r2d, r3d |
||

627 | .loop |
||

628 | mova p2, [r4+r1] |
||

629 | mova p1, [r4+2*r1] |
||

630 | mova p0, [r4+r5] |
||

631 | mova q0, [r0] |
||

632 | mova q1, [r0+r1] |
||

633 | mova q2, [r0+2*r1] |
||

634 | |||

635 | LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1 |
||

636 | mova t2, aa |
||

637 | psrlw t2, 2 |
||

638 | paddw t2, m0 ; alpha/4+2 |
||

639 | DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2 |
||

640 | DIFF_LT p2, p0, bb, t1, t0 ; m7 = |p2-p0| < beta |
||

641 | DIFF_LT q2, q0, bb, m7, t0 ; t1 = |q2-q0| < beta |
||

642 | pand m6, m3 |
||

643 | pand m7, m6 |
||

644 | pand m6, t1 |
||

645 | LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1] |
||

646 | LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1] |
||

647 | add r0, mmsize |
||

648 | add r4, mmsize |
||

649 | dec r6 |
||

650 | jg .loop |
||

651 | REP_RET |
||

652 | |||

653 | ;----------------------------------------------------------------------------- |
||

654 | ; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) |
||

655 | ;----------------------------------------------------------------------------- |
||

656 | cglobal deblock_h_luma_intra_10_%1, 4,7,16 |
||

657 | %define t0 m15 |
||

658 | %define t1 m14 |
||

659 | %define t2 m2 |
||

660 | %define q3 m5 |
||

661 | %define q2 m8 |
||

662 | %define q1 m9 |
||

663 | %define q0 m10 |
||

664 | %define p0 m11 |
||

665 | %define p1 m12 |
||

666 | %define p2 m13 |
||

667 | %define p3 m4 |
||

668 | %define spill [rsp] |
||

669 | %assign pad 24-(stack_offset&15) |
||

670 | SUB rsp, pad |
||

671 | lea r4, [r1*4] |
||

672 | lea r5, [r1*3] ; 3*stride |
||

673 | add r4, r0 ; pix+4*stride |
||

674 | mov r6, 2 |
||

675 | mova m0, [pw_2] |
||

676 | shl r2d, 2 |
||

677 | shl r3d, 2 |
||

678 | .loop |
||

679 | movu q3, [r0-8] |
||

680 | movu q2, [r0+r1-8] |
||

681 | movu q1, [r0+r1*2-8] |
||

682 | movu q0, [r0+r5-8] |
||

683 | movu p0, [r4-8] |
||

684 | movu p1, [r4+r1-8] |
||

685 | movu p2, [r4+r1*2-8] |
||

686 | movu p3, [r4+r5-8] |
||

687 | TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1 |
||

688 | |||

689 | LOAD_AB m1, m2, r2d, r3d |
||

690 | LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1 |
||

691 | psrlw m1, 2 |
||

692 | paddw m1, m0 ; alpha/4+2 |
||

693 | DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2 |
||

694 | DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta |
||

695 | DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta |
||

696 | pand m6, m3 |
||

697 | pand m7, m6 |
||

698 | pand m6, t1 |
||

699 | |||

700 | mova spill, q3 |
||

701 | LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2 |
||

702 | LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2 |
||

703 | mova m7, spill |
||

704 | |||

705 | LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14 |
||

706 | |||

707 | lea r0, [r0+r1*8] |
||

708 | lea r4, [r4+r1*8] |
||

709 | dec r6 |
||

710 | jg .loop |
||

711 | ADD rsp, pad |
||

712 | RET |
||

713 | %endmacro |
||

714 | |||

715 | INIT_XMM |
||

716 | DEBLOCK_LUMA_INTRA_64 sse2 |
||

717 | INIT_AVX |
||

718 | DEBLOCK_LUMA_INTRA_64 avx |
||

719 | |||

720 | %endif |
||

721 | |||

722 | %macro DEBLOCK_LUMA_INTRA 1 |
||

723 | ;----------------------------------------------------------------------------- |
||

724 | ; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) |
||

725 | ;----------------------------------------------------------------------------- |
||

726 | cglobal deblock_v_luma_intra_10_%1, 4,7,8*(mmsize/16) |
||

727 | LUMA_INTRA_INIT 3 |
||

728 | lea r4, [r1*4] |
||

729 | lea r5, [r1*3] |
||

730 | neg r4 |
||

731 | add r4, r0 |
||

732 | mov r6, 32/mmsize |
||

733 | shl r2d, 2 |
||

734 | shl r3d, 2 |
||

735 | .loop: |
||

736 | mova m0, [r4+r1*2] ; p1 |
||

737 | mova m1, [r4+r5] ; p0 |
||

738 | mova m2, [r0] ; q0 |
||

739 | mova m3, [r0+r1] ; q1 |
||

740 | LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2] |
||

741 | LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1] |
||

742 | mova t3, [r0+r1*2] ; q2 |
||

743 | LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1] |
||

744 | add r0, mmsize |
||

745 | add r4, mmsize |
||

746 | dec r6 |
||

747 | jg .loop |
||

748 | ADD rsp, pad |
||

749 | RET |
||

750 | |||

751 | ;----------------------------------------------------------------------------- |
||

752 | ; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) |
||

753 | ;----------------------------------------------------------------------------- |
||

754 | cglobal deblock_h_luma_intra_10_%1, 4,7,8*(mmsize/16) |
||

755 | LUMA_INTRA_INIT 8 |
||

756 | %if mmsize == 8 |
||

757 | lea r4, [r1*3] |
||

758 | mov r5, 32/mmsize |
||

759 | %else |
||

760 | lea r4, [r1*4] |
||

761 | lea r5, [r1*3] ; 3*stride |
||

762 | add r4, r0 ; pix+4*stride |
||

763 | mov r6, 32/mmsize |
||

764 | %endif |
||

765 | shl r2d, 2 |
||

766 | shl r3d, 2 |
||

767 | .loop: |
||

768 | LUMA_H_INTRA_LOAD |
||

769 | LUMA_INTRA_INTER t8, t9, t10, t5, t6 |
||

770 | |||

771 | LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11 |
||

772 | mova t3, t6 ; q2 |
||

773 | LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5 |
||

774 | |||

775 | mova m2, t4 |
||

776 | mova m0, t11 |
||

777 | mova m1, t5 |
||

778 | mova m3, t8 |
||

779 | mova m6, t6 |
||

780 | |||

781 | LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7 |
||

782 | |||

783 | lea r0, [r0+r1*(mmsize/2)] |
||

784 | %if mmsize == 8 |
||

785 | dec r5 |
||

786 | %else |
||

787 | lea r4, [r4+r1*(mmsize/2)] |
||

788 | dec r6 |
||

789 | %endif |
||

790 | jg .loop |
||

791 | ADD rsp, pad |
||

792 | RET |
||

793 | %endmacro |
||

794 | |||

795 | %ifndef ARCH_X86_64 |
||

796 | INIT_MMX |
||

797 | DEBLOCK_LUMA mmxext |
||

798 | DEBLOCK_LUMA_INTRA mmxext |
||

799 | INIT_XMM |
||

800 | DEBLOCK_LUMA sse2 |
||

801 | DEBLOCK_LUMA_INTRA sse2 |
||

802 | INIT_AVX |
||

803 | DEBLOCK_LUMA avx |
||

804 | DEBLOCK_LUMA_INTRA avx |
||

805 | %endif |
||

806 | 5705b020 | Jason Garrett-Glaser | |

807 | ; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp |
||

808 | ; out: %1=p0', %2=q0' |
||

809 | %macro CHROMA_DEBLOCK_P0_Q0_INTRA 7 |
||

810 | mova %6, [pw_2] |
||

811 | paddw %6, %3 |
||

812 | paddw %6, %4 |
||

813 | paddw %7, %6, %2 |
||

814 | paddw %6, %1 |
||

815 | paddw %6, %3 |
||

816 | paddw %7, %4 |
||

817 | psraw %6, 2 |
||

818 | psraw %7, 2 |
||

819 | psubw %6, %1 |
||

820 | psubw %7, %2 |
||

821 | pand %6, %5 |
||

822 | pand %7, %5 |
||

823 | paddw %1, %6 |
||

824 | paddw %2, %7 |
||

825 | %endmacro |
||

826 | |||

827 | %macro CHROMA_V_LOAD 1 |
||

828 | mova m0, [r0] ; p1 |
||

829 | mova m1, [r0+r1] ; p0 |
||

830 | mova m2, [%1] ; q0 |
||

831 | mova m3, [%1+r1] ; q1 |
||

832 | %endmacro |
||

833 | |||

834 | %macro CHROMA_V_STORE 0 |
||

835 | mova [r0+1*r1], m1 |
||

836 | mova [r0+2*r1], m2 |
||

837 | %endmacro |
||

838 | |||

839 | %macro DEBLOCK_CHROMA 1 |
||

840 | ;----------------------------------------------------------------------------- |
||

841 | ; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |
||

842 | ;----------------------------------------------------------------------------- |
||

843 | cglobal deblock_v_chroma_10_%1, 5,7-(mmsize/16),8*(mmsize/16) |
||

844 | mov r5, r0 |
||

845 | sub r0, r1 |
||

846 | sub r0, r1 |
||

847 | shl r2d, 2 |
||

848 | shl r3d, 2 |
||

849 | %if mmsize < 16 |
||

850 | mov r6, 16/mmsize |
||

851 | .loop: |
||

852 | %endif |
||

853 | CHROMA_V_LOAD r5 |
||

854 | LOAD_AB m4, m5, r2, r3 |
||

855 | LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 |
||

856 | pxor m4, m4 |
||

857 | LOAD_TC m6, r4 |
||

858 | psubw m6, [pw_3] |
||

859 | pmaxsw m6, m4 |
||

860 | pand m7, m6 |
||

861 | DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 |
||

862 | CHROMA_V_STORE |
||

863 | %if mmsize < 16 |
||

864 | add r0, mmsize |
||

865 | add r5, mmsize |
||

866 | add r4, mmsize/8 |
||

867 | dec r6 |
||

868 | jg .loop |
||

869 | REP_RET |
||

870 | %else |
||

871 | RET |
||

872 | %endif |
||

873 | |||

874 | ;----------------------------------------------------------------------------- |
||

875 | ; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta ) |
||

876 | ;----------------------------------------------------------------------------- |
||

877 | cglobal deblock_v_chroma_intra_10_%1, 4,6-(mmsize/16),8*(mmsize/16) |
||

878 | mov r4, r0 |
||

879 | sub r0, r1 |
||

880 | sub r0, r1 |
||

881 | shl r2d, 2 |
||

882 | shl r3d, 2 |
||

883 | %if mmsize < 16 |
||

884 | mov r5, 16/mmsize |
||

885 | .loop: |
||

886 | %endif |
||

887 | CHROMA_V_LOAD r4 |
||

888 | LOAD_AB m4, m5, r2, r3 |
||

889 | LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 |
||

890 | CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6 |
||

891 | CHROMA_V_STORE |
||

892 | %if mmsize < 16 |
||

893 | add r0, mmsize |
||

894 | add r4, mmsize |
||

895 | dec r5 |
||

896 | jg .loop |
||

897 | REP_RET |
||

898 | %else |
||

899 | RET |
||

900 | %endif |
||

901 | %endmacro |
||

902 | |||

903 | %ifndef ARCH_X86_64 |
||

904 | INIT_MMX |
||

905 | DEBLOCK_CHROMA mmxext |
||

906 | %endif |
||

907 | INIT_XMM |
||

908 | DEBLOCK_CHROMA sse2 |
||

909 | INIT_AVX |
||

910 | DEBLOCK_CHROMA avx |