## ffmpeg / libavcodec / x86 / h264_deblock_10bit.asm @ 5705b020

History | View | Annotate | Download (22.2 KB)

1 |
;***************************************************************************** |
---|---|

2 |
;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code |

3 |
;***************************************************************************** |

4 |
;* Copyright (C) 2005-2011 x264 project |

5 |
;* |

6 |
;* Authors: Oskar Arvidsson <oskar@irock.se> |

7 |
;* Loren Merritt <lorenm@u.washington.edu> |

8 |
;* Jason Garrett-Glaser <darkshikari@gmail.com> |

9 |
;* |

10 |
;* This file is part of Libav. |

11 |
;* |

12 |
;* Libav is free software; you can redistribute it and/or |

13 |
;* modify it under the terms of the GNU Lesser General Public |

14 |
;* License as published by the Free Software Foundation; either |

15 |
;* version 2.1 of the License, or (at your option) any later version. |

16 |
;* |

17 |
;* Libav is distributed in the hope that it will be useful, |

18 |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |

19 |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |

20 |
;* Lesser General Public License for more details. |

21 |
;* |

22 |
;* You should have received a copy of the GNU Lesser General Public |

23 |
;* License along with Libav; if not, write to the Free Software |

24 |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

25 |
;****************************************************************************** |

26 | |

27 |
%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

; 10-bit pixel maximum (1023) splatted across 8 words; used as the upper
; clip bound when writing filtered pixels back.
pw_pixel_max: times 8 dw ((1 << 10)-1)

SECTION .text

; word constants (2, 3, 4 splatted) defined in another object file
cextern pw_2
cextern pw_3
cextern pw_4

39 | |

40 |
; Absolute difference minus threshold, on packed words.
; out: %4 = |%1-%2|-%3
; clobbers: %5
%macro ABS_SUB 5
    psubusw %5, %2, %1      ; max(%2-%1, 0) (unsigned saturation)
    psubusw %4, %1, %2      ; max(%1-%2, 0)
    por     %4, %5          ; one of the two is zero, so por gives |%1-%2|
    psubw   %4, %3
%endmacro

48 | |

49 |
; Per-word compare: is the absolute difference below a threshold?
; out: %4 = |%1-%2|<%3 (all-ones where true)
; clobbers: %5
%macro DIFF_LT 5
    psubusw %4, %2, %1
    psubusw %5, %1, %2
    por     %5, %4          ; |%1-%2|
    pxor    %4, %4
    psubw   %5, %3          ; |%1-%2|-%3
    pcmpgtw %4, %5          ; 0 > |%1-%2|-%3
%endmacro

58 | |

59 |
; Load the scalar alpha (%3) and beta (%4) thresholds and broadcast each
; across every word of %1 / %2.
%macro LOAD_AB 4
    movd    %1, %3
    movd    %2, %4
    SPLATW  %1, %1
    SPLATW  %2, %2
%endmacro

65 | |

66 |
; Load four int8 tc0 values from memory and expand each into two words,
; sign-extended, so each word lane carries the tc threshold for its pixel.
; in:  %2=tc reg (pointer to the int8_t tc0 array)
; out: %1=splatted tc
%macro LOAD_TC 2
    movd        %1, [%2]
    punpcklbw   %1, %1          ; duplicate each byte into a word's two halves
%if mmsize == 8
    pshufw      %1, %1, 0
%else
    pshuflw     %1, %1, 01010000b
    pshufd      %1, %1, 01010000b
%endif
    psraw       %1, 6           ; arithmetic shift sign-extends the high byte copy
%endmacro

79 | |

80 |
; Build the standard H.264 filter-enable mask:
; mask = (|p0-q0| < alpha) && (|p1-p0| < beta) && (|q1-q0| < beta)
; in: %1=p1, %2=p0, %3=q0, %4=q1
;     %5=alpha, %6=beta, %7-%9=tmp
; out: %7=mask (all-ones where the edge should be filtered)
%macro LOAD_MASK 9
    ABS_SUB     %2, %3, %5, %8, %7      ; |p0-q0| - alpha
    ABS_SUB     %1, %2, %6, %9, %7      ; |p1-p0| - beta
    pand        %8, %9
    ABS_SUB     %3, %4, %6, %9, %7      ; |q1-q0| - beta
    pxor        %7, %7
    pand        %8, %9                  ; sign bit set only if all three diffs were below threshold
    pcmpgtw     %7, %8                  ; 0 > combined ==> all conditions hold
%endmacro

92 | |

93 |
; Core normal-strength filter for p0/q0:
; delta = clip((((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tc, tc)
; p0' = clip(p0+delta), q0' = clip(q0-delta), both to [0, pixel_max].
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask (pre-ANDed with tc), %6=tmp, %7=tmp
; out: %1=p0', m2=q0'
%macro DEBLOCK_P0_Q0 7
    psubw   %3, %4              ; p1-q1
    pxor    %7, %7
    paddw   %3, [pw_4]
    psubw   %7, %5              ; %7 = -tc
    psubw   %6, %2, %1          ; q0-p0
    psllw   %6, 2
    paddw   %3, %6              ; 4*(q0-p0) + (p1-q1) + 4
    psraw   %3, 3
    mova    %6, [pw_pixel_max]
    CLIPW   %3, %7, %5          ; clip delta to [-tc, tc]
    pxor    %7, %7
    paddw   %1, %3
    psubw   %2, %3
    CLIPW   %1, %7, %6          ; clamp to valid 10-bit range
    CLIPW   %2, %7, %6
%endmacro

112 | |

113 |
; Filter the inner pixel p1/q1:
; x1' = x1 + clip(((x2 + (p0+q0+1)>>1) >> 1) - x1, -tc, tc)
; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp
%macro LUMA_Q1 6
    pavgw   %6, %3, %4          ; (p0+q0+1)>>1
    paddw   %1, %6
    pxor    %6, %6
    psraw   %1, 1
    psubw   %6, %5              ; %6 = -tc
    psubw   %1, %2              ; candidate delta
    CLIPW   %1, %6, %5          ; clip delta to [-tc, tc]
    paddw   %1, %2              ; add delta back onto x1
%endmacro

124 | |

125 |
; Conditionally filter one side's inner pixel (p1 or q1).
; in: m5=p2/q2, %1=p1/q1, %2=p0/q0, bm=beta, tcm=tc, m7=edge mask
; out: m5=filtered inner pixel, %3=saved |p2-p0|<beta mask (for tc adjust)
%macro LUMA_DEBLOCK_ONE 3
    DIFF_LT m5, %1, bm, m4, m6  ; m4 = |p2-p0| < beta
    pxor    m6, m6
    mova    %3, m4              ; remember the mask for the later tc correction
    pcmpgtw m6, tcm             ; m6 = tc < 0 (filtering disabled for this edge)
    pand    m4, tcm             ; tc where |p2-p0|<beta, else 0
    pandn   m6, m7              ; edge mask, only where tc >= 0
    pand    m4, m6
    LUMA_Q1 m5, %2, m1, m2, m4, m6
%endmacro

135 | |

136 |
; Scatter a transposed 4xN block (m0..m3 = the four columns around the
; vertical edge) back into rows at pix-4.
; in: %1=second-half row base (r2), %2=3*stride
%macro LUMA_H_STORE 2
%if mmsize == 8
    movq    [r0-4], m0
    movq    [r0+r1-4], m1
    movq    [r0+r1*2-4], m2
    movq    [r0+%2-4], m3
%else
    ; SSE: each xmm holds two rows' worth (low/high qwords)
    movq    [r0-4], m0
    movhps  [r0+r1-4], m0
    movq    [r0+r1*2-4], m1
    movhps  [%1-4], m1
    movq    [%1+r1-4], m2
    movhps  [%1+r1*2-4], m2
    movq    [%1+%2-4], m3
    movhps  [%1+r1*4-4], m3
%endif
%endmacro

153 | |

154 |
; Normal (inter) luma deblock, 32-bit / stack-spilling variant.
; Processes a 16-pixel edge in 32/mmsize chunks; alpha/beta/tc and the
; intermediate masks live on the stack because only 8 SIMD regs exist.
%macro DEBLOCK_LUMA 1
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_10_%1, 5,5,8*(mmsize/16)
    ; stack layout: tc, p-side mask, q-side mask, alpha, beta
    %assign pad 5*mmsize+12-(stack_offset&15)
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define am  [rsp+mmsize*3]
    %define bm  [rsp+mmsize*4]
    SUB     rsp, pad
    shl     r2d, 2              ; scale alpha/beta for 10-bit (x4 vs 8-bit)
    shl     r3d, 2
    LOAD_AB m4, m5, r2, r3
    mov     r3, 32/mmsize       ; chunk counter
    mov     r2, r0              ; r2 -> q0 row
    sub     r0, r1
    mova    am, m4
    sub     r0, r1
    mova    bm, m5
    sub     r0, r1              ; r0 -> p2 row
.loop:
    mova    m0, [r0+r1]         ; p1
    mova    m1, [r0+r1*2]       ; p0
    mova    m2, [r2]            ; q0
    mova    m3, [r2+r1]         ; q1

    LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC m6, r4
    mova    tcm, m6

    ; p side: filter p1 using p2
    mova    m5, [r0]
    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova    [r0+r1], m5

    ; q side: filter q1 using q2
    mova    m5, [r2+r1*2]
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova    [r2+r1], m5

    ; tc for p0/q0 is incremented once per inner pixel filtered
    ; (masks are all-ones == -1, so subtracting adds 1 per condition)
    pxor    m5, m5
    mova    m6, tcm
    pcmpgtw m5, tcm
    psubw   m6, ms1
    pandn   m5, m7              ; drop lanes where tc0 was negative
    psubw   m6, ms2
    pand    m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova    [r0+r1*2], m1
    mova    [r2], m2

    add     r0, mmsize
    add     r2, mmsize
    add     r4, mmsize/8        ; advance tc0 pointer (one byte per 4 pixels)
    dec     r3
    jg .loop
    ADD     rsp, pad
    RET

; Horizontal (vertical-edge) variant: loads 4/8 rows, transposes so the
; edge becomes horizontal, runs the same filter, transposes back and stores.
cglobal deblock_h_luma_10_%1, 5,6,8*(mmsize/16)
    %assign pad 7*mmsize+12-(stack_offset&15)
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define p1m [rsp+mmsize*3]
    %define p2m [rsp+mmsize*4]
    %define am  [rsp+mmsize*5]
    %define bm  [rsp+mmsize*6]
    SUB     rsp, pad
    shl     r2d, 2
    shl     r3d, 2
    LOAD_AB m4, m5, r2, r3
    mov     r3, r1
    mova    am, m4
    add     r3, r1
    mov     r5, 32/mmsize
    mova    bm, m5
    add     r3, r1              ; r3 = 3*stride
%if mmsize == 16
    mov     r2, r0
    add     r2, r3              ; r2 = pix + 3*stride (second half of rows)
%endif
.loop:
%if mmsize == 8
    ; MMX: two 4x4 transposes per chunk
    movq    m2, [r0-8]          ; y q2 q1 q0
    movq    m7, [r0+0]
    movq    m5, [r0+r1-8]
    movq    m3, [r0+r1+0]
    movq    m0, [r0+r1*2-8]
    movq    m6, [r0+r1*2+0]
    movq    m1, [r0+r3-8]
    TRANSPOSE4x4W 2, 5, 0, 1, 4
    SWAP    2, 7
    movq    m7, [r0+r3]
    TRANSPOSE4x4W 2, 3, 6, 7, 4
%else
    ; SSE: transpose 8 rows of 8 pixels around the edge
    movu    m5, [r0-8]          ; y q2 q1 q0 p0 p1 p2 x
    movu    m0, [r0+r1-8]
    movu    m2, [r0+r1*2-8]
    movu    m3, [r2-8]
    TRANSPOSE4x4W 5, 0, 2, 3, 6
    mova    tcm, m3             ; spill: tcm slot doubles as transpose scratch here

    movu    m4, [r2+r1-8]
    movu    m1, [r2+r1*2-8]
    movu    m3, [r2+r3-8]
    movu    m7, [r2+r1*4-8]
    TRANSPOSE4x4W 4, 1, 3, 7, 6

    mova    m6, tcm
    punpcklqdq m6, m7           ; p2
    punpckhqdq m5, m4           ; p1... (m5 = inner pixel row)
    SBUTTERFLY qdq, 0, 1, 7     ; m0=p1 m1=p0
    SBUTTERFLY qdq, 2, 3, 7     ; m2=q0 m3=q1
%endif

    mova    p2m, m6
    LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC m6, r4
    mova    tcm, m6

    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova    p1m, m5             ; save filtered p1 for the output transpose

    mova    m5, p2m
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova    p2m, m5             ; save filtered q1

    ; adjust tc for p0/q0 exactly as in the vertical function
    pxor    m5, m5
    mova    m6, tcm
    pcmpgtw m5, tcm
    psubw   m6, ms1
    pandn   m5, m7
    psubw   m6, ms2
    pand    m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova    m0, p1m
    mova    m3, p2m
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r2, r3

    add     r4, mmsize/8
    lea     r0, [r0+r1*(mmsize/2)]
    lea     r2, [r2+r1*(mmsize/2)]
    dec     r5
    jg .loop
    ADD     rsp, pad
    RET
%endmacro

303 | |

304 |
INIT_XMM
%ifdef ARCH_X86_64
; x86-64 inter luma filter core: with 16 xmm regs everything stays in
; registers (no stack spills, unlike the 32-bit DEBLOCK_LUMA path).
; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
;     m12=alpha, m13=beta
; out: m0=p1', m3=q1', m1=p0', m2=q0'
; clobbers: m4, m5, m6, m7, m10, m11, m14
%macro DEBLOCK_LUMA_INTER_SSE2 0
    LOAD_MASK m0, m1, m2, m3, m12, m13, m7, m4, m6
    LOAD_TC m6, r4
    DIFF_LT m8, m1, m13, m10, m4    ; m10 = |p2-p0| < beta
    DIFF_LT m9, m2, m13, m11, m4    ; m11 = |q2-q0| < beta
    pand    m6, m7                  ; tc, masked by the edge condition

    mova    m14, m6
    pxor    m4, m4
    pcmpgtw m6, m4                  ; lanes with tc > 0
    pand    m6, m14

    ; filter p1 where |p2-p0| < beta
    mova    m5, m10
    pand    m5, m6
    LUMA_Q1 m8, m0, m1, m2, m5, m4

    ; filter q1 where |q2-q0| < beta
    mova    m5, m11
    pand    m5, m6
    LUMA_Q1 m9, m3, m1, m2, m5, m4

    ; bump tc by 1 per inner pixel filtered (masks are -1), then p0/q0
    pxor    m4, m4
    psubw   m6, m10
    pcmpgtw m4, m14
    pandn   m4, m7                  ; drop lanes where tc0 was negative
    psubw   m6, m11
    pand    m4, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6

    SWAP 0, 8
    SWAP 3, 9
%endmacro

341 | |

342 |
; x86-64 entry points wrapping DEBLOCK_LUMA_INTER_SSE2; two 8-pixel chunks
; cover the 16-pixel edge.
%macro DEBLOCK_LUMA_64 1
cglobal deblock_v_luma_10_%1, 5,5,15
    %define p2 m8
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define q2 m9
    %define mask0 m7
    %define mask1 m10
    %define mask2 m11
    shl     r2d, 2              ; 10-bit alpha/beta scaling
    shl     r3d, 2
    LOAD_AB m12, m13, r2, r3
    mov     r2, r0              ; r2 -> q0 row
    sub     r0, r1
    sub     r0, r1
    sub     r0, r1              ; r0 -> p2 row
    mov     r3, 2
.loop:
    mova    p2, [r0]
    mova    p1, [r0+r1]
    mova    p0, [r0+r1*2]
    mova    q0, [r2]
    mova    q1, [r2+r1]
    mova    q2, [r2+r1*2]
    DEBLOCK_LUMA_INTER_SSE2
    mova    [r0+r1], p1
    mova    [r0+r1*2], p0
    mova    [r2], q0
    mova    [r2+r1], q1
    add     r0, mmsize
    add     r2, mmsize
    add     r4, 2               ; two tc0 bytes per 8 pixels
    dec     r3
    jg .loop
    REP_RET

; vertical-edge variant: transpose in, filter, transpose out
cglobal deblock_h_luma_10_%1, 5,7,15
    shl     r2d, 2
    shl     r3d, 2
    LOAD_AB m12, m13, r2, r3
    mov     r2, r1
    add     r2, r1
    add     r2, r1              ; r2 = 3*stride
    mov     r5, r0
    add     r5, r2              ; r5 = pix + 3*stride
    mov     r6, 2
.loop:
    movu    m8, [r0-8]          ; y q2 q1 q0 p0 p1 p2 x
    movu    m0, [r0+r1-8]
    movu    m2, [r0+r1*2-8]
    movu    m9, [r5-8]
    movu    m5, [r5+r1-8]
    movu    m1, [r5+r1*2-8]
    movu    m3, [r5+r2-8]
    movu    m7, [r5+r1*4-8]

    TRANSPOSE4x4W 8, 0, 2, 9, 10
    TRANSPOSE4x4W 5, 1, 3, 7, 10

    punpckhqdq m8, m5           ; p2
    SBUTTERFLY qdq, 0, 1, 10    ; m0=p1 m1=p0
    SBUTTERFLY qdq, 2, 3, 10    ; m2=q0 m3=q1
    punpcklqdq m9, m7           ; q2

    DEBLOCK_LUMA_INTER_SSE2

    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r5, r2
    add     r4, 2
    lea     r0, [r0+r1*8]
    lea     r5, [r5+r1*8]
    dec     r6
    jg .loop
    REP_RET
%endmacro

419 | |

420 |
; Instantiate the 64-bit inter luma functions for SSE2 and AVX.
INIT_XMM
DEBLOCK_LUMA_64 sse2
INIT_AVX
DEBLOCK_LUMA_64 avx
%endif

425 | |

426 |
; Move %2 into %1: register destinations use a cost-free SWAP, memory
; destinations fall back to a real store.
%macro SWAPMOVA 2
%ifid %1
    SWAP %1, %2
%else
    mova %1, %2
%endif
%endmacro

433 | |

434 |
; Strong intra filter for one side of the edge: computes p0'/p1'/p2'
; (H.264 8.7.2.3 strong filtering), selecting per-lane between the strong
; results (mask1p lanes) and the weak p0 filter (mask0-only lanes).
; in: t0-t2: tmp registers
;     %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
;     %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
%ifdef ARCH_X86_64
    paddw   t0, %3, %2
    mova    t2, %4
    paddw   t2, %3
%else
    ; no 3-operand form available: copy first
    mova    t0, %3
    mova    t2, %4
    paddw   t0, %2
    paddw   t2, %3
%endif
    paddw   t0, %1
    paddw   t2, t2
    paddw   t0, %5
    paddw   t2, %9
    paddw   t0, %9              ; (p2 + p1 + p0 + q0 + 2)
    paddw   t2, t0              ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)

    psrlw   t2, 3               ; p2' (strong)
    psrlw   t1, t0, 2           ; p1' (strong)
    ; blend strong results with originals under mask1p:
    ; x + ((x' - x) & mask)
    psubw   t2, %3
    psubw   t1, %2
    pand    t2, %8
    pand    t1, %8
    paddw   t2, %3
    paddw   t1, %2
    SWAPMOVA %11, t1

    psubw   t1, t0, %3
    paddw   t0, t0
    psubw   t1, %5
    psubw   t0, %3
    paddw   t1, %6
    paddw   t1, %2
    paddw   t0, %6
    psrlw   t1, 2               ; (2*p1 + p0 + q1 + 2)/4  = p0' (weak)
    psrlw   t0, 3               ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3 = p0' (strong)

    ; xor-select: strong under mask1p, weak under mask0, original otherwise
    pxor    t0, t1
    pxor    t1, %1
    pand    t0, %8
    pand    t1, %7
    pxor    t0, t1
    pxor    t0, %1
    SWAPMOVA %10, t0
    SWAPMOVA %12, t2
%endmacro

484 | |

485 |
; Set up the 32-bit intra path: t0-t3 alias registers, t4.. alias %1 stack
; slots, and reserve an aligned stack area for them.
%macro LUMA_INTRA_INIT 1
    %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
    %assign i 4
%rep %1
    CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]    ; t4, t5, ... -> stack slots
    %assign i i+1
%endrep
    SUB     rsp, pad
%endmacro

498 | |

499 |
; Compute the three masks needed by the intra filter:
; %2 = mask0 (basic filter condition), %1 = mask1p (|p2-p0|<beta and strong
; condition), %3 = mask1q (|q2-q0|<beta and strong condition).
; in: %1-%3=tmp, %4=p2, %5=q2
%macro LUMA_INTRA_INTER 5
    LOAD_AB t0, t1, r2d, r3d
    mova    %1, t0
    LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
%ifdef ARCH_X86_64
    mova    %2, t0              ; mask0
    psrlw   t3, %1, 2
%else
    mova    t3, %1
    mova    %2, t0              ; mask0
    psrlw   t3, 2
%endif
    paddw   t3, [pw_2]          ; alpha/4+2
    DIFF_LT m1, m2, t3, t2, t0  ; t2 = |p0-q0| < alpha/4+2
    pand    t2, %2
    mova    t3, %5              ; q2
    mova    %1, t2              ; mask1
    DIFF_LT t3, m2, t1, t2, t0  ; t2 = |q2-q0| < beta
    pand    t2, %1
    mova    t3, %4              ; p2
    mova    %3, t2              ; mask1q
    DIFF_LT t3, m1, t1, t2, t0  ; t2 = |p2-p0| < beta
    pand    t2, %1
    mova    %1, t2              ; mask1p
%endmacro

525 | |

526 |
; Load and transpose 8 columns around a vertical edge for the intra filter:
; p3..p0 end up in t4,t5,m0,m1 and q0..q3 in m2,m3,t6,t7 (t4-t7 are the
; stack slots set up by LUMA_INTRA_INIT).
%macro LUMA_H_INTRA_LOAD 0
%if mmsize == 8
    movu    t0, [r0-8]
    movu    t1, [r0+r1-8]
    movu    m0, [r0+r1*2-8]
    movu    m1, [r0+r4-8]
    TRANSPOSE4x4W 4, 5, 0, 1, 2
    mova    t4, t0              ; p3
    mova    t5, t1              ; p2

    movu    m2, [r0]
    movu    m3, [r0+r1]
    movu    t0, [r0+r1*2]
    movu    t1, [r0+r4]
    TRANSPOSE4x4W 2, 3, 4, 5, 6
    mova    t6, t0              ; q2
    mova    t7, t1              ; q3
%else
    movu    t0, [r0-8]
    movu    t1, [r0+r1-8]
    movu    m0, [r0+r1*2-8]
    movu    m1, [r0+r5-8]
    movu    m2, [r4-8]
    movu    m3, [r4+r1-8]
    movu    t2, [r4+r1*2-8]
    movu    t3, [r4+r5-8]
    TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5
    mova    t4, t0              ; p3
    mova    t5, t1              ; p2
    mova    t6, t2              ; q2
    mova    t7, t3              ; q3
%endif
%endmacro

559 | |

560 |
; Transpose the 8 filtered columns back and scatter them into rows at pix-8
; (left half) and pix (right half).
; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
%macro LUMA_H_INTRA_STORE 9
%if mmsize == 8
    TRANSPOSE4x4W %1, %2, %3, %4, %9
    movq    [r0-8], m%1
    movq    [r0+r1-8], m%2
    movq    [r0+r1*2-8], m%3
    movq    [r0+r4-8], m%4
    movq    m%1, %8             ; reload p3 into the freed register
    TRANSPOSE4x4W %5, %6, %7, %1, %9
    movq    [r0], m%5
    movq    [r0+r1], m%6
    movq    [r0+r1*2], m%7
    movq    [r0+r4], m%1
%else
    TRANSPOSE2x4x4W %1, %2, %3, %4, %9
    movq    [r0-8], m%1
    movq    [r0+r1-8], m%2
    movq    [r0+r1*2-8], m%3
    movq    [r0+r5-8], m%4
    movhps  [r4-8], m%1
    movhps  [r4+r1-8], m%2
    movhps  [r4+r1*2-8], m%3
    movhps  [r4+r5-8], m%4
%ifnum %8
    SWAP    %1, %8
%else
    mova    m%1, %8
%endif
    TRANSPOSE2x4x4W %5, %6, %7, %1, %9
    movq    [r0], m%5
    movq    [r0+r1], m%6
    movq    [r0+r1*2], m%7
    movq    [r0+r5], m%1
    movhps  [r4], m%5
    movhps  [r4+r1], m%6
    movhps  [r4+r1*2], m%7
    movhps  [r4+r5], m%1
%endif
%endmacro

600 | |

601 |
%ifdef ARCH_X86_64 |

602 |
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
; x86-64 intra luma deblock: with 16 xmm registers the whole 6-pixel
; neighbourhood and all masks stay in registers.
; Fixes vs previous revision: the two `.loop` labels were missing their
; colon (NASM orphan-label form, inconsistent with every other label in
; this file), and the result-register comments on two DIFF_LT lines in the
; vertical function were swapped (DIFF_LT writes its 4th argument).
%macro DEBLOCK_LUMA_INTRA_64 1
cglobal deblock_v_luma_intra_10_%1, 4,7,16
    %define t0 m1
    %define t1 m2
    %define t2 m4
    %define p2 m8
    %define p1 m9
    %define p0 m10
    %define q0 m11
    %define q1 m12
    %define q2 m13
    %define aa m5
    %define bb m14
    lea     r4, [r1*4]
    lea     r5, [r1*3]          ; 3*stride
    neg     r4
    add     r4, r0              ; pix-4*stride
    mov     r6, 2
    mova    m0, [pw_2]
    shl     r2d, 2              ; scale alpha/beta for 10-bit
    shl     r3d, 2
    LOAD_AB aa, bb, r2d, r3d
.loop:
    mova    p2, [r4+r1]
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
    mova    q2, [r0+2*r1]

    LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1
    mova    t2, aa
    psrlw   t2, 2
    paddw   t2, m0              ; alpha/4+2
    DIFF_LT p0, q0, t2, m6, t0  ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT p2, p0, bb, t1, t0  ; t1 = |p2-p0| < beta
    DIFF_LT q2, q0, bb, m7, t0  ; m7 = |q2-q0| < beta
    pand    m6, m3              ; strong condition requires the basic mask too
    pand    m7, m6              ; q-side strong mask
    pand    m6, t1              ; p-side strong mask
    LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
    LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10_%1, 4,7,16
    %define t0 m15
    %define t1 m14
    %define t2 m2
    %define q3 m5
    %define q2 m8
    %define q1 m9
    %define q0 m10
    %define p0 m11
    %define p1 m12
    %define p2 m13
    %define p3 m4
    %define spill [rsp]
    %assign pad 24-(stack_offset&15)
    SUB     rsp, pad
    lea     r4, [r1*4]
    lea     r5, [r1*3]          ; 3*stride
    add     r4, r0              ; pix+4*stride
    mov     r6, 2
    mova    m0, [pw_2]
    shl     r2d, 2
    shl     r3d, 2
.loop:
    movu    q3, [r0-8]
    movu    q2, [r0+r1-8]
    movu    q1, [r0+r1*2-8]
    movu    q0, [r0+r5-8]
    movu    p0, [r4-8]
    movu    p1, [r4+r1-8]
    movu    p2, [r4+r1*2-8]
    movu    p3, [r4+r5-8]
    TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1

    LOAD_AB m1, m2, r2d, r3d
    LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
    psrlw   m1, 2
    paddw   m1, m0              ; alpha/4+2
    DIFF_LT p0, q0, m1, m6, t0  ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT q2, q0, m2, t1, t0  ; t1 = |q2-q0| < beta
    DIFF_LT p0, p2, m2, m7, t0  ; m7 = |p2-p0| < beta
    pand    m6, m3
    pand    m7, m6
    pand    m6, t1

    mova    spill, q3           ; only one value needs to spill
    LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
    LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
    mova    m7, spill

    LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14

    lea     r0, [r0+r1*8]
    lea     r4, [r4+r1*8]
    dec     r6
    jg .loop
    ADD     rsp, pad
    RET
%endmacro

714 | |

715 |
; Instantiate the 64-bit intra luma functions for SSE2 and AVX.
INIT_XMM
DEBLOCK_LUMA_INTRA_64 sse2
INIT_AVX
DEBLOCK_LUMA_INTRA_64 avx

%endif

721 | |

722 |
; Intra luma deblock, 32-bit / stack-spilling variant (8 SIMD regs, extra
; values live in the t4.. stack slots from LUMA_INTRA_INIT).
%macro DEBLOCK_LUMA_INTRA 1
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_intra_10_%1, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 3
    lea     r4, [r1*4]
    lea     r5, [r1*3]
    neg     r4
    add     r4, r0              ; r4 = pix-4*stride
    mov     r6, 32/mmsize
    shl     r2d, 2              ; scale alpha/beta for 10-bit
    shl     r3d, 2
.loop:
    mova    m0, [r4+r1*2]       ; p1
    mova    m1, [r4+r5]         ; p0
    mova    m2, [r0]            ; q0
    mova    m3, [r0+r1]         ; q1
    LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
    LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
    mova    t3, [r0+r1*2]       ; q2
    LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    ADD     rsp, pad
    RET

;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10_%1, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 8
%if mmsize == 8
    lea     r4, [r1*3]
    mov     r5, 32/mmsize
%else
    lea     r4, [r1*4]
    lea     r5, [r1*3]          ; 3*stride
    add     r4, r0              ; pix+4*stride
    mov     r6, 32/mmsize
%endif
    shl     r2d, 2
    shl     r3d, 2
.loop:
    LUMA_H_INTRA_LOAD
    LUMA_INTRA_INTER t8, t9, t10, t5, t6

    LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
    mova    t3, t6              ; q2
    LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5

    ; gather results for the output transpose
    mova    m2, t4
    mova    m0, t11
    mova    m1, t5
    mova    m3, t8
    mova    m6, t6

    LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7

    lea     r0, [r0+r1*(mmsize/2)]
%if mmsize == 8
    dec     r5
%else
    lea     r4, [r4+r1*(mmsize/2)]
    dec     r6
%endif
    jg .loop
    ADD     rsp, pad
    RET
%endmacro

794 | |

795 |
; 32-bit builds get the stack-spilling variants for MMXEXT/SSE2/AVX
; (on x86-64 the register-rich DEBLOCK_LUMA_64 versions are used instead).
%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_LUMA mmxext
DEBLOCK_LUMA_INTRA mmxext
INIT_XMM
DEBLOCK_LUMA sse2
DEBLOCK_LUMA_INTRA sse2
INIT_AVX
DEBLOCK_LUMA avx
DEBLOCK_LUMA_INTRA avx
%endif

806 | |

807 |
; Intra chroma filter for p0/q0:
; p0' = (2*p1 + p0 + q1 + 2) >> 2, q0' = (2*q1 + q0 + p1 + 2) >> 2,
; applied only in masked lanes.
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
    mova    %6, [pw_2]
    paddw   %6, %3
    paddw   %6, %4              ; p1 + q1 + 2
    paddw   %7, %6, %2
    paddw   %6, %1
    paddw   %6, %3              ; 2*p1 + p0 + q1 + 2
    paddw   %7, %4              ; 2*q1 + q0 + p1 + 2
    psraw   %6, 2
    psraw   %7, 2
    ; blend with originals: x + ((x' - x) & mask)
    psubw   %6, %1
    psubw   %7, %2
    pand    %6, %5
    pand    %7, %5
    paddw   %1, %6
    paddw   %2, %7
%endmacro

826 | |

827 |
; Load the four rows around a horizontal chroma edge.
; in: r0 = pix-2*stride, %1 = register pointing at pix (q0 row)
%macro CHROMA_V_LOAD 1
    mova    m0, [r0]            ; p1
    mova    m1, [r0+r1]         ; p0
    mova    m2, [%1]            ; q0
    mova    m3, [%1+r1]         ; q1
%endmacro

833 | |

834 |
; Store the filtered p0 (m1) and q0 (m2) rows; p1/q1 are unchanged for chroma.
%macro CHROMA_V_STORE 0
    mova    [r0+1*r1], m1
    mova    [r0+2*r1], m2
%endmacro

838 | |

839 |
; Chroma deblock (normal and intra). Only p0/q0 are filtered for chroma.
; SSE processes the whole 8-pixel edge in one pass (no loop); MMX loops
; over 16/mmsize chunks.
%macro DEBLOCK_CHROMA 1
;-----------------------------------------------------------------------------
; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_10_%1, 5,7-(mmsize/16),8*(mmsize/16)
    mov     r5, r0              ; r5 -> q0 row
    sub     r0, r1
    sub     r0, r1              ; r0 -> p1 row
    shl     r2d, 2              ; scale alpha/beta for 10-bit
    shl     r3d, 2
%if mmsize < 16
    mov     r6, 16/mmsize
.loop:
%endif
    CHROMA_V_LOAD r5
    LOAD_AB m4, m5, r2, r3
    LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
    pxor    m4, m4
    LOAD_TC m6, r4
    psubw   m6, [pw_3]          ; tc = tc0+1; compensate for LOAD_TC's bias
    pmaxsw  m6, m4              ; clamp negative tc lanes (no filtering) to 0
    pand    m7, m6              ; mask & tc
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
    CHROMA_V_STORE
%if mmsize < 16
    add     r0, mmsize
    add     r5, mmsize
    add     r4, mmsize/8
    dec     r6
    jg .loop
    REP_RET
%else
    RET
%endif

;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_10_%1, 4,6-(mmsize/16),8*(mmsize/16)
    mov     r4, r0
    sub     r0, r1
    sub     r0, r1
    shl     r2d, 2
    shl     r3d, 2
%if mmsize < 16
    mov     r5, 16/mmsize
.loop:
%endif
    CHROMA_V_LOAD r4
    LOAD_AB m4, m5, r2, r3
    LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
    CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
    CHROMA_V_STORE
%if mmsize < 16
    add     r0, mmsize
    add     r4, mmsize
    dec     r5
    jg .loop
    REP_RET
%else
    RET
%endif
%endmacro

902 | |

903 |
; Instantiate chroma functions; the MMXEXT version only exists on 32-bit.
%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_CHROMA mmxext
%endif
INIT_XMM
DEBLOCK_CHROMA sse2
INIT_AVX
DEBLOCK_CHROMA avx