;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

; Eight words, each holding the largest 10-bit sample value ((1 << 10)-1 = 1023).
; DEBLOCK_P0_Q0 loads it and passes it as the last CLIPW operand when
; clamping the filtered p0/q0 samples.
pw_pixel_max: times 8 dw ((1 << 10)-1)

SECTION .text

; Word constants defined in another object file (presumably packed words
; of 2 and 4, going by how they are used as rounding terms below).
cextern pw_2
cextern pw_4

; Absolute difference of two unsigned word vectors, minus a threshold.
; in:  %1, %2 = word vectors, %3 = value subtracted from the difference
; out: %4 = |%1-%2|-%3
; clobbers: %5
%macro ABS_SUB 5
    psubusw %5, %2, %1      ; saturating %2-%1 (zero where %1 >= %2)
    psubusw %4, %1, %2      ; saturating %1-%2 (zero where %2 >= %1)
    por     %4, %5          ; at most one side is non-zero -> |%1-%2|
    psubw   %4, %3
%endmacro

; Per-word comparison of an absolute difference against a threshold.
; in:  %1, %2 = word vectors, %3 = threshold
; out: %4 = |%1-%2|<%3 (pcmpgtw result: all-ones word where true, else zero)
; clobbers: %5
%macro DIFF_LT 5
    psubusw %4, %2, %1
    psubusw %5, %1, %2
    por     %5, %4          ; |%1-%2|
    pxor    %4, %4
    psubw   %5, %3          ; |%1-%2|-%3
    pcmpgtw %4, %5          ; 0 > |%1-%2|-%3
%endmacro

; Move two general-purpose register values into vector registers and
; run SPLATW on each (SPLATW comes from the included utility macros;
; presumably it broadcasts the low word to every lane).
; in:  %3, %4 = source GPRs (alpha and beta at every call site in this file)
; out: %1 = splatted %3, %2 = splatted %4
%macro LOAD_AB 4
    movd    %1, %3
    movd    %2, %4
    SPLATW  %1, %1
    SPLATW  %2, %2
%endmacro

; in:  %2=tc reg (address of the int8 tc0 entries)
; out: %1=splatted tc
;
; punpcklbw of a register with itself turns each byte b into the word
; (b<<8)|b; the final arithmetic shift by 6 therefore keeps the sign of
; the byte and leaves it scaled by 4 (a byte of -1 stays -1).
; With 8-byte registers only the first entry is broadcast; with 16-byte
; registers the first two entries each fill four consecutive words.
%macro LOAD_TC 2
    movd        %1, [%2]
    punpcklbw   %1, %1
%if mmsize == 8
    pshufw      %1, %1, 0
%else
    pshuflw     %1, %1, 01010000b   ; low words  -> 0 0 1 1
    pshufd      %1, %1, 01010000b   ; dwords     -> 0 0 1 1
%endif
    psraw       %1, 6
%endmacro

; Build the per-pixel "filter this edge" mask.
; in:  %1=p1, %2=p0, %3=q0, %4=q1
;      %5=alpha, %6=beta, %7-%9=tmp
; out: %7=mask (all-ones word where all three differences are below
;      their threshold, zero otherwise)
; clobbers: %8, %9
%macro LOAD_MASK 9
    ABS_SUB     %2, %3, %5, %8, %7  ; |p0-q0| - alpha
    ABS_SUB     %1, %2, %6, %9, %7  ; |p1-p0| - beta
    pand        %8, %9
    ABS_SUB     %3, %4, %6, %9, %7  ; |q1-q0| - beta
    pxor        %7, %7
    pand        %8, %9              ; sign bit survives only if all three are negative
    pcmpgtw     %7, %8              ; 0 > combined value
%endmacro

; Filter the two samples adjacent to the edge.
; in:  %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
;      (callers pass the tc-derived clip limit already ANDed with the mask in %5)
; out: %1=p0', %2=q0'
; clobbers: %3, %6, %7
%macro DEBLOCK_P0_Q0 7
    psubw   %3, %4              ; p1-q1
    pxor    %7, %7
    paddw   %3, [pw_4]          ; rounding term
    psubw   %7, %5              ; %7 = -%5 (lower clip bound)
    psubw   %6, %2, %1          ; q0-p0
    psllw   %6, 2
    paddw   %3, %6              ; 4*(q0-p0) + (p1-q1) + pw_4
    psraw   %3, 3
    mova    %6, [pw_pixel_max]
    CLIPW   %3, %7, %5          ; delta clipped against -%5 / %5
    pxor    %7, %7
    paddw   %1, %3              ; p0 + delta
    psubw   %2, %3              ; q0 - delta
    CLIPW   %1, %7, %6          ; clip against 0 / pw_pixel_max
    CLIPW   %2, %7, %6
%endmacro

; Filter the second sample away from the edge (p1 or q1).
; in:  %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp
; out: %1 = x1 + clip( ((x2 + ((p0+q0+1)>>1)) >> 1) - x1, -%5, %5 )
; clobbers: %6
%macro LUMA_Q1 6
    pavgw       %6, %3, %4      ; (p0+q0+1)>>1
    paddw       %1, %6
    pxor        %6, %6
    psraw       %1, 1
    psubw       %6, %5          ; %6 = -%5 (lower clip bound)
    psubw       %1, %2          ; difference relative to x1
    CLIPW       %1, %6, %5
    paddw       %1, %2
%endmacro

; Filter one p1/q1 sample for the x86-32 code path.
; in:  %1 = x0 (p0 or q0), %2 = x1 (p1 or q1), %3 = location that receives
;      the |x2-x0| < beta mask
;      m5 = x2 (p2 or q2), m1 = p0, m2 = q0, m7 = edge mask,
;      bm / tcm = beta and splatted tc operands defined by the caller
; out: m5 = filtered x1, %3 = |x2-x0| < beta mask
; clobbers: m4, m6
%macro LUMA_DEBLOCK_ONE 3
    DIFF_LT     m5, %1, bm, m4, m6  ; m4 = |x2-x0| < beta
    pxor        m6, m6
    mova        %3, m4              ; keep the mask for the p0/q0 stage
    pcmpgtw     m6, tcm             ; m6 = tc < 0
    pand        m4, tcm
    pandn       m6, m7              ; edge mask with tc<0 lanes removed
    pand        m4, m6              ; m4 = tc limited to the lanes being filtered
    LUMA_Q1 m5, %2, m1, m2, m4, m6
%endmacro

; Store the transposed p1' p0' q0' q1' columns back as 4-pixel (8-byte)
; row segments starting 2 pixels left of the edge pointer r0.
; in:  m0-m3 = transposed rows
;      %1 = second row pointer (r0 + 3*stride at the call sites; unused for mmsize 8)
;      %2 = register holding 3*stride at the call sites
;      r0 = pix, r1 = stride
%macro LUMA_H_STORE 2
%if mmsize == 8
    movq        [r0-4], m0
    movq        [r0+r1-4], m1
    movq        [r0+r1*2-4], m2
    movq        [r0+%2-4], m3
%else
    ; each 16-byte register carries two rows: low half, then high half
    movq        [r0-4], m0
    movhps      [r0+r1-4], m0
    movq        [r0+r1*2-4], m1
    movhps      [%1-4], m1
    movq        [%1+r1-4], m2
    movhps      [%1+r1*2-4], m2
    movq        [%1+%2-4], m3
    movhps      [%1+r1*4-4], m3
%endif
%endmacro

; Emits deblock_v_luma_10_<%1> and deblock_h_luma_10_<%1> for the register
; set selected by the preceding INIT_* macro. This variant keeps alpha,
; beta, tc and the intermediate masks in stack slots; it is instantiated
; only when ARCH_X86_64 is not defined.
%macro DEBLOCK_LUMA 1
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_10_%1, 5,5,8*(mmsize/16)
    %assign pad 5*mmsize+12-(stack_offset&15)   ; five vector slots plus alignment padding
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define am  [rsp+mmsize*3]
    %define bm  [rsp+mmsize*4]
    SUB        rsp, pad
    shl        r2d, 2               ; alpha *= 4
    shl        r3d, 2               ; beta  *= 4
    LOAD_AB     m4, m5, r2, r3
    mov         r3, 32/mmsize       ; loop count
    mov         r2, r0              ; r2 = pix (q0 row)
    sub         r0, r1
    mova        am, m4
    sub         r0, r1
    mova        bm, m5
    sub         r0, r1              ; r0 = pix - 3*stride (p2 row)
.loop:
    mova        m0, [r0+r1]         ; p1
    mova        m1, [r0+r1*2]       ; p0
    mova        m2, [r2]            ; q0
    mova        m3, [r2+r1]         ; q1

    LOAD_MASK   m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC     m6, r4
    mova       tcm, m6

    mova        m5, [r0]            ; p2
    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova   [r0+r1], m5              ; p1'

    mova        m5, [r2+r1*2]       ; q2
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova   [r2+r1], m5              ; q1'

    pxor        m5, m5
    mova        m6, tcm
    pcmpgtw     m5, tcm             ; m5 = tc < 0
    psubw       m6, ms1             ; masks are -1 where set, so this adds 1 per side
    pandn       m5, m7              ; edge mask with tc<0 lanes removed
    psubw       m6, ms2
    pand        m5, m6              ; clip limit for the p0/q0 delta
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova [r0+r1*2], m1              ; p0'
    mova      [r2], m2              ; q0'

    add         r0, mmsize
    add         r2, mmsize
    add         r4, mmsize/8        ; next tc0 entries
    dec         r3
    jg .loop
    ADD         rsp, pad
    RET

; Same filter across a vertical edge: rows are loaded around pix,
; transposed, filtered, transposed back and stored with LUMA_H_STORE.
; Takes the same five arguments as deblock_v_luma above.
cglobal deblock_h_luma_10_%1, 5,6,8*(mmsize/16)
    %assign pad 7*mmsize+12-(stack_offset&15)   ; seven vector slots plus alignment padding
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define p1m [rsp+mmsize*3]
    %define p2m [rsp+mmsize*4]
    %define am  [rsp+mmsize*5]
    %define bm  [rsp+mmsize*6]
    SUB        rsp, pad
    shl        r2d, 2               ; alpha *= 4
    shl        r3d, 2               ; beta  *= 4
    LOAD_AB     m4, m5, r2, r3
    mov         r3, r1
    mova        am, m4
    add         r3, r1
    mov         r5, 32/mmsize       ; loop count
    mova        bm, m5
    add         r3, r1              ; r3 = 3*stride
%if mmsize == 16
    mov         r2, r0
    add         r2, r3              ; r2 = pix + 3*stride
%endif
.loop:
%if mmsize == 8
    movq        m2, [r0-8]     ; y q2 q1 q0
    movq        m7, [r0+0]
    movq        m5, [r0+r1-8]
    movq        m3, [r0+r1+0]
    movq        m0, [r0+r1*2-8]
    movq        m6, [r0+r1*2+0]
    movq        m1, [r0+r3-8]
    TRANSPOSE4x4W 2, 5, 0, 1, 4
    SWAP         2, 7
    movq        m7, [r0+r3]
    TRANSPOSE4x4W 2, 3, 6, 7, 4
%else
    movu        m5, [r0-8]     ; y q2 q1 q0 p0 p1 p2 x
    movu        m0, [r0+r1-8]
    movu        m2, [r0+r1*2-8]
    movu        m3, [r2-8]
    TRANSPOSE4x4W 5, 0, 2, 3, 6
    mova       tcm, m3              ; tcm doubles as scratch until LOAD_TC below

    movu        m4, [r2+r1-8]
    movu        m1, [r2+r1*2-8]
    movu        m3, [r2+r3-8]
    movu        m7, [r2+r1*4-8]
    TRANSPOSE4x4W 4, 1, 3, 7, 6

    mova        m6, tcm
    punpcklqdq  m6, m7
    punpckhqdq  m5, m4
    SBUTTERFLY qdq, 0, 1, 7
    SBUTTERFLY qdq, 2, 3, 7
%endif

    mova       p2m, m6              ; park m6; it is reloaded as the x2 input of the second pass
    LOAD_MASK   m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC     m6, r4
    mova       tcm, m6

    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova       p1m, m5              ; result of the first pass

    mova        m5, p2m
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova       p2m, m5              ; result of the second pass (slot reused)

    pxor        m5, m5
    mova        m6, tcm
    pcmpgtw     m5, tcm             ; m5 = tc < 0
    psubw       m6, ms1
    pandn       m5, m7
    psubw       m6, ms2
    pand        m5, m6              ; clip limit for the p0/q0 delta
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova        m0, p1m
    mova        m3, p2m
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r2, r3

    add         r4, mmsize/8        ; next tc0 entries
    lea         r0, [r0+r1*(mmsize/2)]
    lea         r2, [r2+r1*(mmsize/2)]
    dec         r5
    jg .loop
    ADD        rsp, pad
    RET
%endmacro

INIT_XMM
%ifdef ARCH_X86_64
; Register-only luma filter core shared by the x86-64 v and h functions.
; in:  m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
;      m12=alpha, m13=beta
;      r4 = address of the tc0 entries (read by LOAD_TC)
; out: m0=p1', m3=q1', m1=p0', m2=q0'
; clobbers: m4, m5, m6, m7, m10, m11, m14
%macro DEBLOCK_LUMA_INTER_SSE2 0
    LOAD_MASK   m0, m1, m2, m3, m12, m13, m7, m4, m6
    LOAD_TC     m6, r4
    DIFF_LT     m8, m1, m13, m10, m4    ; m10 = |p2-p0| < beta
    DIFF_LT     m9, m2, m13, m11, m4    ; m11 = |q2-q0| < beta
    pand        m6, m7                  ; tc limited to the edge mask

    mova       m14, m6
    pxor        m4, m4
    pcmpgtw     m6, m4                  ; tc > 0
    pand        m6, m14                 ; keep tc only where it is positive

    mova        m5, m10
    pand        m5, m6
    LUMA_Q1 m8, m0, m1, m2, m5, m4      ; m8 = p1'

    mova        m5, m11
    pand        m5, m6
    LUMA_Q1 m9, m3, m1, m2, m5, m4      ; m9 = q1'

    pxor        m4, m4
    psubw       m6, m10                 ; masks are -1 where set, so this adds 1 per side
    pcmpgtw     m4, m14                 ; m4 = masked tc < 0
    pandn       m4, m7
    psubw       m6, m11
    pand        m4, m6                  ; clip limit for the p0/q0 delta
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6

    SWAP         0, 8                   ; m0 now names p1'
    SWAP         3, 9                   ; m3 now names q1'
%endmacro

; Emits the x86-64 deblock_v_luma_10_<%1> and deblock_h_luma_10_<%1>.
; Both take ( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
; in r0-r4 and run two loop iterations of one 16-byte vector each.
%macro DEBLOCK_LUMA_64 1
cglobal deblock_v_luma_10_%1, 5,5,15
    %define p2 m8
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define q2 m9
    %define mask0 m7
    %define mask1 m10
    %define mask2 m11
    shl        r2d, 2               ; alpha *= 4
    shl        r3d, 2               ; beta  *= 4
    LOAD_AB    m12, m13, r2, r3
    mov         r2, r0              ; r2 = pix (q0 row)
    sub         r0, r1
    sub         r0, r1
    sub         r0, r1              ; r0 = pix - 3*stride (p2 row)
    mov         r3, 2               ; loop count
.loop:
    mova        p2, [r0]
    mova        p1, [r0+r1]
    mova        p0, [r0+r1*2]
    mova        q0, [r2]
    mova        q1, [r2+r1]
    mova        q2, [r2+r1*2]
    DEBLOCK_LUMA_INTER_SSE2
    mova   [r0+r1], p1
    mova [r0+r1*2], p0
    mova      [r2], q0
    mova   [r2+r1], q1
    add         r0, mmsize
    add         r2, mmsize
    add         r4, 2               ; two tc0 entries consumed per iteration
    dec         r3
    jg .loop
    REP_RET

cglobal deblock_h_luma_10_%1, 5,7,15
    shl        r2d, 2               ; alpha *= 4
    shl        r3d, 2               ; beta  *= 4
    LOAD_AB    m12, m13, r2, r3
    mov         r2, r1
    add         r2, r1
    add         r2, r1              ; r2 = 3*stride
    mov         r5, r0
    add         r5, r2              ; r5 = pix + 3*stride
    mov         r6, 2               ; loop count
.loop:
    movu        m8, [r0-8]     ; y q2 q1 q0 p0 p1 p2 x
    movu        m0, [r0+r1-8]
    movu        m2, [r0+r1*2-8]
    movu        m9, [r5-8]
    movu        m5, [r5+r1-8]
    movu        m1, [r5+r1*2-8]
    movu        m3, [r5+r2-8]
    movu        m7, [r5+r1*4-8]

    TRANSPOSE4x4W 8, 0, 2, 9, 10
    TRANSPOSE4x4W 5, 1, 3, 7, 10

    punpckhqdq  m8, m5
    SBUTTERFLY qdq, 0, 1, 10
    SBUTTERFLY qdq, 2, 3, 10
    punpcklqdq  m9, m7

    DEBLOCK_LUMA_INTER_SSE2

    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r5, r2
    add         r4, 2               ; two tc0 entries consumed per iteration
    lea         r0, [r0+r1*8]       ; advance eight rows
    lea         r5, [r5+r1*8]
    dec         r6
    jg .loop
    REP_RET
%endmacro

INIT_XMM
DEBLOCK_LUMA_64 sse2
INIT_AVX
DEBLOCK_LUMA_64 avx
%endif

; "Move" %2 into %1: when %1 is an identifier (a register name) the two
; names are simply exchanged with SWAP and no instruction is emitted;
; otherwise (e.g. a memory operand) a real mova is emitted.
%macro SWAPMOVA 2
 %ifid %1
    SWAP %1, %2
 %else
    mova %1, %2
 %endif
%endmacro

; Intra (strong) filter for one side of the edge.
; in: t0-t2: tmp registers
;     %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
;     %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
; %10-%12 are destinations: register names are rebound via SWAPMOVA,
; anything else is written with mova.
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
%ifdef ARCH_X86_64
    paddw     t0, %3, %2
    mova      t2, %4
    paddw     t2, %3
%else
    mova      t0, %3
    mova      t2, %4
    paddw     t0, %2
    paddw     t2, %3
%endif
    paddw     t0, %1
    paddw     t2, t2
    paddw     t0, %5
    paddw     t2, %9
    paddw     t0, %9    ; (p2 + p1 + p0 + q0 + 2)
    paddw     t2, t0    ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)

    psrlw     t2, 3
    psrlw     t1, t0, 2
    psubw     t2, %3
    psubw     t1, %2
    pand      t2, %8    ; keep the change only where mask1p is set
    pand      t1, %8
    paddw     t2, %3    ; p2' (or the original p2 where masked off)
    paddw     t1, %2    ; p1' (or the original p1 where masked off)
    SWAPMOVA %11, t1

    psubw     t1, t0, %3
    paddw     t0, t0
    psubw     t1, %5
    psubw     t0, %3
    paddw     t1, %6
    paddw     t1, %2
    paddw     t0, %6
    psrlw     t1, 2     ; (2*p1 + p0 + q1 + 2)/4
    psrlw     t0, 3     ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3

    ; xor-based select: t0 where mask1p, else t1 where mask0, else %1
    pxor      t0, t1
    pxor      t1, %1
    pand      t0, %8
    pand      t1, %7
    pxor      t0, t1
    pxor      t0, %1
    SWAPMOVA %10, t0
    SWAPMOVA %12, t2
%endmacro

; Set up the temporaries for the stack-based intra functions.
; in: %1 = number of stack slots to reserve
; Defines t0-t3 as m4-m7, defines t4..t(3+%1) as consecutive mmsize-sized
; slots starting at [rsp], defines `pad` and subtracts it from rsp.
; The caller must undo this with `ADD rsp, pad`.
%macro LUMA_INTRA_INIT 1
    %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
    %assign i 4
%rep %1
    CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
    %assign i i+1
%endrep
    SUB    rsp, pad
%endmacro

; Compute the three intra filter masks for the stack-based code path.
; in:  %1-%3=tmp, %4=p2, %5=q2
;      m0=p1, m1=p0, m2=q0, m3=q1, r2d=alpha, r3d=beta
; out: %1=mask1p, %2=mask0, %3=mask1q
; clobbers: t0-t3
%macro LUMA_INTRA_INTER 5
    LOAD_AB t0, t1, r2d, r3d
    mova    %1, t0        ; alpha
    LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
%ifdef ARCH_X86_64
    mova    %2, t0        ; mask0
    psrlw   t3, %1, 2
%else
    mova    t3, %1
    mova    %2, t0        ; mask0
    psrlw   t3, 2
%endif
    paddw   t3, [pw_2]    ; alpha/4+2
    DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
    pand    t2, %2
    mova    t3, %5        ; q2
    mova    %1, t2        ; mask1
    DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta
    pand    t2, %1
    mova    t3, %4        ; p2
    mova    %3, t2        ; mask1q
    DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta
    pand    t2, %1
    mova    %1, t2        ; mask1p
%endmacro

; Load and transpose the pixels around a vertical edge for the
; stack-based intra function.
; in:  r0 = pix, r1 = stride
;      mmsize 8:  r4 = 3*stride
;      mmsize 16: r4 = pix + 4*stride, r5 = 3*stride
;      (register roles as set up by deblock_h_luma_intra in DEBLOCK_LUMA_INTRA)
; out: m0-m3 = the four middle columns, t4=p3, t5=p2, t6=q2, t7=q3 (stack slots)
%macro LUMA_H_INTRA_LOAD 0
%if mmsize == 8
    movu    t0, [r0-8]
    movu    t1, [r0+r1-8]
    movu    m0, [r0+r1*2-8]
    movu    m1, [r0+r4-8]
    TRANSPOSE4x4W 4, 5, 0, 1, 2
    mova    t4, t0        ; p3
    mova    t5, t1        ; p2

    movu    m2, [r0]
    movu    m3, [r0+r1]
    movu    t0, [r0+r1*2]
    movu    t1, [r0+r4]
    TRANSPOSE4x4W 2, 3, 4, 5, 6
    mova    t6, t0        ; q2
    mova    t7, t1        ; q3
%else
    movu    t0, [r0-8]
    movu    t1, [r0+r1-8]
    movu    m0, [r0+r1*2-8]
    movu    m1, [r0+r5-8]
    movu    m2, [r4-8]
    movu    m3, [r4+r1-8]
    movu    t2, [r4+r1*2-8]
    movu    t3, [r4+r5-8]
    TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5
    mova    t4, t0        ; p3
    mova    t5, t1        ; p2
    mova    t6, t2        ; q2
    mova    t7, t3        ; q3
%endif
%endmacro

; Transpose the filtered columns back and store 8 pixels per row.
; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
; %1-%7 and %9 are register numbers (used as m%1 ...); %8 is either a
; register number or a memory operand.
; Address registers: r0 = pix, r1 = stride;
;   mmsize 8:  r4 = 3*stride
;   mmsize 16: r4 = pix + 4*stride, r5 = 3*stride
%macro LUMA_H_INTRA_STORE 9
%if mmsize == 8
    TRANSPOSE4x4W %1, %2, %3, %4, %9
    movq       [r0-8], m%1
    movq       [r0+r1-8], m%2
    movq       [r0+r1*2-8], m%3
    movq       [r0+r4-8], m%4
    movq       m%1, %8
    TRANSPOSE4x4W %5, %6, %7, %1, %9
    movq       [r0], m%5
    movq       [r0+r1], m%6
    movq       [r0+r1*2], m%7
    movq       [r0+r4], m%1
%else
    TRANSPOSE2x4x4W %1, %2, %3, %4, %9
    movq       [r0-8], m%1
    movq       [r0+r1-8], m%2
    movq       [r0+r1*2-8], m%3
    movq       [r0+r5-8], m%4
    movhps     [r4-8], m%1
    movhps     [r4+r1-8], m%2
    movhps     [r4+r1*2-8], m%3
    movhps     [r4+r5-8], m%4
%ifnum %8
    SWAP       %1, %8
%else
    mova       m%1, %8
%endif
    TRANSPOSE2x4x4W %5, %6, %7, %1, %9
    movq       [r0], m%5
    movq       [r0+r1], m%6
    movq       [r0+r1*2], m%7
    movq       [r0+r5], m%1
    movhps     [r4], m%5
    movhps     [r4+r1], m%6
    movhps     [r4+r1*2], m%7
    movhps     [r4+r5], m%1
%endif
%endmacro

%ifdef ARCH_X86_64
; Emits the x86-64 intra luma deblocking functions with suffix %1.
; Both run two loop iterations of one 16-byte vector each.
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA_INTRA_64 1
cglobal deblock_v_luma_intra_10_%1, 4,7,16
    %define t0 m1
    %define t1 m2
    %define t2 m4
    %define p2 m8
    %define p1 m9
    %define p0 m10
    %define q0 m11
    %define q1 m12
    %define q2 m13
    %define aa m5
    %define bb m14
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    neg     r4
    add     r4, r0     ; pix-4*stride
    mov     r6, 2      ; loop count
    mova    m0, [pw_2]
    shl    r2d, 2      ; alpha *= 4
    shl    r3d, 2      ; beta  *= 4
    LOAD_AB aa, bb, r2d, r3d
.loop:
    mova    p2, [r4+r1]
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
    mova    q2, [r0+2*r1]

    LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1
    mova    t2, aa
    psrlw   t2, 2
    paddw   t2, m0 ; alpha/4+2
    DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT p2, p0, bb, t1, t0 ; t1 = |p2-p0| < beta
    DIFF_LT q2, q0, bb, m7, t0 ; m7 = |q2-q0| < beta
    pand    m6, m3
    pand    m7, m6     ; mask for the q side
    pand    m6, t1     ; mask for the p side
    LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
    LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10_%1, 4,7,16
    %define t0 m15
    %define t1 m14
    %define t2 m2
    %define q3 m5
    %define q2 m8
    %define q1 m9
    %define q0 m10
    %define p0 m11
    %define p1 m12
    %define p2 m13
    %define p3 m4
    %define spill [rsp]
    %assign pad 24-(stack_offset&15)    ; one 16-byte spill slot plus alignment padding
    SUB     rsp, pad
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    add     r4, r0     ; pix+4*stride
    mov     r6, 2      ; loop count
    mova    m0, [pw_2]
    shl    r2d, 2      ; alpha *= 4
    shl    r3d, 2      ; beta  *= 4
.loop:
    movu    q3, [r0-8]
    movu    q2, [r0+r1-8]
    movu    q1, [r0+r1*2-8]
    movu    q0, [r0+r5-8]
    movu    p0, [r4-8]
    movu    p1, [r4+r1-8]
    movu    p2, [r4+r1*2-8]
    movu    p3, [r4+r5-8]
    TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1

    LOAD_AB m1, m2, r2d, r3d
    LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
    psrlw   m1, 2
    paddw   m1, m0 ; alpha/4+2
    DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta
    DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta
    pand    m6, m3
    pand    m7, m6
    pand    m6, t1

    mova spill, q3     ; saved across the two filter passes, reloaded into m7 below
    LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
    LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
    mova    m7, spill

    LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14

    lea     r0, [r0+r1*8]   ; advance eight rows
    lea     r4, [r4+r1*8]
    dec     r6
    jg .loop
    ADD    rsp, pad
    RET
%endmacro

INIT_XMM
DEBLOCK_LUMA_INTRA_64 sse2
INIT_AVX
DEBLOCK_LUMA_INTRA_64 avx

%endif

; Emits the stack-based intra luma deblocking functions with suffix %1
; (instantiated only when ARCH_X86_64 is not defined). Temporaries t4 and
; up live in stack slots set up by LUMA_INTRA_INIT.
%macro DEBLOCK_LUMA_INTRA 1
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_intra_10_%1, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 3       ; t4-t6 on the stack
    lea     r4, [r1*4]
    lea     r5, [r1*3]      ; 3*stride
    neg     r4
    add     r4, r0          ; pix-4*stride
    mov     r6, 32/mmsize   ; loop count
    shl    r2d, 2           ; alpha *= 4
    shl    r3d, 2           ; beta  *= 4
.loop:
    mova    m0, [r4+r1*2] ; p1
    mova    m1, [r4+r5]   ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
    LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
    mova    t3, [r0+r1*2] ; q2
    LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    ADD    rsp, pad
    RET

;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10_%1, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 8       ; t4-t11 on the stack
%if mmsize == 8
    lea     r4, [r1*3]      ; 3*stride
    mov     r5, 32/mmsize   ; loop count
%else
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    add     r4, r0     ; pix+4*stride
    mov     r6, 32/mmsize   ; loop count
%endif
    shl    r2d, 2           ; alpha *= 4
    shl    r3d, 2           ; beta  *= 4
.loop:
    LUMA_H_INTRA_LOAD
    LUMA_INTRA_INTER t8, t9, t10, t5, t6

    LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
    mova    t3, t6     ; q2
    LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5

    ; reload the values parked in stack slots into the registers
    ; expected by LUMA_H_INTRA_STORE
    mova    m2, t4
    mova    m0, t11
    mova    m1, t5
    mova    m3, t8
    mova    m6, t6

    LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7

    lea     r0, [r0+r1*(mmsize/2)]
%if mmsize == 8
    dec     r5
%else
    lea     r4, [r4+r1*(mmsize/2)]
    dec     r6
%endif
    jg .loop
    ADD    rsp, pad
    RET
%endmacro

; The stack-based variants are built only when ARCH_X86_64 is not defined,
; once per register set / instruction-set suffix.
%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_LUMA mmxext
DEBLOCK_LUMA_INTRA mmxext
INIT_XMM
DEBLOCK_LUMA sse2
DEBLOCK_LUMA_INTRA sse2
INIT_AVX
DEBLOCK_LUMA avx
DEBLOCK_LUMA_INTRA avx
%endif