## ffmpeg / libavcodec / x86 / h264_deblock.asm @ 9f3d6ca4

History | View | Annotate | Download (23.5 KB)

1 |
;***************************************************************************** |
---|---|

2 |
;* MMX/SSE2/AVX-optimized H.264 deblocking code |

3 |
;***************************************************************************** |

4 |
;* Copyright (C) 2005-2011 x264 project |

5 |
;* |

6 |
;* Authors: Loren Merritt <lorenm@u.washington.edu> |

7 |
;* Jason Garrett-Glaser <darkshikari@gmail.com> |

8 |
;* Oskar Arvidsson <oskar@irock.se> |

9 |
;* |

10 |
;* This file is part of Libav. |

11 |
;* |

12 |
;* Libav is free software; you can redistribute it and/or |

13 |
;* modify it under the terms of the GNU Lesser General Public |

14 |
;* License as published by the Free Software Foundation; either |

15 |
;* version 2.1 of the License, or (at your option) any later version. |

16 |
;* |

17 |
;* Libav is distributed in the hope that it will be useful, |

18 |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |

19 |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |

20 |
;* Lesser General Public License for more details. |

21 |
;* |

22 |
;* You should have received a copy of the GNU Lesser General Public |

23 |
;* License along with Libav; if not, write to the Free Software |

24 |
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

25 |
;****************************************************************************** |

26 | |

27 |
%include "x86inc.asm" |

28 |
%include "x86util.asm" |

29 | |

30 |
SECTION .text |

31 | |

32 |
cextern pb_0 |

33 |
cextern pb_1 |

34 |
cextern pb_3 |

35 |
cextern pb_A1 |

36 | |

37 |
; expands to [base],...,[base+7*stride] |

38 |
%define PASS8ROWS(base, base3, stride, stride3) \ |

39 |
[base], [base+stride], [base+stride*2], [base3], \ |

40 |
[base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] |

41 | |

42 |
%define PASS8ROWS(base, base3, stride, stride3, offset) \ |

43 |
PASS8ROWS(base+offset, base3+offset, stride, stride3) |

44 | |

45 |
; in: 8 rows of 4 bytes in %4..%11 |

46 |
; out: 4 rows of 8 bytes in m0..m3 |

47 |
%macro TRANSPOSE4x8_LOAD 11 |

48 |
movh m0, %4 |

49 |
movh m2, %5 |

50 |
movh m1, %6 |

51 |
movh m3, %7 |

52 |
punpckl%1 m0, m2 |

53 |
punpckl%1 m1, m3 |

54 |
mova m2, m0 |

55 |
punpckl%2 m0, m1 |

56 |
punpckh%2 m2, m1 |

57 | |

58 |
movh m4, %8 |

59 |
movh m6, %9 |

60 |
movh m5, %10 |

61 |
movh m7, %11 |

62 |
punpckl%1 m4, m6 |

63 |
punpckl%1 m5, m7 |

64 |
mova m6, m4 |

65 |
punpckl%2 m4, m5 |

66 |
punpckh%2 m6, m5 |

67 | |

68 |
punpckh%3 m1, m0, m4 |

69 |
punpckh%3 m3, m2, m6 |

70 |
punpckl%3 m0, m4 |

71 |
punpckl%3 m2, m6 |

72 |
%endmacro |

73 | |

74 |
; in: 4 rows of 8 bytes in m0..m3 |

75 |
; out: 8 rows of 4 bytes in %1..%8 |

76 |
%macro TRANSPOSE8x4B_STORE 8 |

77 |
punpckhdq m4, m0, m0 |

78 |
punpckhdq m5, m1, m1 |

79 |
punpckhdq m6, m2, m2 |

80 | |

81 |
punpcklbw m0, m1 |

82 |
punpcklbw m2, m3 |

83 |
punpcklwd m1, m0, m2 |

84 |
punpckhwd m0, m2 |

85 |
movh %1, m1 |

86 |
punpckhdq m1, m1 |

87 |
movh %2, m1 |

88 |
movh %3, m0 |

89 |
punpckhdq m0, m0 |

90 |
movh %4, m0 |

91 | |

92 |
punpckhdq m3, m3 |

93 |
punpcklbw m4, m5 |

94 |
punpcklbw m6, m3 |

95 |
punpcklwd m5, m4, m6 |

96 |
punpckhwd m4, m6 |

97 |
movh %5, m5 |

98 |
punpckhdq m5, m5 |

99 |
movh %6, m5 |

100 |
movh %7, m4 |

101 |
punpckhdq m4, m4 |

102 |
movh %8, m4 |

103 |
%endmacro |

104 | |

105 |
%macro TRANSPOSE4x8B_LOAD 8 |

106 |
TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8 |

107 |
%endmacro |

108 | |

109 |
%macro TRANSPOSE4x8W_LOAD 8 |

110 |
%if mmsize==16 |

111 |
TRANSPOSE4x8_LOAD wd, dq, qdq, %1, %2, %3, %4, %5, %6, %7, %8 |

112 |
%else |

113 |
SWAP 1, 4, 2, 3 |

114 |
mova m0, [t5] |

115 |
mova m1, [t5+r1] |

116 |
mova m2, [t5+r1*2] |

117 |
mova m3, [t5+t6] |

118 |
TRANSPOSE4x4W 0, 1, 2, 3, 4 |

119 |
%endif |

120 |
%endmacro |

121 | |

122 |
%macro TRANSPOSE8x2W_STORE 8 |

123 |
punpckhwd m0, m1, m2 |

124 |
punpcklwd m1, m2 |

125 |
%if mmsize==8 |

126 |
movd %3, m0 |

127 |
movd %1, m1 |

128 |
psrlq m1, 32 |

129 |
psrlq m0, 32 |

130 |
movd %2, m1 |

131 |
movd %4, m0 |

132 |
%else |

133 |
movd %5, m0 |

134 |
movd %1, m1 |

135 |
psrldq m1, 4 |

136 |
psrldq m0, 4 |

137 |
movd %2, m1 |

138 |
movd %6, m0 |

139 |
psrldq m1, 4 |

140 |
psrldq m0, 4 |

141 |
movd %3, m1 |

142 |
movd %7, m0 |

143 |
psrldq m1, 4 |

144 |
psrldq m0, 4 |

145 |
movd %4, m1 |

146 |
movd %8, m0 |

147 |
%endif |

148 |
%endmacro |

149 | |

150 |
%macro SBUTTERFLY3 4 |

151 |
punpckh%1 %4, %2, %3 |

152 |
punpckl%1 %2, %3 |

153 |
%endmacro |

154 | |

155 |
; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8 |

156 |
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16] |

157 |
%macro TRANSPOSE6x8_MEM 9 |

158 |
RESET_MM_PERMUTATION |

159 |
movq m0, %1 |

160 |
movq m1, %2 |

161 |
movq m2, %3 |

162 |
movq m3, %4 |

163 |
movq m4, %5 |

164 |
movq m5, %6 |

165 |
movq m6, %7 |

166 |
SBUTTERFLY bw, 0, 1, 7 |

167 |
SBUTTERFLY bw, 2, 3, 7 |

168 |
SBUTTERFLY bw, 4, 5, 7 |

169 |
movq [%9+0x10], m3 |

170 |
SBUTTERFLY3 bw, m6, %8, m7 |

171 |
SBUTTERFLY wd, 0, 2, 3 |

172 |
SBUTTERFLY wd, 4, 6, 3 |

173 |
punpckhdq m0, m4 |

174 |
movq [%9+0x00], m0 |

175 |
SBUTTERFLY3 wd, m1, [%9+0x10], m3 |

176 |
SBUTTERFLY wd, 5, 7, 0 |

177 |
SBUTTERFLY dq, 1, 5, 0 |

178 |
SBUTTERFLY dq, 2, 6, 0 |

179 |
punpckldq m3, m7 |

180 |
movq [%9+0x10], m2 |

181 |
movq [%9+0x20], m6 |

182 |
movq [%9+0x30], m1 |

183 |
movq [%9+0x40], m5 |

184 |
movq [%9+0x50], m3 |

185 |
RESET_MM_PERMUTATION |

186 |
%endmacro |

187 | |

188 |
; in: 8 rows of 8 in %1..%8 |

189 |
; out: 8 rows of 8 in %9..%16 |

190 |
%macro TRANSPOSE8x8_MEM 16 |

191 |
RESET_MM_PERMUTATION |

192 |
movq m0, %1 |

193 |
movq m1, %2 |

194 |
movq m2, %3 |

195 |
movq m3, %4 |

196 |
movq m4, %5 |

197 |
movq m5, %6 |

198 |
movq m6, %7 |

199 |
SBUTTERFLY bw, 0, 1, 7 |

200 |
SBUTTERFLY bw, 2, 3, 7 |

201 |
SBUTTERFLY bw, 4, 5, 7 |

202 |
SBUTTERFLY3 bw, m6, %8, m7 |

203 |
movq %9, m5 |

204 |
SBUTTERFLY wd, 0, 2, 5 |

205 |
SBUTTERFLY wd, 4, 6, 5 |

206 |
SBUTTERFLY wd, 1, 3, 5 |

207 |
movq %11, m6 |

208 |
movq m6, %9 |

209 |
SBUTTERFLY wd, 6, 7, 5 |

210 |
SBUTTERFLY dq, 0, 4, 5 |

211 |
SBUTTERFLY dq, 1, 6, 5 |

212 |
movq %9, m0 |

213 |
movq %10, m4 |

214 |
movq %13, m1 |

215 |
movq %14, m6 |

216 |
SBUTTERFLY3 dq, m2, %11, m0 |

217 |
SBUTTERFLY dq, 3, 7, 4 |

218 |
movq %11, m2 |

219 |
movq %12, m0 |

220 |
movq %15, m3 |

221 |
movq %16, m7 |

222 |
RESET_MM_PERMUTATION |

223 |
%endmacro |

224 | |

225 |
; out: %4 = |%1-%2|>%3 |

226 |
; clobbers: %5 |

227 |
%macro DIFF_GT 5 |

228 |
%if avx_enabled == 0 |

229 |
mova %5, %2 |

230 |
mova %4, %1 |

231 |
psubusb %5, %1 |

232 |
psubusb %4, %2 |

233 |
%else |

234 |
psubusb %5, %2, %1 |

235 |
psubusb %4, %1, %2 |

236 |
%endif |

237 |
por %4, %5 |

238 |
psubusb %4, %3 |

239 |
%endmacro |

240 | |

241 |
; out: %4 = |%1-%2|>%3 |

242 |
; clobbers: %5 |

243 |
%macro DIFF_GT2 5 |

244 |
%ifdef ARCH_X86_64 |

245 |
psubusb %5, %2, %1 |

246 |
psubusb %4, %1, %2 |

247 |
%else |

248 |
mova %5, %2 |

249 |
mova %4, %1 |

250 |
psubusb %5, %1 |

251 |
psubusb %4, %2 |

252 |
%endif |

253 |
psubusb %5, %3 |

254 |
psubusb %4, %3 |

255 |
pcmpeqb %4, %5 |

256 |
%endmacro |

257 | |

258 |
; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1 |

259 |
; out: m5=beta-1, m7=mask, %3=alpha-1 |

260 |
; clobbers: m4,m6 |

261 |
%macro LOAD_MASK 2-3 |

262 |
movd m4, %1 |

263 |
movd m5, %2 |

264 |
SPLATW m4, m4 |

265 |
SPLATW m5, m5 |

266 |
packuswb m4, m4 ; 16x alpha-1 |

267 |
packuswb m5, m5 ; 16x beta-1 |

268 |
%if %0>2 |

269 |
mova %3, m4 |

270 |
%endif |

271 |
DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1 |

272 |
DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1 |

273 |
por m7, m4 |

274 |
DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1 |

275 |
por m7, m4 |

276 |
pxor m6, m6 |

277 |
pcmpeqb m7, m6 |

278 |
%endmacro |

279 | |

280 |
; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask) |

281 |
; out: m1=p0' m2=q0' |

282 |
; clobbers: m0,3-6 |

283 |
%macro DEBLOCK_P0_Q0 0 |

284 |
pxor m5, m1, m2 ; p0^q0 |

285 |
pand m5, [pb_1] ; (p0^q0)&1 |

286 |
pcmpeqb m4, m4 |

287 |
pxor m3, m4 |

288 |
pavgb m3, m0 ; (p1 - q1 + 256)>>1 |

289 |
pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 |

290 |
pxor m4, m1 |

291 |
pavgb m4, m2 ; (q0 - p0 + 256)>>1 |

292 |
pavgb m3, m5 |

293 |
paddusb m3, m4 ; d+128+33 |

294 |
mova m6, [pb_A1] |

295 |
psubusb m6, m3 |

296 |
psubusb m3, [pb_A1] |

297 |
pminub m6, m7 |

298 |
pminub m3, m7 |

299 |
psubusb m1, m6 |

300 |
psubusb m2, m3 |

301 |
paddusb m1, m3 |

302 |
paddusb m2, m6 |

303 |
%endmacro |

304 | |

305 |
; in: m1=p0 m2=q0 |

306 |
; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp |

307 |
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) |

308 |
; clobbers: q2, tmp, tc0 |

309 |
%macro LUMA_Q1 6 |

310 |
pavgb %6, m1, m2 |

311 |
pavgb %2, %6 ; avg(p2,avg(p0,q0)) |

312 |
pxor %6, %3 |

313 |
pand %6, [pb_1] ; (p2^avg(p0,q0))&1 |

314 |
psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 |

315 |
psubusb %6, %1, %5 |

316 |
paddusb %5, %1 |

317 |
pmaxub %2, %6 |

318 |
pminub %2, %5 |

319 |
mova %4, %2 |

320 |
%endmacro |

321 | |

322 |
%ifdef ARCH_X86_64 |

323 |
;----------------------------------------------------------------------------- |

324 |
; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |

325 |
;----------------------------------------------------------------------------- |

326 |
%macro DEBLOCK_LUMA 1 |

327 |
cglobal deblock_v_luma_8_%1, 5,5,10 |

328 |
movd m8, [r4] ; tc0 |

329 |
lea r4, [r1*3] |

330 |
dec r2d ; alpha-1 |

331 |
neg r4 |

332 |
dec r3d ; beta-1 |

333 |
add r4, r0 ; pix-3*stride |

334 | |

335 |
mova m0, [r4+r1] ; p1 |

336 |
mova m1, [r4+2*r1] ; p0 |

337 |
mova m2, [r0] ; q0 |

338 |
mova m3, [r0+r1] ; q1 |

339 |
LOAD_MASK r2d, r3d |

340 | |

341 |
punpcklbw m8, m8 |

342 |
punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] |

343 |
pcmpeqb m9, m9 |

344 |
pcmpeqb m9, m8 |

345 |
pandn m9, m7 |

346 |
pand m8, m9 |

347 | |

348 |
movdqa m3, [r4] ; p2 |

349 |
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 |

350 |
pand m6, m9 |

351 |
psubb m7, m8, m6 |

352 |
pand m6, m8 |

353 |
LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 |

354 | |

355 |
movdqa m4, [r0+2*r1] ; q2 |

356 |
DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 |

357 |
pand m6, m9 |

358 |
pand m8, m6 |

359 |
psubb m7, m6 |

360 |
mova m3, [r0+r1] |

361 |
LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6 |

362 | |

363 |
DEBLOCK_P0_Q0 |

364 |
mova [r4+2*r1], m1 |

365 |
mova [r0], m2 |

366 |
RET |

367 | |

368 |
;----------------------------------------------------------------------------- |

369 |
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |

370 |
;----------------------------------------------------------------------------- |

371 |
INIT_MMX |

372 |
cglobal deblock_h_luma_8_%1, 5,7 |

373 |
movsxd r10, r1d |

374 |
lea r11, [r10+r10*2] |

375 |
lea r6, [r0-4] |

376 |
lea r5, [r0-4+r11] |

377 |
%ifdef WIN64 |

378 |
sub rsp, 0x98 |

379 |
%define pix_tmp rsp+0x30 |

380 |
%else |

381 |
sub rsp, 0x68 |

382 |
%define pix_tmp rsp |

383 |
%endif |

384 | |

385 |
; transpose 6x16 -> tmp space |

386 |
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp |

387 |
lea r6, [r6+r10*8] |

388 |
lea r5, [r5+r10*8] |

389 |
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8 |

390 | |

391 |
; vertical filter |

392 |
; alpha, beta, tc0 are still in r2d, r3d, r4 |

393 |
; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them |

394 |
lea r0, [pix_tmp+0x30] |

395 |
mov r1d, 0x10 |

396 |
%ifdef WIN64 |

397 |
mov [rsp+0x20], r4 |

398 |
%endif |

399 |
call deblock_v_luma_8_%1 |

400 | |

401 |
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) |

402 |
add r6, 2 |

403 |
add r5, 2 |

404 |
movq m0, [pix_tmp+0x18] |

405 |
movq m1, [pix_tmp+0x28] |

406 |
movq m2, [pix_tmp+0x38] |

407 |
movq m3, [pix_tmp+0x48] |

408 |
TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) |

409 | |

410 |
shl r10, 3 |

411 |
sub r6, r10 |

412 |
sub r5, r10 |

413 |
shr r10, 3 |

414 |
movq m0, [pix_tmp+0x10] |

415 |
movq m1, [pix_tmp+0x20] |

416 |
movq m2, [pix_tmp+0x30] |

417 |
movq m3, [pix_tmp+0x40] |

418 |
TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) |

419 | |

420 |
%ifdef WIN64 |

421 |
add rsp, 0x98 |

422 |
%else |

423 |
add rsp, 0x68 |

424 |
%endif |

425 |
RET |

426 |
%endmacro |

427 | |

428 |
INIT_XMM |

429 |
DEBLOCK_LUMA sse2 |

430 |
INIT_AVX |

431 |
DEBLOCK_LUMA avx |

432 | |

433 |
%else |

434 | |

435 |
%macro DEBLOCK_LUMA 3 |

436 |
;----------------------------------------------------------------------------- |

437 |
; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |

438 |
;----------------------------------------------------------------------------- |

439 |
cglobal deblock_%2_luma_8_%1, 5,5 |

440 |
lea r4, [r1*3] |

441 |
dec r2 ; alpha-1 |

442 |
neg r4 |

443 |
dec r3 ; beta-1 |

444 |
add r4, r0 ; pix-3*stride |

445 |
%assign pad 2*%3+12-(stack_offset&15) |

446 |
SUB esp, pad |

447 | |

448 |
mova m0, [r4+r1] ; p1 |

449 |
mova m1, [r4+2*r1] ; p0 |

450 |
mova m2, [r0] ; q0 |

451 |
mova m3, [r0+r1] ; q1 |

452 |
LOAD_MASK r2, r3 |

453 | |

454 |
mov r3, r4mp |

455 |
movd m4, [r3] ; tc0 |

456 |
punpcklbw m4, m4 |

457 |
punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] |

458 |
mova [esp+%3], m4 ; tc |

459 |
pcmpeqb m3, m3 |

460 |
pcmpgtb m4, m3 |

461 |
pand m4, m7 |

462 |
mova [esp], m4 ; mask |

463 | |

464 |
mova m3, [r4] ; p2 |

465 |
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 |

466 |
pand m6, m4 |

467 |
pand m4, [esp+%3] ; tc |

468 |
psubb m7, m4, m6 |

469 |
pand m6, m4 |

470 |
LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 |

471 | |

472 |
mova m4, [r0+2*r1] ; q2 |

473 |
DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 |

474 |
mova m5, [esp] ; mask |

475 |
pand m6, m5 |

476 |
mova m5, [esp+%3] ; tc |

477 |
pand m5, m6 |

478 |
psubb m7, m6 |

479 |
mova m3, [r0+r1] |

480 |
LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6 |

481 | |

482 |
DEBLOCK_P0_Q0 |

483 |
mova [r4+2*r1], m1 |

484 |
mova [r0], m2 |

485 |
ADD esp, pad |

486 |
RET |

487 | |

488 |
;----------------------------------------------------------------------------- |

489 |
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |

490 |
;----------------------------------------------------------------------------- |

491 |
INIT_MMX |

492 |
cglobal deblock_h_luma_8_%1, 0,5 |

493 |
mov r0, r0mp |

494 |
mov r3, r1m |

495 |
lea r4, [r3*3] |

496 |
sub r0, 4 |

497 |
lea r1, [r0+r4] |

498 |
%assign pad 0x78-(stack_offset&15) |

499 |
SUB esp, pad |

500 |
%define pix_tmp esp+12 |

501 | |

502 |
; transpose 6x16 -> tmp space |

503 |
TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp |

504 |
lea r0, [r0+r3*8] |

505 |
lea r1, [r1+r3*8] |

506 |
TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8 |

507 | |

508 |
; vertical filter |

509 |
lea r0, [pix_tmp+0x30] |

510 |
PUSH dword r4m |

511 |
PUSH dword r3m |

512 |
PUSH dword r2m |

513 |
PUSH dword 16 |

514 |
PUSH dword r0 |

515 |
call deblock_%2_luma_8_%1 |

516 |
%ifidn %2, v8 |

517 |
add dword [esp ], 8 ; pix_tmp+0x38 |

518 |
add dword [esp+16], 2 ; tc0+2 |

519 |
call deblock_%2_luma_8_%1 |

520 |
%endif |

521 |
ADD esp, 20 |

522 | |

523 |
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) |

524 |
mov r0, r0mp |

525 |
sub r0, 2 |

526 |
lea r1, [r0+r4] |

527 | |

528 |
movq m0, [pix_tmp+0x10] |

529 |
movq m1, [pix_tmp+0x20] |

530 |
movq m2, [pix_tmp+0x30] |

531 |
movq m3, [pix_tmp+0x40] |

532 |
TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) |

533 | |

534 |
lea r0, [r0+r3*8] |

535 |
lea r1, [r1+r3*8] |

536 |
movq m0, [pix_tmp+0x18] |

537 |
movq m1, [pix_tmp+0x28] |

538 |
movq m2, [pix_tmp+0x38] |

539 |
movq m3, [pix_tmp+0x48] |

540 |
TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) |

541 | |

542 |
ADD esp, pad |

543 |
RET |

544 |
%endmacro ; DEBLOCK_LUMA |

545 | |

546 |
INIT_MMX |

547 |
DEBLOCK_LUMA mmxext, v8, 8 |

548 |
INIT_XMM |

549 |
DEBLOCK_LUMA sse2, v, 16 |

550 |
INIT_AVX |

551 |
DEBLOCK_LUMA avx, v, 16 |

552 | |

553 |
%endif ; ARCH |

554 | |

555 | |

556 | |

557 |
%macro LUMA_INTRA_P012 4 ; p0..p3 in memory |

558 |
%ifdef ARCH_X86_64 |

559 |
pavgb t0, p2, p1 |

560 |
pavgb t1, p0, q0 |

561 |
%else |

562 |
mova t0, p2 |

563 |
mova t1, p0 |

564 |
pavgb t0, p1 |

565 |
pavgb t1, q0 |

566 |
%endif |

567 |
pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2 |

568 |
mova t5, t1 |

569 |
%ifdef ARCH_X86_64 |

570 |
paddb t2, p2, p1 |

571 |
paddb t3, p0, q0 |

572 |
%else |

573 |
mova t2, p2 |

574 |
mova t3, p0 |

575 |
paddb t2, p1 |

576 |
paddb t3, q0 |

577 |
%endif |

578 |
paddb t2, t3 |

579 |
mova t3, t2 |

580 |
mova t4, t2 |

581 |
psrlw t2, 1 |

582 |
pavgb t2, mpb_0 |

583 |
pxor t2, t0 |

584 |
pand t2, mpb_1 |

585 |
psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4; |

586 | |

587 |
%ifdef ARCH_X86_64 |

588 |
pavgb t1, p2, q1 |

589 |
psubb t2, p2, q1 |

590 |
%else |

591 |
mova t1, p2 |

592 |
mova t2, p2 |

593 |
pavgb t1, q1 |

594 |
psubb t2, q1 |

595 |
%endif |

596 |
paddb t3, t3 |

597 |
psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1 |

598 |
pand t2, mpb_1 |

599 |
psubb t1, t2 |

600 |
pavgb t1, p1 |

601 |
pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2 |

602 |
psrlw t3, 2 |

603 |
pavgb t3, mpb_0 |

604 |
pxor t3, t1 |

605 |
pand t3, mpb_1 |

606 |
psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8 |

607 | |

608 |
pxor t3, p0, q1 |

609 |
pavgb t2, p0, q1 |

610 |
pand t3, mpb_1 |

611 |
psubb t2, t3 |

612 |
pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4 |

613 | |

614 |
pxor t1, t2 |

615 |
pxor t2, p0 |

616 |
pand t1, mask1p |

617 |
pand t2, mask0 |

618 |
pxor t1, t2 |

619 |
pxor t1, p0 |

620 |
mova %1, t1 ; store p0 |

621 | |

622 |
mova t1, %4 ; p3 |

623 |
paddb t2, t1, p2 |

624 |
pavgb t1, p2 |

625 |
pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4 |

626 |
paddb t2, t2 |

627 |
paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0 |

628 |
psrlw t2, 2 |

629 |
pavgb t2, mpb_0 |

630 |
pxor t2, t1 |

631 |
pand t2, mpb_1 |

632 |
psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8 |

633 | |

634 |
pxor t0, p1 |

635 |
pxor t1, p2 |

636 |
pand t0, mask1p |

637 |
pand t1, mask1p |

638 |
pxor t0, p1 |

639 |
pxor t1, p2 |

640 |
mova %2, t0 ; store p1 |

641 |
mova %3, t1 ; store p2 |

642 |
%endmacro |

643 | |

644 |
%macro LUMA_INTRA_SWAP_PQ 0 |

645 |
%define q1 m0 |

646 |
%define q0 m1 |

647 |
%define p0 m2 |

648 |
%define p1 m3 |

649 |
%define p2 q2 |

650 |
%define mask1p mask1q |

651 |
%endmacro |

652 | |

653 |
%macro DEBLOCK_LUMA_INTRA 2 |

654 |
%define p1 m0 |

655 |
%define p0 m1 |

656 |
%define q0 m2 |

657 |
%define q1 m3 |

658 |
%define t0 m4 |

659 |
%define t1 m5 |

660 |
%define t2 m6 |

661 |
%define t3 m7 |

662 |
%ifdef ARCH_X86_64 |

663 |
%define p2 m8 |

664 |
%define q2 m9 |

665 |
%define t4 m10 |

666 |
%define t5 m11 |

667 |
%define mask0 m12 |

668 |
%define mask1p m13 |

669 |
%define mask1q [rsp-24] |

670 |
%define mpb_0 m14 |

671 |
%define mpb_1 m15 |

672 |
%else |

673 |
%define spill(x) [esp+16*x+((stack_offset+4)&15)] |

674 |
%define p2 [r4+r1] |

675 |
%define q2 [r0+2*r1] |

676 |
%define t4 spill(0) |

677 |
%define t5 spill(1) |

678 |
%define mask0 spill(2) |

679 |
%define mask1p spill(3) |

680 |
%define mask1q spill(4) |

681 |
%define mpb_0 [pb_0] |

682 |
%define mpb_1 [pb_1] |

683 |
%endif |

684 | |

685 |
;----------------------------------------------------------------------------- |

686 |
; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) |

687 |
;----------------------------------------------------------------------------- |

688 |
cglobal deblock_%2_luma_intra_8_%1, 4,6,16 |

689 |
%ifndef ARCH_X86_64 |

690 |
sub esp, 0x60 |

691 |
%endif |

692 |
lea r4, [r1*4] |

693 |
lea r5, [r1*3] ; 3*stride |

694 |
dec r2d ; alpha-1 |

695 |
jl .end |

696 |
neg r4 |

697 |
dec r3d ; beta-1 |

698 |
jl .end |

699 |
add r4, r0 ; pix-4*stride |

700 |
mova p1, [r4+2*r1] |

701 |
mova p0, [r4+r5] |

702 |
mova q0, [r0] |

703 |
mova q1, [r0+r1] |

704 |
%ifdef ARCH_X86_64 |

705 |
pxor mpb_0, mpb_0 |

706 |
mova mpb_1, [pb_1] |

707 |
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 |

708 |
SWAP 7, 12 ; m12=mask0 |

709 |
pavgb t5, mpb_0 |

710 |
pavgb t5, mpb_1 ; alpha/4+1 |

711 |
movdqa p2, [r4+r1] |

712 |
movdqa q2, [r0+2*r1] |

713 |
DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1 |

714 |
DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1 |

715 |
DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1 |

716 |
pand t0, mask0 |

717 |
pand t4, t0 |

718 |
pand t2, t0 |

719 |
mova mask1q, t4 |

720 |
mova mask1p, t2 |

721 |
%else |

722 |
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 |

723 |
mova m4, t5 |

724 |
mova mask0, m7 |

725 |
pavgb m4, [pb_0] |

726 |
pavgb m4, [pb_1] ; alpha/4+1 |

727 |
DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1 |

728 |
pand m6, mask0 |

729 |
DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1 |

730 |
pand m4, m6 |

731 |
mova mask1p, m4 |

732 |
DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1 |

733 |
pand m4, m6 |

734 |
mova mask1q, m4 |

735 |
%endif |

736 |
LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4] |

737 |
LUMA_INTRA_SWAP_PQ |

738 |
LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5] |

739 |
.end: |

740 |
%ifndef ARCH_X86_64 |

741 |
add esp, 0x60 |

742 |
%endif |

743 |
RET |

744 | |

745 |
INIT_MMX |

746 |
%ifdef ARCH_X86_64 |

747 |
;----------------------------------------------------------------------------- |

748 |
; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) |

749 |
;----------------------------------------------------------------------------- |

750 |
cglobal deblock_h_luma_intra_8_%1, 4,7 |

751 |
movsxd r10, r1d |

752 |
lea r11, [r10*3] |

753 |
lea r6, [r0-4] |

754 |
lea r5, [r0-4+r11] |

755 |
sub rsp, 0x88 |

756 |
%define pix_tmp rsp |

757 | |

758 |
; transpose 8x16 -> tmp space |

759 |
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) |

760 |
lea r6, [r6+r10*8] |

761 |
lea r5, [r5+r10*8] |

762 |
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) |

763 | |

764 |
lea r0, [pix_tmp+0x40] |

765 |
mov r1, 0x10 |

766 |
call deblock_v_luma_intra_8_%1 |

767 | |

768 |
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) |

769 |
lea r5, [r6+r11] |

770 |
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11) |

771 |
shl r10, 3 |

772 |
sub r6, r10 |

773 |
sub r5, r10 |

774 |
shr r10, 3 |

775 |
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11) |

776 |
add rsp, 0x88 |

777 |
RET |

778 |
%else |

779 |
cglobal deblock_h_luma_intra_8_%1, 2,4 |

780 |
lea r3, [r1*3] |

781 |
sub r0, 4 |

782 |
lea r2, [r0+r3] |

783 |
%assign pad 0x8c-(stack_offset&15) |

784 |
SUB rsp, pad |

785 |
%define pix_tmp rsp |

786 | |

787 |
; transpose 8x16 -> tmp space |

788 |
TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) |

789 |
lea r0, [r0+r1*8] |

790 |
lea r2, [r2+r1*8] |

791 |
TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) |

792 | |

793 |
lea r0, [pix_tmp+0x40] |

794 |
PUSH dword r3m |

795 |
PUSH dword r2m |

796 |
PUSH dword 16 |

797 |
PUSH r0 |

798 |
call deblock_%2_luma_intra_8_%1 |

799 |
%ifidn %2, v8 |

800 |
add dword [rsp], 8 ; pix_tmp+8 |

801 |
call deblock_%2_luma_intra_8_%1 |

802 |
%endif |

803 |
ADD esp, 16 |

804 | |

805 |
mov r1, r1m |

806 |
mov r0, r0mp |

807 |
lea r3, [r1*3] |

808 |
sub r0, 4 |

809 |
lea r2, [r0+r3] |

810 |
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) |

811 |
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) |

812 |
lea r0, [r0+r1*8] |

813 |
lea r2, [r2+r1*8] |

814 |
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) |

815 |
ADD rsp, pad |

816 |
RET |

817 |
%endif ; ARCH_X86_64 |

818 |
%endmacro ; DEBLOCK_LUMA_INTRA |

819 | |

820 |
INIT_XMM |

821 |
DEBLOCK_LUMA_INTRA sse2, v |

822 |
INIT_AVX |

823 |
DEBLOCK_LUMA_INTRA avx , v |

824 |
%ifndef ARCH_X86_64 |

825 |
INIT_MMX |

826 |
DEBLOCK_LUMA_INTRA mmxext, v8 |

827 |
%endif |

828 | |

829 |
INIT_MMX |

830 | |

831 |
%macro CHROMA_V_START 0 |

832 |
dec r2d ; alpha-1 |

833 |
dec r3d ; beta-1 |

834 |
mov t5, r0 |

835 |
sub t5, r1 |

836 |
sub t5, r1 |

837 |
%endmacro |

838 | |

839 |
%macro CHROMA_H_START 0 |

840 |
dec r2d |

841 |
dec r3d |

842 |
sub r0, 2 |

843 |
lea t6, [r1*3] |

844 |
mov t5, r0 |

845 |
add r0, t6 |

846 |
%endmacro |

847 | |

848 |
%define t5 r5 |

849 |
%define t6 r6 |

850 | |

851 |
;----------------------------------------------------------------------------- |

852 |
; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |

853 |
;----------------------------------------------------------------------------- |

854 |
cglobal deblock_v_chroma_8_mmxext, 5,6 |

855 |
CHROMA_V_START |

856 |
movq m0, [t5] |

857 |
movq m1, [t5+r1] |

858 |
movq m2, [r0] |

859 |
movq m3, [r0+r1] |

860 |
call ff_chroma_inter_body_mmxext |

861 |
movq [t5+r1], m1 |

862 |
movq [r0], m2 |

863 |
RET |

864 | |

865 |
;----------------------------------------------------------------------------- |

866 |
; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |

867 |
;----------------------------------------------------------------------------- |

868 |
cglobal deblock_h_chroma_8_mmxext, 5,7 |

869 |
%ifdef ARCH_X86_64 |

870 |
%define buf0 [rsp-24] |

871 |
%define buf1 [rsp-16] |

872 |
%else |

873 |
%define buf0 r0m |

874 |
%define buf1 r2m |

875 |
%endif |

876 |
CHROMA_H_START |

877 |
TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) |

878 |
movq buf0, m0 |

879 |
movq buf1, m3 |

880 |
call ff_chroma_inter_body_mmxext |

881 |
movq m0, buf0 |

882 |
movq m3, buf1 |

883 |
TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) |

884 |
RET |

885 | |

886 |
ALIGN 16 |

887 |
ff_chroma_inter_body_mmxext: |

888 |
LOAD_MASK r2d, r3d |

889 |
movd m6, [r4] ; tc0 |

890 |
punpcklbw m6, m6 |

891 |
pand m7, m6 |

892 |
DEBLOCK_P0_Q0 |

893 |
ret |

894 | |

895 | |

896 | |

897 |
; in: %1=p0 %2=p1 %3=q1 |

898 |
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2 |

899 |
%macro CHROMA_INTRA_P0 3 |

900 |
movq m4, %1 |

901 |
pxor m4, %3 |

902 |
pand m4, [pb_1] ; m4 = (p0^q1)&1 |

903 |
pavgb %1, %3 |

904 |
psubusb %1, m4 |

905 |
pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) |

906 |
%endmacro |

907 | |

908 |
%define t5 r4 |

909 |
%define t6 r5 |

910 | |

911 |
;----------------------------------------------------------------------------- |

912 |
; void ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) |

913 |
;----------------------------------------------------------------------------- |

914 |
cglobal deblock_v_chroma_intra_8_mmxext, 4,5 |

915 |
CHROMA_V_START |

916 |
movq m0, [t5] |

917 |
movq m1, [t5+r1] |

918 |
movq m2, [r0] |

919 |
movq m3, [r0+r1] |

920 |
call ff_chroma_intra_body_mmxext |

921 |
movq [t5+r1], m1 |

922 |
movq [r0], m2 |

923 |
RET |

924 | |

925 |
;----------------------------------------------------------------------------- |

926 |
; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) |

927 |
;----------------------------------------------------------------------------- |

928 |
cglobal deblock_h_chroma_intra_8_mmxext, 4,6 |

929 |
CHROMA_H_START |

930 |
TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) |

931 |
call ff_chroma_intra_body_mmxext |

932 |
TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) |

933 |
RET |

934 | |

935 |
ALIGN 16 |

936 |
ff_chroma_intra_body_mmxext: |

937 |
LOAD_MASK r2d, r3d |

938 |
movq m5, m1 |

939 |
movq m6, m2 |

940 |
CHROMA_INTRA_P0 m1, m0, m3 |

941 |
CHROMA_INTRA_P0 m2, m3, m0 |

942 |
psubb m1, m5 |

943 |
psubb m2, m6 |

944 |
pand m1, m7 |

945 |
pand m2, m7 |

946 |
paddb m1, m5 |

947 |
paddb m2, m6 |

948 |
ret |