; libavcodec/x86/h264_deblock.asm (revision 888fa31e)

;*****************************************************************************
;* MMX/SSE2/AVX-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION .text

; byte-splatted constants shared with other x86 asm files
cextern pb_0
cextern pb_1
cextern pb_3
cextern pb_A1


37 |
; expands to [base],...,[base+7*stride] |

38 |
%define PASS8ROWS(base, base3, stride, stride3) \ |

39 |
[base], [base+stride], [base+stride*2], [base3], \ |

40 |
[base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] |

41 | |

42 |
%define PASS8ROWS(base, base3, stride, stride3, offset) \ |

43 |
PASS8ROWS(base+offset, base3+offset, stride, stride3) |

44 | |

45 |
; in: 8 rows of 4 bytes in %4..%11 |

46 |
; out: 4 rows of 8 bytes in m0..m3 |

47 |
%macro TRANSPOSE4x8_LOAD 11 |

48 |
movh m0, %4 |

49 |
movh m2, %5 |

50 |
movh m1, %6 |

51 |
movh m3, %7 |

52 |
punpckl%1 m0, m2 |

53 |
punpckl%1 m1, m3 |

54 |
mova m2, m0 |

55 |
punpckl%2 m0, m1 |

56 |
punpckh%2 m2, m1 |

57 | |

58 |
movh m4, %8 |

59 |
movh m6, %9 |

60 |
movh m5, %10 |

61 |
movh m7, %11 |

62 |
punpckl%1 m4, m6 |

63 |
punpckl%1 m5, m7 |

64 |
mova m6, m4 |

65 |
punpckl%2 m4, m5 |

66 |
punpckh%2 m6, m5 |

67 | |

68 |
punpckh%3 m1, m0, m4 |

69 |
punpckh%3 m3, m2, m6 |

70 |
punpckl%3 m0, m4 |

71 |
punpckl%3 m2, m6 |

72 |
%endmacro |

73 | |

74 |
; in: 4 rows of 8 bytes in m0..m3 |

75 |
; out: 8 rows of 4 bytes in %1..%8 |

76 |
%macro TRANSPOSE8x4B_STORE 8 |

77 |
punpckhdq m4, m0, m0 |

78 |
punpckhdq m5, m1, m1 |

79 |
punpckhdq m6, m2, m2 |

80 | |

81 |
punpcklbw m0, m1 |

82 |
punpcklbw m2, m3 |

83 |
punpcklwd m1, m0, m2 |

84 |
punpckhwd m0, m2 |

85 |
movh %1, m1 |

86 |
punpckhdq m1, m1 |

87 |
movh %2, m1 |

88 |
movh %3, m0 |

89 |
punpckhdq m0, m0 |

90 |
movh %4, m0 |

91 | |

92 |
punpckhdq m3, m3 |

93 |
punpcklbw m4, m5 |

94 |
punpcklbw m6, m3 |

95 |
punpcklwd m5, m4, m6 |

96 |
punpckhwd m4, m6 |

97 |
movh %5, m5 |

98 |
punpckhdq m5, m5 |

99 |
movh %6, m5 |

100 |
movh %7, m4 |

101 |
punpckhdq m4, m4 |

102 |
movh %8, m4 |

103 |
%endmacro |

104 | |

105 |
%macro TRANSPOSE4x8B_LOAD 8 |

106 |
TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8 |

107 |
%endmacro |

108 | |

109 |
%macro SBUTTERFLY3 4 |

110 |
punpckh%1 %4, %2, %3 |

111 |
punpckl%1 %2, %3 |

112 |
%endmacro |

113 | |

114 |
; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8 |

115 |
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16] |

116 |
%macro TRANSPOSE6x8_MEM 9 |

117 |
RESET_MM_PERMUTATION |

118 |
movq m0, %1 |

119 |
movq m1, %2 |

120 |
movq m2, %3 |

121 |
movq m3, %4 |

122 |
movq m4, %5 |

123 |
movq m5, %6 |

124 |
movq m6, %7 |

125 |
SBUTTERFLY bw, 0, 1, 7 |

126 |
SBUTTERFLY bw, 2, 3, 7 |

127 |
SBUTTERFLY bw, 4, 5, 7 |

128 |
movq [%9+0x10], m3 |

129 |
SBUTTERFLY3 bw, m6, %8, m7 |

130 |
SBUTTERFLY wd, 0, 2, 3 |

131 |
SBUTTERFLY wd, 4, 6, 3 |

132 |
punpckhdq m0, m4 |

133 |
movq [%9+0x00], m0 |

134 |
SBUTTERFLY3 wd, m1, [%9+0x10], m3 |

135 |
SBUTTERFLY wd, 5, 7, 0 |

136 |
SBUTTERFLY dq, 1, 5, 0 |

137 |
SBUTTERFLY dq, 2, 6, 0 |

138 |
punpckldq m3, m7 |

139 |
movq [%9+0x10], m2 |

140 |
movq [%9+0x20], m6 |

141 |
movq [%9+0x30], m1 |

142 |
movq [%9+0x40], m5 |

143 |
movq [%9+0x50], m3 |

144 |
RESET_MM_PERMUTATION |

145 |
%endmacro |

146 | |

147 |
; in: 8 rows of 8 in %1..%8 |

148 |
; out: 8 rows of 8 in %9..%16 |

149 |
%macro TRANSPOSE8x8_MEM 16 |

150 |
RESET_MM_PERMUTATION |

151 |
movq m0, %1 |

152 |
movq m1, %2 |

153 |
movq m2, %3 |

154 |
movq m3, %4 |

155 |
movq m4, %5 |

156 |
movq m5, %6 |

157 |
movq m6, %7 |

158 |
SBUTTERFLY bw, 0, 1, 7 |

159 |
SBUTTERFLY bw, 2, 3, 7 |

160 |
SBUTTERFLY bw, 4, 5, 7 |

161 |
SBUTTERFLY3 bw, m6, %8, m7 |

162 |
movq %9, m5 |

163 |
SBUTTERFLY wd, 0, 2, 5 |

164 |
SBUTTERFLY wd, 4, 6, 5 |

165 |
SBUTTERFLY wd, 1, 3, 5 |

166 |
movq %11, m6 |

167 |
movq m6, %9 |

168 |
SBUTTERFLY wd, 6, 7, 5 |

169 |
SBUTTERFLY dq, 0, 4, 5 |

170 |
SBUTTERFLY dq, 1, 6, 5 |

171 |
movq %9, m0 |

172 |
movq %10, m4 |

173 |
movq %13, m1 |

174 |
movq %14, m6 |

175 |
SBUTTERFLY3 dq, m2, %11, m0 |

176 |
SBUTTERFLY dq, 3, 7, 4 |

177 |
movq %11, m2 |

178 |
movq %12, m0 |

179 |
movq %15, m3 |

180 |
movq %16, m7 |

181 |
RESET_MM_PERMUTATION |

182 |
%endmacro |

183 | |

184 |
; out: %4 = |%1-%2|>%3 |

185 |
; clobbers: %5 |

186 |
%macro DIFF_GT 5 |

187 |
%if avx_enabled == 0 |

188 |
mova %5, %2 |

189 |
mova %4, %1 |

190 |
psubusb %5, %1 |

191 |
psubusb %4, %2 |

192 |
%else |

193 |
psubusb %5, %2, %1 |

194 |
psubusb %4, %1, %2 |

195 |
%endif |

196 |
por %4, %5 |

197 |
psubusb %4, %3 |

198 |
%endmacro |

199 | |

200 |
; out: %4 = |%1-%2|>%3 |

201 |
; clobbers: %5 |

202 |
%macro DIFF_GT2 5 |

203 |
%ifdef ARCH_X86_64 |

204 |
psubusb %5, %2, %1 |

205 |
psubusb %4, %1, %2 |

206 |
%else |

207 |
mova %5, %2 |

208 |
mova %4, %1 |

209 |
psubusb %5, %1 |

210 |
psubusb %4, %2 |

211 |
%endif |

212 |
psubusb %5, %3 |

213 |
psubusb %4, %3 |

214 |
pcmpeqb %4, %5 |

215 |
%endmacro |

216 | |

217 |
; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1 |

218 |
; out: m5=beta-1, m7=mask, %3=alpha-1 |

219 |
; clobbers: m4,m6 |

220 |
%macro LOAD_MASK 2-3 |

221 |
movd m4, %1 |

222 |
movd m5, %2 |

223 |
SPLATW m4, m4 |

224 |
SPLATW m5, m5 |

225 |
packuswb m4, m4 ; 16x alpha-1 |

226 |
packuswb m5, m5 ; 16x beta-1 |

227 |
%if %0>2 |

228 |
mova %3, m4 |

229 |
%endif |

230 |
DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1 |

231 |
DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1 |

232 |
por m7, m4 |

233 |
DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1 |

234 |
por m7, m4 |

235 |
pxor m6, m6 |

236 |
pcmpeqb m7, m6 |

237 |
%endmacro |

238 | |

239 |
; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask) |

240 |
; out: m1=p0' m2=q0' |

241 |
; clobbers: m0,3-6 |

242 |
%macro DEBLOCK_P0_Q0 0 |

243 |
pxor m5, m1, m2 ; p0^q0 |

244 |
pand m5, [pb_1] ; (p0^q0)&1 |

245 |
pcmpeqb m4, m4 |

246 |
pxor m3, m4 |

247 |
pavgb m3, m0 ; (p1 - q1 + 256)>>1 |

248 |
pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 |

249 |
pxor m4, m1 |

250 |
pavgb m4, m2 ; (q0 - p0 + 256)>>1 |

251 |
pavgb m3, m5 |

252 |
paddusb m3, m4 ; d+128+33 |

253 |
mova m6, [pb_A1] |

254 |
psubusb m6, m3 |

255 |
psubusb m3, [pb_A1] |

256 |
pminub m6, m7 |

257 |
pminub m3, m7 |

258 |
psubusb m1, m6 |

259 |
psubusb m2, m3 |

260 |
paddusb m1, m3 |

261 |
paddusb m2, m6 |

262 |
%endmacro |

263 | |

264 |
; in: m1=p0 m2=q0 |

265 |
; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp |

266 |
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) |

267 |
; clobbers: q2, tmp, tc0 |

268 |
%macro LUMA_Q1 6 |

269 |
pavgb %6, m1, m2 |

270 |
pavgb %2, %6 ; avg(p2,avg(p0,q0)) |

271 |
pxor %6, %3 |

272 |
pand %6, [pb_1] ; (p2^avg(p0,q0))&1 |

273 |
psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 |

274 |
psubusb %6, %1, %5 |

275 |
paddusb %5, %1 |

276 |
pmaxub %2, %6 |

277 |
pminub %2, %5 |

278 |
mova %4, %2 |

279 |
%endmacro |

280 | |

281 |
%ifdef ARCH_X86_64 |

282 |
;----------------------------------------------------------------------------- |

283 |
; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |

284 |
;----------------------------------------------------------------------------- |

285 |
%macro DEBLOCK_LUMA 1 |

286 |
cglobal deblock_v_luma_8_%1, 5,5,10 |

287 |
movd m8, [r4] ; tc0 |

288 |
lea r4, [r1*3] |

289 |
dec r2d ; alpha-1 |

290 |
neg r4 |

291 |
dec r3d ; beta-1 |

292 |
add r4, r0 ; pix-3*stride |

293 | |

294 |
mova m0, [r4+r1] ; p1 |

295 |
mova m1, [r4+2*r1] ; p0 |

296 |
mova m2, [r0] ; q0 |

297 |
mova m3, [r0+r1] ; q1 |

298 |
LOAD_MASK r2d, r3d |

299 | |

300 |
punpcklbw m8, m8 |

301 |
punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] |

302 |
pcmpeqb m9, m9 |

303 |
pcmpeqb m9, m8 |

304 |
pandn m9, m7 |

305 |
pand m8, m9 |

306 | |

307 |
movdqa m3, [r4] ; p2 |

308 |
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 |

309 |
pand m6, m9 |

310 |
psubb m7, m8, m6 |

311 |
pand m6, m8 |

312 |
LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 |

313 | |

314 |
movdqa m4, [r0+2*r1] ; q2 |

315 |
DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 |

316 |
pand m6, m9 |

317 |
pand m8, m6 |

318 |
psubb m7, m6 |

319 |
mova m3, [r0+r1] |

320 |
LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6 |

321 | |

322 |
DEBLOCK_P0_Q0 |

323 |
mova [r4+2*r1], m1 |

324 |
mova [r0], m2 |

325 |
RET |

326 | |

327 |
;----------------------------------------------------------------------------- |

328 |
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |

329 |
;----------------------------------------------------------------------------- |

330 |
INIT_MMX |

331 |
cglobal deblock_h_luma_8_%1, 5,7 |

332 |
movsxd r10, r1d |

333 |
lea r11, [r10+r10*2] |

334 |
lea r6, [r0-4] |

335 |
lea r5, [r0-4+r11] |

336 |
%ifdef WIN64 |

337 |
sub rsp, 0x98 |

338 |
%define pix_tmp rsp+0x30 |

339 |
%else |

340 |
sub rsp, 0x68 |

341 |
%define pix_tmp rsp |

342 |
%endif |

343 | |

344 |
; transpose 6x16 -> tmp space |

345 |
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp |

346 |
lea r6, [r6+r10*8] |

347 |
lea r5, [r5+r10*8] |

348 |
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8 |

349 | |

350 |
; vertical filter |

351 |
; alpha, beta, tc0 are still in r2d, r3d, r4 |

352 |
; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them |

353 |
lea r0, [pix_tmp+0x30] |

354 |
mov r1d, 0x10 |

355 |
%ifdef WIN64 |

356 |
mov [rsp+0x20], r4 |

357 |
%endif |

358 |
call deblock_v_luma_8_%1 |

359 | |

360 |
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) |

361 |
add r6, 2 |

362 |
add r5, 2 |

363 |
movq m0, [pix_tmp+0x18] |

364 |
movq m1, [pix_tmp+0x28] |

365 |
movq m2, [pix_tmp+0x38] |

366 |
movq m3, [pix_tmp+0x48] |

367 |
TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) |

368 | |

369 |
shl r10, 3 |

370 |
sub r6, r10 |

371 |
sub r5, r10 |

372 |
shr r10, 3 |

373 |
movq m0, [pix_tmp+0x10] |

374 |
movq m1, [pix_tmp+0x20] |

375 |
movq m2, [pix_tmp+0x30] |

376 |
movq m3, [pix_tmp+0x40] |

377 |
TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) |

378 | |

379 |
%ifdef WIN64 |

380 |
add rsp, 0x98 |

381 |
%else |

382 |
add rsp, 0x68 |

383 |
%endif |

384 |
RET |

385 |
%endmacro |

386 | |

387 |
INIT_XMM |

388 |
DEBLOCK_LUMA sse2 |

389 |
INIT_AVX |

390 |
DEBLOCK_LUMA avx |

391 | |

392 |
%else |

393 | |

394 |
%macro DEBLOCK_LUMA 3 |

395 |
;----------------------------------------------------------------------------- |

396 |
; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |

397 |
;----------------------------------------------------------------------------- |

398 |
cglobal deblock_%2_luma_8_%1, 5,5 |

399 |
lea r4, [r1*3] |

400 |
dec r2 ; alpha-1 |

401 |
neg r4 |

402 |
dec r3 ; beta-1 |

403 |
add r4, r0 ; pix-3*stride |

404 |
%assign pad 2*%3+12-(stack_offset&15) |

405 |
SUB esp, pad |

406 | |

407 |
mova m0, [r4+r1] ; p1 |

408 |
mova m1, [r4+2*r1] ; p0 |

409 |
mova m2, [r0] ; q0 |

410 |
mova m3, [r0+r1] ; q1 |

411 |
LOAD_MASK r2, r3 |

412 | |

413 |
mov r3, r4mp |

414 |
movd m4, [r3] ; tc0 |

415 |
punpcklbw m4, m4 |

416 |
punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] |

417 |
mova [esp+%3], m4 ; tc |

418 |
pcmpeqb m3, m3 |

419 |
pcmpgtb m4, m3 |

420 |
pand m4, m7 |

421 |
mova [esp], m4 ; mask |

422 | |

423 |
mova m3, [r4] ; p2 |

424 |
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 |

425 |
pand m6, m4 |

426 |
pand m4, [esp+%3] ; tc |

427 |
psubb m7, m4, m6 |

428 |
pand m6, m4 |

429 |
LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 |

430 | |

431 |
mova m4, [r0+2*r1] ; q2 |

432 |
DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 |

433 |
mova m5, [esp] ; mask |

434 |
pand m6, m5 |

435 |
mova m5, [esp+%3] ; tc |

436 |
pand m5, m6 |

437 |
psubb m7, m6 |

438 |
mova m3, [r0+r1] |

439 |
LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6 |

440 | |

441 |
DEBLOCK_P0_Q0 |

442 |
mova [r4+2*r1], m1 |

443 |
mova [r0], m2 |

444 |
ADD esp, pad |

445 |
RET |

446 | |

447 |
;----------------------------------------------------------------------------- |

448 |
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |

449 |
;----------------------------------------------------------------------------- |

450 |
INIT_MMX |

451 |
cglobal deblock_h_luma_8_%1, 0,5 |

452 |
mov r0, r0mp |

453 |
mov r3, r1m |

454 |
lea r4, [r3*3] |

455 |
sub r0, 4 |

456 |
lea r1, [r0+r4] |

457 |
%assign pad 0x78-(stack_offset&15) |

458 |
SUB esp, pad |

459 |
%define pix_tmp esp+12 |

460 | |

461 |
; transpose 6x16 -> tmp space |

462 |
TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp |

463 |
lea r0, [r0+r3*8] |

464 |
lea r1, [r1+r3*8] |

465 |
TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8 |

466 | |

467 |
; vertical filter |

468 |
lea r0, [pix_tmp+0x30] |

469 |
PUSH dword r4m |

470 |
PUSH dword r3m |

471 |
PUSH dword r2m |

472 |
PUSH dword 16 |

473 |
PUSH dword r0 |

474 |
call deblock_%2_luma_8_%1 |

475 |
%ifidn %2, v8 |

476 |
add dword [esp ], 8 ; pix_tmp+0x38 |

477 |
add dword [esp+16], 2 ; tc0+2 |

478 |
call deblock_%2_luma_8_%1 |

479 |
%endif |

480 |
ADD esp, 20 |

481 | |

482 |
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) |

483 |
mov r0, r0mp |

484 |
sub r0, 2 |

485 |
lea r1, [r0+r4] |

486 | |

487 |
movq m0, [pix_tmp+0x10] |

488 |
movq m1, [pix_tmp+0x20] |

489 |
movq m2, [pix_tmp+0x30] |

490 |
movq m3, [pix_tmp+0x40] |

491 |
TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) |

492 | |

493 |
lea r0, [r0+r3*8] |

494 |
lea r1, [r1+r3*8] |

495 |
movq m0, [pix_tmp+0x18] |

496 |
movq m1, [pix_tmp+0x28] |

497 |
movq m2, [pix_tmp+0x38] |

498 |
movq m3, [pix_tmp+0x48] |

499 |
TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) |

500 | |

501 |
ADD esp, pad |

502 |
RET |

503 |
%endmacro ; DEBLOCK_LUMA |

504 | |

505 |
INIT_MMX |

506 |
DEBLOCK_LUMA mmxext, v8, 8 |

507 |
INIT_XMM |

508 |
DEBLOCK_LUMA sse2, v, 16 |

509 |
INIT_AVX |

510 |
DEBLOCK_LUMA avx, v, 16 |

511 | |

512 |
%endif ; ARCH |

513 | |

514 | |

515 | |

516 |
%macro LUMA_INTRA_P012 4 ; p0..p3 in memory |

517 |
%ifdef ARCH_X86_64 |

518 |
pavgb t0, p2, p1 |

519 |
pavgb t1, p0, q0 |

520 |
%else |

521 |
mova t0, p2 |

522 |
mova t1, p0 |

523 |
pavgb t0, p1 |

524 |
pavgb t1, q0 |

525 |
%endif |

526 |
pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2 |

527 |
mova t5, t1 |

528 |
%ifdef ARCH_X86_64 |

529 |
paddb t2, p2, p1 |

530 |
paddb t3, p0, q0 |

531 |
%else |

532 |
mova t2, p2 |

533 |
mova t3, p0 |

534 |
paddb t2, p1 |

535 |
paddb t3, q0 |

536 |
%endif |

537 |
paddb t2, t3 |

538 |
mova t3, t2 |

539 |
mova t4, t2 |

540 |
psrlw t2, 1 |

541 |
pavgb t2, mpb_0 |

542 |
pxor t2, t0 |

543 |
pand t2, mpb_1 |

544 |
psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4; |

545 | |

546 |
%ifdef ARCH_X86_64 |

547 |
pavgb t1, p2, q1 |

548 |
psubb t2, p2, q1 |

549 |
%else |

550 |
mova t1, p2 |

551 |
mova t2, p2 |

552 |
pavgb t1, q1 |

553 |
psubb t2, q1 |

554 |
%endif |

555 |
paddb t3, t3 |

556 |
psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1 |

557 |
pand t2, mpb_1 |

558 |
psubb t1, t2 |

559 |
pavgb t1, p1 |

560 |
pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2 |

561 |
psrlw t3, 2 |

562 |
pavgb t3, mpb_0 |

563 |
pxor t3, t1 |

564 |
pand t3, mpb_1 |

565 |
psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8 |

566 | |

567 |
pxor t3, p0, q1 |

568 |
pavgb t2, p0, q1 |

569 |
pand t3, mpb_1 |

570 |
psubb t2, t3 |

571 |
pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4 |

572 | |

573 |
pxor t1, t2 |

574 |
pxor t2, p0 |

575 |
pand t1, mask1p |

576 |
pand t2, mask0 |

577 |
pxor t1, t2 |

578 |
pxor t1, p0 |

579 |
mova %1, t1 ; store p0 |

580 | |

581 |
mova t1, %4 ; p3 |

582 |
paddb t2, t1, p2 |

583 |
pavgb t1, p2 |

584 |
pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4 |

585 |
paddb t2, t2 |

586 |
paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0 |

587 |
psrlw t2, 2 |

588 |
pavgb t2, mpb_0 |

589 |
pxor t2, t1 |

590 |
pand t2, mpb_1 |

591 |
psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8 |

592 | |

593 |
pxor t0, p1 |

594 |
pxor t1, p2 |

595 |
pand t0, mask1p |

596 |
pand t1, mask1p |

597 |
pxor t0, p1 |

598 |
pxor t1, p2 |

599 |
mova %2, t0 ; store p1 |

600 |
mova %3, t1 ; store p2 |

601 |
%endmacro |

602 | |

603 |
%macro LUMA_INTRA_SWAP_PQ 0 |

604 |
%define q1 m0 |

605 |
%define q0 m1 |

606 |
%define p0 m2 |

607 |
%define p1 m3 |

608 |
%define p2 q2 |

609 |
%define mask1p mask1q |

610 |
%endmacro |

611 | |

612 |
%macro DEBLOCK_LUMA_INTRA 2 |

613 |
%define p1 m0 |

614 |
%define p0 m1 |

615 |
%define q0 m2 |

616 |
%define q1 m3 |

617 |
%define t0 m4 |

618 |
%define t1 m5 |

619 |
%define t2 m6 |

620 |
%define t3 m7 |

621 |
%ifdef ARCH_X86_64 |

622 |
%define p2 m8 |

623 |
%define q2 m9 |

624 |
%define t4 m10 |

625 |
%define t5 m11 |

626 |
%define mask0 m12 |

627 |
%define mask1p m13 |

628 |
%define mask1q [rsp-24] |

629 |
%define mpb_0 m14 |

630 |
%define mpb_1 m15 |

631 |
%else |

632 |
%define spill(x) [esp+16*x+((stack_offset+4)&15)] |

633 |
%define p2 [r4+r1] |

634 |
%define q2 [r0+2*r1] |

635 |
%define t4 spill(0) |

636 |
%define t5 spill(1) |

637 |
%define mask0 spill(2) |

638 |
%define mask1p spill(3) |

639 |
%define mask1q spill(4) |

640 |
%define mpb_0 [pb_0] |

641 |
%define mpb_1 [pb_1] |

642 |
%endif |

643 | |

644 |
;----------------------------------------------------------------------------- |

645 |
; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) |

646 |
;----------------------------------------------------------------------------- |

647 |
cglobal deblock_%2_luma_intra_8_%1, 4,6,16 |

648 |
%ifndef ARCH_X86_64 |

649 |
sub esp, 0x60 |

650 |
%endif |

651 |
lea r4, [r1*4] |

652 |
lea r5, [r1*3] ; 3*stride |

653 |
dec r2d ; alpha-1 |

654 |
jl .end |

655 |
neg r4 |

656 |
dec r3d ; beta-1 |

657 |
jl .end |

658 |
add r4, r0 ; pix-4*stride |

659 |
mova p1, [r4+2*r1] |

660 |
mova p0, [r4+r5] |

661 |
mova q0, [r0] |

662 |
mova q1, [r0+r1] |

663 |
%ifdef ARCH_X86_64 |

664 |
pxor mpb_0, mpb_0 |

665 |
mova mpb_1, [pb_1] |

666 |
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 |

667 |
SWAP 7, 12 ; m12=mask0 |

668 |
pavgb t5, mpb_0 |

669 |
pavgb t5, mpb_1 ; alpha/4+1 |

670 |
movdqa p2, [r4+r1] |

671 |
movdqa q2, [r0+2*r1] |

672 |
DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1 |

673 |
DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1 |

674 |
DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1 |

675 |
pand t0, mask0 |

676 |
pand t4, t0 |

677 |
pand t2, t0 |

678 |
mova mask1q, t4 |

679 |
mova mask1p, t2 |

680 |
%else |

681 |
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 |

682 |
mova m4, t5 |

683 |
mova mask0, m7 |

684 |
pavgb m4, [pb_0] |

685 |
pavgb m4, [pb_1] ; alpha/4+1 |

686 |
DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1 |

687 |
pand m6, mask0 |

688 |
DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1 |

689 |
pand m4, m6 |

690 |
mova mask1p, m4 |

691 |
DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1 |

692 |
pand m4, m6 |

693 |
mova mask1q, m4 |

694 |
%endif |

695 |
LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4] |

696 |
LUMA_INTRA_SWAP_PQ |

697 |
LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5] |

698 |
.end: |

699 |
%ifndef ARCH_X86_64 |

700 |
add esp, 0x60 |

701 |
%endif |

702 |
RET |

703 | |

704 |
INIT_MMX |

705 |
%ifdef ARCH_X86_64 |

706 |
;----------------------------------------------------------------------------- |

707 |
; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) |

708 |
;----------------------------------------------------------------------------- |

709 |
cglobal deblock_h_luma_intra_8_%1, 4,7 |

710 |
movsxd r10, r1d |

711 |
lea r11, [r10*3] |

712 |
lea r6, [r0-4] |

713 |
lea r5, [r0-4+r11] |

714 |
sub rsp, 0x88 |

715 |
%define pix_tmp rsp |

716 | |

717 |
; transpose 8x16 -> tmp space |

718 |
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) |

719 |
lea r6, [r6+r10*8] |

720 |
lea r5, [r5+r10*8] |

721 |
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) |

722 | |

723 |
lea r0, [pix_tmp+0x40] |

724 |
mov r1, 0x10 |

725 |
call deblock_v_luma_intra_8_%1 |

726 | |

727 |
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) |

728 |
lea r5, [r6+r11] |

729 |
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11) |

730 |
shl r10, 3 |

731 |
sub r6, r10 |

732 |
sub r5, r10 |

733 |
shr r10, 3 |

734 |
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11) |

735 |
add rsp, 0x88 |

736 |
RET |

737 |
%else |

738 |
cglobal deblock_h_luma_intra_8_%1, 2,4 |

739 |
lea r3, [r1*3] |

740 |
sub r0, 4 |

741 |
lea r2, [r0+r3] |

742 |
%assign pad 0x8c-(stack_offset&15) |

743 |
SUB rsp, pad |

744 |
%define pix_tmp rsp |

745 | |

746 |
; transpose 8x16 -> tmp space |

747 |
TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) |

748 |
lea r0, [r0+r1*8] |

749 |
lea r2, [r2+r1*8] |

750 |
TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) |

751 | |

752 |
lea r0, [pix_tmp+0x40] |

753 |
PUSH dword r3m |

754 |
PUSH dword r2m |

755 |
PUSH dword 16 |

756 |
PUSH r0 |

757 |
call deblock_%2_luma_intra_8_%1 |

758 |
%ifidn %2, v8 |

759 |
add dword [rsp], 8 ; pix_tmp+8 |

760 |
call deblock_%2_luma_intra_8_%1 |

761 |
%endif |

762 |
ADD esp, 16 |

763 | |

764 |
mov r1, r1m |

765 |
mov r0, r0mp |

766 |
lea r3, [r1*3] |

767 |
sub r0, 4 |

768 |
lea r2, [r0+r3] |

769 |
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) |

770 |
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) |

771 |
lea r0, [r0+r1*8] |

772 |
lea r2, [r2+r1*8] |

773 |
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) |

774 |
ADD rsp, pad |

775 |
RET |

776 |
%endif ; ARCH_X86_64 |

777 |
%endmacro ; DEBLOCK_LUMA_INTRA |

778 | |

779 |
INIT_XMM |

780 |
DEBLOCK_LUMA_INTRA sse2, v |

781 |
INIT_AVX |

782 |
DEBLOCK_LUMA_INTRA avx , v |

783 |
%ifndef ARCH_X86_64 |

784 |
INIT_MMX |

785 |
DEBLOCK_LUMA_INTRA mmxext, v8 |

786 |
%endif |

787 | |

788 |
INIT_MMX |

789 | |

790 |
%macro CHROMA_V_START 0 |

791 |
dec r2d ; alpha-1 |

792 |
dec r3d ; beta-1 |

793 |
mov t5, r0 |

794 |
sub t5, r1 |

795 |
sub t5, r1 |

796 |
%endmacro |

797 | |

798 |
%macro CHROMA_H_START 0 |

799 |
dec r2d |

800 |
dec r3d |

801 |
sub r0, 2 |

802 |
lea t6, [r1*3] |

803 |
mov t5, r0 |

804 |
add r0, t6 |

805 |
%endmacro |

806 | |

807 |
%define t5 r5 |

808 |
%define t6 r6 |

809 | |

810 |
;----------------------------------------------------------------------------- |

811 |
; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |

812 |
;----------------------------------------------------------------------------- |

813 |
cglobal deblock_v_chroma_8_mmxext, 5,6 |

814 |
CHROMA_V_START |

815 |
movq m0, [t5] |

816 |
movq m1, [t5+r1] |

817 |
movq m2, [r0] |

818 |
movq m3, [r0+r1] |

819 |
call ff_chroma_inter_body_mmxext |

820 |
movq [t5+r1], m1 |

821 |
movq [r0], m2 |

822 |
RET |

823 | |

824 |
;----------------------------------------------------------------------------- |

825 |
; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |

826 |
;----------------------------------------------------------------------------- |

827 |
cglobal deblock_h_chroma_8_mmxext, 5,7 |

828 |
%ifdef ARCH_X86_64 |

829 |
%define buf0 [rsp-24] |

830 |
%define buf1 [rsp-16] |

831 |
%else |

832 |
%define buf0 r0m |

833 |
%define buf1 r2m |

834 |
%endif |

835 |
CHROMA_H_START |

836 |
TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) |

837 |
movq buf0, m0 |

838 |
movq buf1, m3 |

839 |
call ff_chroma_inter_body_mmxext |

840 |
movq m0, buf0 |

841 |
movq m3, buf1 |

842 |
TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) |

843 |
RET |

844 | |

845 |
ALIGN 16 |

846 |
ff_chroma_inter_body_mmxext: |

847 |
LOAD_MASK r2d, r3d |

848 |
movd m6, [r4] ; tc0 |

849 |
punpcklbw m6, m6 |

850 |
pand m7, m6 |

851 |
DEBLOCK_P0_Q0 |

852 |
ret |

853 | |

854 | |

855 | |

856 |
; in: %1=p0 %2=p1 %3=q1 |

857 |
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2 |

858 |
%macro CHROMA_INTRA_P0 3 |

859 |
movq m4, %1 |

860 |
pxor m4, %3 |

861 |
pand m4, [pb_1] ; m4 = (p0^q1)&1 |

862 |
pavgb %1, %3 |

863 |
psubusb %1, m4 |

864 |
pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) |

865 |
%endmacro |

866 | |

867 |
%define t5 r4 |

868 |
%define t6 r5 |

869 | |

870 |
;----------------------------------------------------------------------------- |

871 |
; void ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) |

872 |
;----------------------------------------------------------------------------- |

873 |
cglobal deblock_v_chroma_intra_8_mmxext, 4,5 |

874 |
CHROMA_V_START |

875 |
movq m0, [t5] |

876 |
movq m1, [t5+r1] |

877 |
movq m2, [r0] |

878 |
movq m3, [r0+r1] |

879 |
call ff_chroma_intra_body_mmxext |

880 |
movq [t5+r1], m1 |

881 |
movq [r0], m2 |

882 |
RET |

883 | |

884 |
;----------------------------------------------------------------------------- |

885 |
; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) |

886 |
;----------------------------------------------------------------------------- |

887 |
cglobal deblock_h_chroma_intra_8_mmxext, 4,6 |

888 |
CHROMA_H_START |

889 |
TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) |

890 |
call ff_chroma_intra_body_mmxext |

891 |
TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) |

892 |
RET |

893 | |

894 |
ALIGN 16 |

895 |
ff_chroma_intra_body_mmxext: |

896 |
LOAD_MASK r2d, r3d |

897 |
movq m5, m1 |

898 |
movq m6, m2 |

899 |
CHROMA_INTRA_P0 m1, m0, m3 |

900 |
CHROMA_INTRA_P0 m2, m3, m0 |

901 |
psubb m1, m5 |

902 |
psubb m2, m6 |

903 |
pand m1, m7 |

904 |
pand m2, m7 |

905 |
paddb m1, m5 |

906 |
paddb m2, m6 |

907 |
ret |