## ffmpeg / libavcodec / x86 / h264_deblock.asm @ 98c6053c

History | View | Annotate | Download (22.4 KB)

1 |
;***************************************************************************** |
---|---|

2 |
;* MMX/SSE2-optimized H.264 deblocking code |

3 |
;***************************************************************************** |

4 |
;* Copyright (C) 2005-2008 x264 project |

5 |
;* |

6 |
;* Authors: Loren Merritt <lorenm@u.washington.edu> |

7 |
;* Jason Garrett-Glaser <darkshikari@gmail.com> |

8 |
;* |

9 |
;* This file is part of FFmpeg. |

10 |
;* |

11 |
;* FFmpeg is free software; you can redistribute it and/or |

12 |
;* modify it under the terms of the GNU Lesser General Public |

13 |
;* License as published by the Free Software Foundation; either |

14 |
;* version 2.1 of the License, or (at your option) any later version. |

15 |
;* |

16 |
;* FFmpeg is distributed in the hope that it will be useful, |

17 |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |

18 |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |

19 |
;* Lesser General Public License for more details. |

20 |
;* |

21 |
;* You should have received a copy of the GNU Lesser General Public |

22 |
;* License along with FFmpeg; if not, write to the Free Software |

23 |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |

24 |
;****************************************************************************** |

25 | |

26 |
%include "x86inc.asm" |

27 |
%include "x86util.asm" |

28 | |

29 |
SECTION_RODATA |

30 | |

31 |
cextern pb_0 |

32 |
cextern pb_1 |

33 |
cextern pb_3 |

34 |
cextern pb_A1 |

35 | |

36 |
SECTION .text |

37 | |

; Expands to eight memory operands, one per row:
;   [base], [base+stride], ..., [base+7*stride]
; The caller passes base3 = base+3*stride and stride3 = 3*stride, so rows 3..7
; are addressed relative to base3.
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]

42 | |

; Load eight 4-byte rows and transpose them into four 8-byte rows.
; in:  %1..%8 = 8 rows of 4 bytes
; out: m0..m3 = 4 rows of 8 bytes
; clobbers: m4-m7
%macro TRANSPOSE4x8_LOAD 8
    ; rows 0-3: interleave bytes, then words
    movd       m0, %1
    movd       m2, %2
    movd       m1, %3
    movd       m3, %4
    punpcklbw  m0, m2
    punpcklbw  m1, m3
    movq       m2, m0
    punpcklwd  m0, m1
    punpckhwd  m2, m1

    ; rows 4-7: same pattern into m4/m6
    movd       m4, %5
    movd       m6, %6
    movd       m5, %7
    movd       m7, %8
    punpcklbw  m4, m6
    punpcklbw  m5, m7
    movq       m6, m4
    punpcklwd  m4, m5
    punpckhwd  m6, m5

    ; combine the two halves dword-wise into the four output rows
    movq       m1, m0
    movq       m3, m2
    punpckldq  m0, m4
    punpckhdq  m1, m4
    punpckldq  m2, m6
    punpckhdq  m3, m6
%endmacro

73 | |

; Transpose four 8-byte rows and store them as eight 4-byte rows.
; in:  m0..m3 = 4 rows of 8 bytes
; out: %1..%8 = 8 rows of 4 bytes
; clobbers: m0-m6
%macro TRANSPOSE8x4_STORE 8
    ; keep the high dwords of m0..m2 for output rows 4-7
    movq       m4, m0
    movq       m5, m1
    movq       m6, m2
    punpckhdq  m4, m4
    punpckhdq  m5, m5
    punpckhdq  m6, m6

    ; low halves -> output rows 0-3
    punpcklbw  m0, m1
    punpcklbw  m2, m3
    movq       m1, m0
    punpcklwd  m0, m2
    punpckhwd  m1, m2
    movd       %1, m0
    punpckhdq  m0, m0
    movd       %2, m0
    movd       %3, m1
    punpckhdq  m1, m1
    movd       %4, m1

    ; high halves -> output rows 4-7
    punpckhdq  m3, m3
    punpcklbw  m4, m5
    punpcklbw  m6, m3
    movq       m5, m4
    punpcklwd  m4, m6
    punpckhwd  m5, m6
    movd       %5, m4
    punpckhdq  m4, m4
    movd       %6, m4
    movd       %7, m5
    punpckhdq  m5, m5
    movd       %8, m5
%endmacro

109 | |

; Three-operand interleave step.
; %1 = punpck element suffix (bw, wd or dq)
; %2 = first source; receives the low interleave  punpckl%1(%2, %3)
; %3 = second source (register or memory), left unchanged
; %4 = receives the high interleave punpckh%1(original %2, %3)
%macro SBUTTERFLY3 4
    movq       %4, %2
    punpckl%1  %2, %3
    punpckh%1  %4, %3
%endmacro

115 | |

; in:  8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
; clobbers: m0-m7
; [%9+0x10] is also used as a temporary spill slot before its final value is
; written.
%macro TRANSPOSE6x8_MEM 9
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY3 bw, m0, m1, m7
    SBUTTERFLY3 bw, m2, m3, m1
    SBUTTERFLY3 bw, m4, m5, m3
    movq  [%9+0x10], m1            ; spill: all eight mm registers are live
    SBUTTERFLY3 bw, m6, %8, m5
    SBUTTERFLY3 wd, m0, m2, m1
    SBUTTERFLY3 wd, m4, m6, m2
    punpckhdq m0, m4
    movq  [%9+0x00], m0
    SBUTTERFLY3 wd, m7, [%9+0x10], m6  ; reload the spilled value as a memory operand
    SBUTTERFLY3 wd, m3, m5, m4
    SBUTTERFLY3 dq, m7, m3, m0
    SBUTTERFLY3 dq, m1, m2, m5
    punpckldq m6, m4
    movq  [%9+0x10], m1
    movq  [%9+0x20], m5
    movq  [%9+0x30], m7
    movq  [%9+0x40], m0
    movq  [%9+0x50], m6
%endmacro

146 | |

; in:  8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
; clobbers: m0-m7
; %9 and %11 double as temporary spill slots; they are first written only after
; all of %1..%8 have been read.
%macro TRANSPOSE8x8_MEM 16
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY3 bw, m0, m1, m7
    SBUTTERFLY3 bw, m2, m3, m1
    SBUTTERFLY3 bw, m4, m5, m3
    SBUTTERFLY3 bw, m6, %8, m5
    movq  %9,  m3                  ; spill
    SBUTTERFLY3 wd, m0, m2, m3
    SBUTTERFLY3 wd, m4, m6, m2
    SBUTTERFLY3 wd, m7, m1, m6
    movq  %11, m2                  ; spill
    movq  m2,  %9                  ; reload first spill
    SBUTTERFLY3 wd, m2, m5, m1
    SBUTTERFLY3 dq, m0, m4, m5
    SBUTTERFLY3 dq, m7, m2, m4
    movq  %9,  m0
    movq  %10, m5
    movq  %13, m7
    movq  %14, m4
    SBUTTERFLY3 dq, m3, %11, m0    ; second spill consumed as a memory operand
    SBUTTERFLY3 dq, m6, m1, m5
    movq  %11, m3
    movq  %12, m0
    movq  %15, m6
    movq  %16, m5
%endmacro

181 | |

; Per-byte unsigned "absolute difference exceeds limit" test.
; out: %4 = nonzero where |%1-%2| > %3, zero elsewhere
;      (the value is the saturated excess |%1-%2|-%3, not a 0/0xff mask)
; clobbers: %5
%macro DIFF_GT 5
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1      ; max(%2-%1, 0)
    psubusb %4, %2      ; max(%1-%2, 0)
    por     %4, %5      ; |%1-%2| (one of the two halves is always zero)
    psubusb %4, %3
%endmacro

192 | |

; Per-byte unsigned absolute-difference test producing a full byte mask.
; out: %4 = 0xff where |%1-%2| <= %3, 0x00 where |%1-%2| > %3
;      (i.e. the mask is set where the difference does NOT exceed the limit)
; clobbers: %5
%macro DIFF_GT2 5
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1      ; max(%2-%1, 0)
    psubusb %4, %2      ; max(%1-%2, 0); at most one of %4/%5 is nonzero
    psubusb %5, %3
    psubusb %4, %3
    pcmpeqb %4, %5      ; equal only when both saturated to zero
%endmacro

204 | |

; Broadcast the low word of %1 to every word of %1.
; Chooses the XMM or MMX instruction sequence depending on whether m0 is
; currently mapped to xmm0.
%macro SPLATW 1
%ifidn m0, xmm0
    pshuflw  %1, %1, 0
    punpcklqdq %1, %1
%else
    pshufw   %1, %1, 0
%endif
%endmacro

213 | |

; Build the per-pixel "filter this edge" mask.
; in:  m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1 (broadcast to bytes), %3=alpha-1 (broadcast, only if given),
;      m7=mask: 0xff where |p0-q0| <= alpha-1, |p1-p0| <= beta-1 and
;               |q1-q0| <= beta-1 all hold, 0x00 otherwise
; clobbers: m4,m6
%macro LOAD_MASK 2-3
    movd     m4, %1
    movd     m5, %2
    SPLATW   m4
    SPLATW   m5
    packuswb m4, m4  ; 16x alpha-1
    packuswb m5, m5  ; 16x beta-1
%if %0>2
    mova     %3, m4
%endif
    DIFF_GT  m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT  m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    por      m7, m4
    DIFF_GT  m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
    por      m7, m4
    pxor     m6, m6
    pcmpeqb  m7, m6             ; 0xff where none of the three tests fired
%endmacro

235 | |

; Apply the clipped delta to p0 and q0.
; in:  m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
; Reads the external constants pb_1, pb_3 and pb_A1.
%macro DEBLOCK_P0_Q0 0
    mova    m5, m1
    pxor    m5, m2           ; p0^q0
    pand    m5, [pb_1]       ; (p0^q0)&1
    pcmpeqb m4, m4
    pxor    m3, m4
    pavgb   m3, m0           ; (p1 - q1 + 256)>>1
    pavgb   m3, [pb_3]       ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pxor    m4, m1
    pavgb   m4, m2           ; (q0 - p0 + 256)>>1
    pavgb   m3, m5
    paddusb m3, m4           ; d+128+33
    mova    m6, [pb_A1]
    psubusb m6, m3           ; negative part of d
    psubusb m3, [pb_A1]      ; positive part of d
    pminub  m6, m7           ; clip both parts to tc
    pminub  m3, m7
    psubusb m1, m6
    psubusb m2, m3
    paddusb m1, m3
    paddusb m2, m6
%endmacro

261 | |

; Filter the second pixel from the edge (used for both p1 and q1; the parameter
; names follow the q side).
; in:  m1=p0 m2=q0
;      %1=q1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
    mova    %6, m1
    pavgb   %6, m2
    pavgb   %2, %6     ; avg(q2,avg(p0,q0))
    pxor    %6, %3
    pand    %6, [pb_1] ; (q2^avg(p0,q0))&1
    psubusb %2, %6     ; (q2+((p0+q0+1)>>1))>>1
    mova    %6, %1
    psubusb %6, %5     ; lower clip bound: q1-tc0 (saturating)
    paddusb %5, %1     ; upper clip bound: q1+tc0 (saturating)
    pmaxub  %2, %6
    pminub  %2, %5
    mova    %4, %2
%endmacro

280 | |

281 |
%ifdef ARCH_X86_64 |

;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
; x86-64 version. On entry r0=pix, r1=stride, r2d=alpha, r3d=beta, r4=tc0.
; r4 is reused as pix-3*stride once tc0 has been loaded into m8.
; Rows are accessed with aligned loads/stores (mova/movdqa).
INIT_XMM
cglobal x264_deblock_v_luma_sse2, 5,5,10
    movd    m8, [r4] ; tc0
    lea     r4, [r1*3]
    dec     r2d        ; alpha-1
    neg     r4
    dec     r3d        ; beta-1
    add     r4, r0     ; pix-3*stride

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2d, r3d

    punpcklbw m8, m8
    punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb m9, m9
    pcmpeqb m9, m8   ; 0xff where tc byte == 0xff
    pandn   m9, m7   ; m9 = mask with the tc==0xff lanes removed
    pand    m8, m9   ; m8 = tc & mask

    movdqa  m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; m6 = 0xff where |p2-p0| <= beta-1
    pand    m6, m9
    mova    m7, m8
    psubb   m7, m6   ; tc+1 in lanes where m6 is 0xff
    pand    m6, m8
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    movdqa  m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; m6 = 0xff where |q2-q0| <= beta-1
    pand    m6, m9
    pand    m8, m6
    psubb   m7, m6   ; tc+1 again for the q side
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    RET

327 | |

;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
; x86-64 version. Transposes the pixels around the edge into a stack buffer,
; runs x264_deblock_v_luma_sse2 on that buffer (stride 16), then transposes the
; modified pixels back.
; r10 = sign-extended stride, r11 = 3*stride, r6/r5 = row pointers for PASS8ROWS.
INIT_MMX
cglobal x264_deblock_h_luma_sse2, 5,7
    movsxd r10, r1d
    lea    r11, [r10+r10*2]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r11]
%ifdef WIN64
    sub    rsp, 0x98
    %define pix_tmp rsp+0x30
%else
    sub    rsp, 0x68
    %define pix_tmp rsp
%endif

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r10, r11), pix_tmp
    lea    r6, [r6+r10*8]
    lea    r5, [r5+r10*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r10, r11), pix_tmp+8

    ; vertical filter
    ; alpha, beta, tc0 are still in r2d, r3d, r4
    ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
    lea    r0, [pix_tmp+0x30]
    mov    r1d, 0x10
%ifdef WIN64
    mov    [rsp+0x20], r4          ; tc0 is also stored to the stack for the callee
%endif
    call   x264_deblock_v_luma_sse2

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    ; r6/r5 still point at the second group of 8 rows, so that group is stored first
    add    r6, 2
    add    r5, 2
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4_STORE  PASS8ROWS(r6, r5, r10, r11)

    ; step back 8 rows (r10 is temporarily scaled to 8*stride)
    shl    r10, 3
    sub    r6,  r10
    sub    r5,  r10
    shr    r10, 3
    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4_STORE  PASS8ROWS(r6, r5, r10, r11)

%ifdef WIN64
    add    rsp, 0x98
%else
    add    rsp, 0x68
%endif
    RET

386 | |

387 |
%else |

388 | |

; Emits the x86-32 luma inter deblocking functions.
; %1 = instruction-set suffix used in the symbol names (mmxext / sse2)
; %2 = name of the vertical-filter variant (v8 / v)
; %3 = byte offset between the two stack spill slots (8 / 16); also used to
;      size the stack padding
%macro DEBLOCK_LUMA 3
;-----------------------------------------------------------------------------
; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
; Stack layout after SUB: [esp] = mask, [esp+%3] = tc.
cglobal x264_deblock_%2_luma_%1, 5,5
    lea     r4, [r1*3]
    dec     r2     ; alpha-1
    neg     r4
    dec     r3     ; beta-1
    add     r4, r0 ; pix-3*stride
    %assign pad 2*%3+12-(stack_offset&15)
    SUB     esp, pad

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2, r3

    mov     r3, r4mp      ; r4 was overwritten above; reload the tc0 argument
    movd    m4, [r3] ; tc0
    punpcklbw m4, m4
    punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova   [esp+%3], m4 ; tc
    pcmpeqb m3, m3
    pcmpgtb m4, m3   ; 0xff where tc > -1 (signed)
    pand    m4, m7
    mova   [esp], m4 ; mask

    mova    m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; m6 = 0xff where |p2-p0| <= beta-1
    pand    m6, m4
    pand    m4, [esp+%3] ; tc
    mova    m7, m4
    psubb   m7, m6   ; tc+1 in lanes where m6 is 0xff
    pand    m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    mova    m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; m6 = 0xff where |q2-q0| <= beta-1
    mova    m5, [esp] ; mask
    pand    m6, m5
    mova    m5, [esp+%3] ; tc
    pand    m5, m6
    psubb   m7, m6   ; tc+1 again for the q side
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    ADD     esp, pad
    RET

;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
; Transposes the pixels around the edge into a stack buffer, runs the vertical
; filter above on it (twice for the v8 variant), then transposes the result back.
INIT_MMX
cglobal x264_deblock_h_luma_%1, 0,5
    mov    r0, r0mp
    mov    r3, r1m
    lea    r4, [r3*3]
    sub    r0, 4
    lea    r1, [r0+r4]
    %assign pad 0x78-(stack_offset&15)
    SUB    esp, pad
%define pix_tmp esp+12

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter
    ; arguments are pushed last-to-first: tc0, beta, alpha, stride=16, pix
    lea    r0, [pix_tmp+0x30]
    PUSH   dword r4m
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   dword r0
    call   x264_deblock_%2_luma_%1
%ifidn %2, v8
    ; second call with the pushed pix and tc0 arguments advanced in place
    add    dword [esp   ], 8 ; pix_tmp+0x38
    add    dword [esp+16], 2 ; tc0+2
    call   x264_deblock_%2_luma_%1
%endif
    ADD    esp, 20

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    mov    r0, r0mp
    sub    r0, 2
    lea    r1, [r0+r4]

    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4_STORE  PASS8ROWS(r0, r1, r3, r4)

    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4_STORE  PASS8ROWS(r0, r1, r3, r4)

    ADD    esp, pad
    RET
%endmacro ; DEBLOCK_LUMA

500 | |

; Instantiate the x86-32 luma filters: an MMX build whose vertical filter is
; named v8, and an XMM build whose vertical filter is named v.
INIT_MMX
DEBLOCK_LUMA mmxext, v8, 8
INIT_XMM
DEBLOCK_LUMA sse2, v, 16

505 | |

506 |
%endif ; ARCH |

507 | |

508 | |

509 | |

; Intra luma filter for one side of the edge: computes and stores p0', p1', p2'.
; %1 = store location for p0, %2 = for p1, %3 = for p2, %4 = load location of p3
; Uses the symbols p0, p1, p2, q0, q1, t0-t5, mask0, mask1p, mpb_0 and mpb_1,
; which must be %define'd by the enclosing macro. After LUMA_INTRA_SWAP_PQ the
; same code filters the q side.
%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
    mova  t0, p2
    mova  t1, p0
    pavgb t0, p1
    pavgb t1, q0
    pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova  t5, t1
    mova  t2, p2
    mova  t3, p0
    paddb t2, p1
    paddb t3, q0
    paddb t2, t3
    mova  t3, t2
    mova  t4, t2
    psrlw t2, 1
    pavgb t2, mpb_0
    pxor  t2, t0
    pand  t2, mpb_1
    psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;

    mova  t1, p2
    mova  t2, p2
    pavgb t1, q1
    psubb t2, q1
    paddb t3, t3
    psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pand  t2, mpb_1
    psubb t1, t2
    pavgb t1, p1
    pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw t3, 2
    pavgb t3, mpb_0
    pxor  t3, t1
    pand  t3, mpb_1
    psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    mova  t3, p0
    mova  t2, p0
    pxor  t3, q1
    pavgb t2, q1
    pand  t3, mpb_1
    psubb t2, t3
    pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4

    ; select per byte between p0'a, p0'b and the unfiltered p0 via xor/and blending
    pxor  t1, t2
    pxor  t2, p0
    pand  t1, mask1p
    pand  t2, mask0
    pxor  t1, t2
    pxor  t1, p0
    mova  %1, t1 ; store p0

    mova  t1, %4 ; p3
    mova  t2, t1
    pavgb t1, p2
    paddb t2, p2
    pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb t2, t2
    paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw t2, 2
    pavgb t2, mpb_0
    pxor  t2, t1
    pand  t2, mpb_1
    psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    ; keep the filtered p1/p2 only where mask1p is set
    pxor  t0, p1
    pxor  t1, p2
    pand  t0, mask1p
    pand  t1, mask1p
    pxor  t0, p1
    pxor  t1, p2
    mova  %2, t0 ; store p1
    mova  %3, t1 ; store p2
%endmacro

584 | |

; Re-alias the p/q symbols so that a following LUMA_INTRA_P012 operates on the
; q side of the edge: p<->q register names are exchanged, p2 becomes q2 and
; mask1p becomes mask1q.
%macro LUMA_INTRA_SWAP_PQ 0
    %define q1 m0
    %define q0 m1
    %define p0 m2
    %define p1 m3
    %define p2 q2
    %define mask1p mask1q
%endmacro

593 | |

; Emits the luma intra deblocking functions.
; %1 = instruction-set suffix used in the symbol names (sse2 / mmxext)
; %2 = name of the vertical-filter variant (v / v8)
; The %defines below name the registers and spill slots used by
; LUMA_INTRA_P012; on x86-64 everything except mask1q lives in registers, on
; x86-32 p2/q2 are read from memory and the rest is spilled to the stack.
%macro DEBLOCK_LUMA_INTRA 2
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
%ifdef ARCH_X86_64
    %define p2 m8
    %define q2 m9
    %define t4 m10
    %define t5 m11
    %define mask0 m12
    %define mask1p m13
    %define mask1q [rsp-24]
    %define mpb_0 m14
    %define mpb_1 m15
%else
    %define spill(x) [esp+16*x+((stack_offset+4)&15)]
    %define p2 [r4+r1]
    %define q2 [r0+2*r1]
    %define t4 spill(0)
    %define t5 spill(1)
    %define mask0 spill(2)
    %define mask1p spill(3)
    %define mask1q spill(4)
    %define mpb_0 [pb_0]
    %define mpb_1 [pb_1]
%endif

;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
; Jumps straight to .end (no filtering) when alpha-1 or beta-1 is negative.
cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
%ifndef ARCH_X86_64
    sub     esp, 0x60
%endif
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    dec     r2d        ; alpha-1
    jl .end
    neg     r4
    dec     r3d        ; beta-1
    jl .end
    add     r4, r0     ; pix-4*stride
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
%ifdef ARCH_X86_64
    pxor    mpb_0, mpb_0
    mova    mpb_1, [pb_1]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP    7, 12 ; m12=mask0
    pavgb   t5, mpb_0
    pavgb   t5, mpb_1 ; alpha/4+1
    movdqa  p2, [r4+r1]
    movdqa  q2, [r0+2*r1]
    ; DIFF_GT2 yields 0xff where the difference does NOT exceed the limit
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = 0xff where |p0-q0| <= alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; t2 = 0xff where |p2-p0| <= beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = 0xff where |q2-q0| <= beta-1
    pand    t0, mask0
    pand    t4, t0
    pand    t2, t0
    mova    mask1q, t4
    mova    mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova    m4, t5
    mova    mask0, m7
    pavgb   m4, [pb_0]
    pavgb   m4, [pb_1] ; alpha/4+1
    ; DIFF_GT2 yields 0xff where the difference does NOT exceed the limit
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = 0xff where |p0-q0| <= alpha/4+1
    pand    m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = 0xff where |p2-p0| <= beta-1
    pand    m4, m6
    mova    mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = 0xff where |q2-q0| <= beta-1
    pand    m4, m6
    mova    mask1q, m4
%endif
    ; filter the p side, then swap the p/q aliases and filter the q side
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
%ifndef ARCH_X86_64
    add     esp, 0x60
%endif
    RET

INIT_MMX
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
; Transposes 16 rows of 8 pixels around the edge into a stack buffer, runs the
; vertical intra filter on it (stride 16), then transposes everything back.
cglobal x264_deblock_h_luma_intra_%1, 4,7
    movsxd r10, r1d
    lea    r11, [r10*3]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r11]
    sub    rsp, 0x88
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r6, [r6+r10*8]
    lea    r5, [r5+r10*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    mov    r1,  0x10
    call   x264_deblock_v_luma_intra_%1

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    ; r5 is recomputed because the callee overwrites it; r6 still points at the
    ; second group of 8 rows, so that group is written first
    lea    r5, [r6+r11]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
    ; step back 8 rows (r10 is temporarily scaled to 8*stride)
    shl    r10, 3
    sub    r6,  r10
    sub    r5,  r10
    shr    r10, 3
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
    add    rsp, 0x88
    RET
%else
; x86-32 version of the horizontal intra filter; same transpose / filter /
; transpose-back scheme, with the vertical filter called twice for the v8 variant.
cglobal x264_deblock_h_luma_intra_%1, 2,4
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
%assign pad 0x8c-(stack_offset&15)
    SUB    rsp, pad
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    ; arguments are pushed last-to-first: beta, alpha, stride=16, pix
    lea    r0,  [pix_tmp+0x40]
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   r0
    call   x264_deblock_%2_luma_intra_%1
%ifidn %2, v8
    ; second call with the pushed pix argument advanced by 8 bytes
    add    dword [rsp], 8 ; pix_tmp+8
    call   x264_deblock_%2_luma_intra_%1
%endif
    ADD    esp, 16

    ; reload the original arguments and rebuild the row pointers
    mov    r1,  r1m
    mov    r0,  r0mp
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    ADD    rsp, pad
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

760 | |

; Instantiate the luma intra filters: the XMM build always, plus an MMX build
; (vertical filter named v8) on x86-32 only.
INIT_XMM
DEBLOCK_LUMA_INTRA sse2, v
%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_LUMA_INTRA mmxext, v8
%endif

767 | |

768 | |

769 | |

; The chroma filters below are assembled with the MMX register mapping.
INIT_MMX

; Common prologue for the vertical chroma filters.
; in:  r0=pix r1=stride r2d=alpha r3d=beta
; out: r2d=alpha-1 r3d=beta-1 t5=pix-2*stride
%macro CHROMA_V_START 0
    dec    r2d      ; alpha-1
    dec    r3d      ; beta-1
    mov    t5, r0
    sub    t5, r1
    sub    t5, r1
%endmacro

; Common prologue for the horizontal chroma filters; sets up the operands
; expected by PASS8ROWS(t5, r0, r1, t6).
; in:  r0=pix r1=stride r2d=alpha r3d=beta
; out: r2d=alpha-1 r3d=beta-1 t5=pix-2 t6=3*stride r0=pix-2+3*stride
%macro CHROMA_H_START 0
    dec    r2d
    dec    r3d
    sub    r0, 2
    lea    t6, [r1*3]
    mov    t5, r0
    add    r0, t6
%endmacro

; Temporaries for the inter chroma functions (r4 holds tc0 there).
%define t5 r5
%define t6 r6

791 | |

;-----------------------------------------------------------------------------
; void x264_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
; Loads p1,p0,q0,q1 (8 bytes each) around the edge, filters them through the
; shared inter body and writes back p0 and q0.
cglobal x264_deblock_v_chroma_mmxext, 5,6
    CHROMA_V_START
    movq  m0, [t5]        ; p1
    movq  m1, [t5+r1]     ; p0
    movq  m2, [r0]        ; q0
    movq  m3, [r0+r1]     ; q1
    call x264_chroma_inter_body_mmxext
    movq  [t5+r1], m1
    movq  [r0], m2
    RET

805 | |

;-----------------------------------------------------------------------------
; void x264_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
; Transposes 8 rows of 4 pixels into m0..m3, filters them through the shared
; inter body and transposes them back. m0 (p1) and m3 (q1) are saved in
; buf0/buf1 across the call because the body clobbers them and the store
; transpose needs all four rows.
cglobal x264_deblock_h_chroma_mmxext, 5,7
%ifdef ARCH_X86_64
    %define buf0 [rsp-24]
    %define buf1 [rsp-16]
%else
    %define buf0 r0m
    %define buf1 r2m
%endif
    CHROMA_H_START
    TRANSPOSE4x8_LOAD  PASS8ROWS(t5, r0, r1, t6)
    movq  buf0, m0
    movq  buf1, m3
    call x264_chroma_inter_body_mmxext
    movq  m0, buf0
    movq  m3, buf1
    TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
    RET

826 | |

; Shared body of the inter chroma filters (reached with a plain call/ret).
; in:  m0=p1 m1=p0 m2=q0 m3=q1 r2d=alpha-1 r3d=beta-1 r4=tc0
; out: m1=p0' m2=q0'
; clobbers: m0, m3-m7
ALIGN 16
x264_chroma_inter_body_mmxext:
    LOAD_MASK  r2d, r3d
    movd       m6, [r4] ; tc0
    punpcklbw  m6, m6   ; each tc0 byte duplicated for two adjacent pixels
    pand       m7, m6   ; tc & mask
    DEBLOCK_P0_Q0
    ret

835 | |

836 | |

837 | |

; in:  %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2   (result left in %1)
; clobbers: m4
%macro CHROMA_INTRA_P0 3
    movq    m4, %1
    pxor    m4, %3
    pand    m4, [pb_1] ; m4 = (p0^q1)&1
    pavgb   %1, %3
    psubusb %1, m4     ; remove pavgb's round-up so the final rounding is exact
    pavgb   %1, %2             ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro

; Temporaries for the intra chroma functions (no tc0 argument, so r4 is free).
%define t5 r4
%define t6 r5

851 | |

;-----------------------------------------------------------------------------
; void x264_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
; Loads p1,p0,q0,q1 (8 bytes each) around the edge, filters them through the
; shared intra body and writes back p0 and q0.
cglobal x264_deblock_v_chroma_intra_mmxext, 4,5
    CHROMA_V_START
    movq  m0, [t5]        ; p1
    movq  m1, [t5+r1]     ; p0
    movq  m2, [r0]        ; q0
    movq  m3, [r0+r1]     ; q1
    call x264_chroma_intra_body_mmxext
    movq  [t5+r1], m1
    movq  [r0], m2
    RET

865 | |

;-----------------------------------------------------------------------------
; void x264_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
; Transposes 8 rows of 4 pixels into m0..m3, filters them through the shared
; intra body and transposes them back. Unlike the inter variant, m0 and m3 are
; not saved here: the intra body leaves them unmodified.
cglobal x264_deblock_h_chroma_intra_mmxext, 4,6
    CHROMA_H_START
    TRANSPOSE4x8_LOAD  PASS8ROWS(t5, r0, r1, t6)
    call x264_chroma_intra_body_mmxext
    TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
    RET

875 | |

; Shared body of the intra chroma filters (reached with a plain call/ret).
; in:  m0=p1 m1=p0 m2=q0 m3=q1 r2d=alpha-1 r3d=beta-1
; out: m1=p0' m2=q0' (filtered where the mask is set, unchanged elsewhere)
; clobbers: m4-m7
ALIGN 16
x264_chroma_intra_body_mmxext:
    LOAD_MASK r2d, r3d
    movq   m5, m1      ; keep the unfiltered p0
    movq   m6, m2      ; keep the unfiltered q0
    CHROMA_INTRA_P0  m1, m0, m3
    CHROMA_INTRA_P0  m2, m3, m0
    ; blend: result = old + ((new - old) & mask)
    psubb  m1, m5
    psubb  m2, m6
    pand   m1, m7
    pand   m2, m7
    paddb  m1, m5
    paddb  m2, m6
    ret