## ffmpeg / libavcodec / x86 / vp8dsp.asm @ 268821e7


1 |
;****************************************************************************** |

2 |
;* VP8 MMXEXT optimizations |

3 |
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> |

4 |
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> |

5 |
;* |

6 |
;* This file is part of FFmpeg. |

7 |
;* |

8 |
;* FFmpeg is free software; you can redistribute it and/or |

9 |
;* modify it under the terms of the GNU Lesser General Public |

10 |
;* License as published by the Free Software Foundation; either |

11 |
;* version 2.1 of the License, or (at your option) any later version. |

12 |
;* |

13 |
;* FFmpeg is distributed in the hope that it will be useful, |

14 |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |

15 |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |

16 |
;* Lesser General Public License for more details. |

17 |
;* |

18 |
;* You should have received a copy of the GNU Lesser General Public |

19 |
;* License along with FFmpeg; if not, write to the Free Software |

20 |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |

21 |
;****************************************************************************** |

22 | |

23 |
%include "x86inc.asm" |

24 |
%include "x86util.asm" |

25 | |

26 |
SECTION_RODATA |

27 | |

28 |
fourtap_filter_hw_m: times 4 dw -6, 123 |

29 |
times 4 dw 12, -1 |

30 |
times 4 dw -9, 93 |

31 |
times 4 dw 50, -6 |

32 |
times 4 dw -6, 50 |

33 |
times 4 dw 93, -9 |

34 |
times 4 dw -1, 12 |

35 |
times 4 dw 123, -6 |

36 | |

37 |
sixtap_filter_hw_m: times 4 dw 2, -11 |

38 |
times 4 dw 108, 36 |

39 |
times 4 dw -8, 1 |

40 |
times 4 dw 3, -16 |

41 |
times 4 dw 77, 77 |

42 |
times 4 dw -16, 3 |

43 |
times 4 dw 1, -8 |

44 |
times 4 dw 36, 108 |

45 |
times 4 dw -11, 2 |

46 | |

47 |
fourtap_filter_hb_m: times 8 db -6, 123 |

48 |
times 8 db 12, -1 |

49 |
times 8 db -9, 93 |

50 |
times 8 db 50, -6 |

51 |
times 8 db -6, 50 |

52 |
times 8 db 93, -9 |

53 |
times 8 db -1, 12 |

54 |
times 8 db 123, -6 |

55 | |

56 |
sixtap_filter_hb_m: times 8 db 2, 1 |

57 |
times 8 db -11, 108 |

58 |
times 8 db 36, -8 |

59 |
times 8 db 3, 3 |

60 |
times 8 db -16, 77 |

61 |
times 8 db 77, -16 |

62 |
times 8 db 1, 2 |

63 |
times 8 db -8, 36 |

64 |
times 8 db 108, -11 |

65 | |

66 |
fourtap_filter_v_m: times 8 dw -6 |

67 |
times 8 dw 123 |

68 |
times 8 dw 12 |

69 |
times 8 dw -1 |

70 |
times 8 dw -9 |

71 |
times 8 dw 93 |

72 |
times 8 dw 50 |

73 |
times 8 dw -6 |

74 |
times 8 dw -6 |

75 |
times 8 dw 50 |

76 |
times 8 dw 93 |

77 |
times 8 dw -9 |

78 |
times 8 dw -1 |

79 |
times 8 dw 12 |

80 |
times 8 dw 123 |

81 |
times 8 dw -6 |

82 | |

83 |
sixtap_filter_v_m: times 8 dw 2 |

84 |
times 8 dw -11 |

85 |
times 8 dw 108 |

86 |
times 8 dw 36 |

87 |
times 8 dw -8 |

88 |
times 8 dw 1 |

89 |
times 8 dw 3 |

90 |
times 8 dw -16 |

91 |
times 8 dw 77 |

92 |
times 8 dw 77 |

93 |
times 8 dw -16 |

94 |
times 8 dw 3 |

95 |
times 8 dw 1 |

96 |
times 8 dw -8 |

97 |
times 8 dw 36 |

98 |
times 8 dw 108 |

99 |
times 8 dw -11 |

100 |
times 8 dw 2 |

101 | |

102 |
bilinear_filter_vw_m: times 8 dw 1 |

103 |
times 8 dw 2 |

104 |
times 8 dw 3 |

105 |
times 8 dw 4 |

106 |
times 8 dw 5 |

107 |
times 8 dw 6 |

108 |
times 8 dw 7 |

109 | |

110 |
bilinear_filter_vb_m: times 8 db 7, 1 |

111 |
times 8 db 6, 2 |

112 |
times 8 db 5, 3 |

113 |
times 8 db 4, 4 |

114 |
times 8 db 3, 5 |

115 |
times 8 db 2, 6 |

116 |
times 8 db 1, 7 |

117 | |

118 |
%ifdef PIC |

119 |
%define fourtap_filter_hw r11 |

120 |
%define sixtap_filter_hw r11 |

121 |
%define fourtap_filter_hb r11 |

122 |
%define sixtap_filter_hb r11 |

123 |
%define fourtap_filter_v r11 |

124 |
%define sixtap_filter_v r11 |

125 |
%define bilinear_filter_vw r11 |

126 |
%define bilinear_filter_vb r11 |

127 |
%else |

128 |
%define fourtap_filter_hw fourtap_filter_hw_m |

129 |
%define sixtap_filter_hw sixtap_filter_hw_m |

130 |
%define fourtap_filter_hb fourtap_filter_hb_m |

131 |
%define sixtap_filter_hb sixtap_filter_hb_m |

132 |
%define fourtap_filter_v fourtap_filter_v_m |

133 |
%define sixtap_filter_v sixtap_filter_v_m |

134 |
%define bilinear_filter_vw bilinear_filter_vw_m |

135 |
%define bilinear_filter_vb bilinear_filter_vb_m |

136 |
%endif |

137 | |

138 |
filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |

139 |
filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 |

140 | |

141 |
filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 |

142 |
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 |

143 |
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 |

144 | |

145 |
pw_20091: times 4 dw 20091 |

146 |
pw_17734: times 4 dw 17734 |

147 | |

148 |
cextern pb_1 |

149 |
cextern pw_3 |

150 |
cextern pb_3 |

151 |
cextern pw_4 |

152 |
cextern pb_4 |

153 |
cextern pw_64 |

154 |
cextern pb_80 |

155 |
cextern pb_F8 |

156 |
cextern pb_FE |

157 | |

158 |
SECTION .text |

159 | |

160 |
;----------------------------------------------------------------------------- |

161 |
; subpel MC functions: |

162 |
; |

163 |
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride, |

164 |
; uint8_t *src, int srcstride, |

165 |
; int height, int mx, int my); |

166 |
;----------------------------------------------------------------------------- |
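; note: mx/my select one of the filter coefficient sets above; each 4-tap/6-tap
; set sums to 128, so filtered values are normalized by adding 64 (pw_64) and
; arithmetic-shifting right by 7 before being packed back to unsigned bytes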

167 | |

168 |
%macro FILTER_SSSE3 3 |

169 |
cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2 |

170 |
lea r5d, [r5*3] |

171 |
mova m3, [filter_h6_shuf2] |

172 |
mova m4, [filter_h6_shuf3] |

173 |
%ifdef PIC |

174 |
lea r11, [sixtap_filter_hb_m] |

175 |
%endif |

176 |
mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes |

177 |
mova m6, [sixtap_filter_hb+r5*8-32] |

178 |
mova m7, [sixtap_filter_hb+r5*8-16] |

179 | |

180 |
.nextrow |

181 |
movu m0, [r2-2] |

182 |
mova m1, m0 |

183 |
mova m2, m0 |

184 |
%ifidn %1, 4 |

185 |
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the |

186 |
; shuffle with a memory operand |

187 |
punpcklbw m0, [r2+3] |

188 |
%else |

189 |
pshufb m0, [filter_h6_shuf1] |

190 |
%endif |

191 |
pshufb m1, m3 |

192 |
pshufb m2, m4 |

193 |
pmaddubsw m0, m5 |

194 |
pmaddubsw m1, m6 |

195 |
pmaddubsw m2, m7 |

196 |
paddsw m0, m1 |

197 |
paddsw m0, m2 |

198 |
paddsw m0, [pw_64] |

199 |
psraw m0, 7 |

200 |
packuswb m0, m0 |

201 |
movh [r0], m0 ; store |

202 | |

203 |
; go to next line |

204 |
add r0, r1 |

205 |
add r2, r3 |

206 |
dec r4 ; next row |

207 |
jg .nextrow |

208 |
REP_RET |

209 | |

210 |
cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3 |

211 |
shl r5d, 4 |

212 |
mova m2, [pw_64] |

213 |
mova m3, [filter_h2_shuf] |

214 |
mova m4, [filter_h4_shuf] |

215 |
%ifdef PIC |

216 |
lea r11, [fourtap_filter_hb_m] |

217 |
%endif |

218 |
mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes |

219 |
mova m6, [fourtap_filter_hb+r5] |

220 | |

221 |
.nextrow |

222 |
movu m0, [r2-1] |

223 |
mova m1, m0 |

224 |
pshufb m0, m3 |

225 |
pshufb m1, m4 |

226 |
pmaddubsw m0, m5 |

227 |
pmaddubsw m1, m6 |

228 |
paddsw m0, m2 |

229 |
paddsw m0, m1 |

230 |
psraw m0, 7 |

231 |
packuswb m0, m0 |

232 |
movh [r0], m0 ; store |

233 | |

234 |
; go to next line |

235 |
add r0, r1 |

236 |
add r2, r3 |

237 |
dec r4 ; next row |

238 |
jg .nextrow |

239 |
REP_RET |

240 | |

241 |
cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2 |

242 |
shl r6d, 4 |

243 |
%ifdef PIC |

244 |
lea r11, [fourtap_filter_hb_m] |

245 |
%endif |

246 |
mova m5, [fourtap_filter_hb+r6-16] |

247 |
mova m6, [fourtap_filter_hb+r6] |

248 |
mova m7, [pw_64] |

249 | |

250 |
; read 3 lines |

251 |
sub r2, r3 |

252 |
movh m0, [r2] |

253 |
movh m1, [r2+ r3] |

254 |
movh m2, [r2+2*r3] |

255 |
add r2, r3 |

256 | |

257 |
.nextrow |

258 |
movh m3, [r2+2*r3] ; read new row |

259 |
mova m4, m0 |

260 |
mova m0, m1 |

261 |
punpcklbw m4, m1 |

262 |
mova m1, m2 |

263 |
punpcklbw m2, m3 |

264 |
pmaddubsw m4, m5 |

265 |
pmaddubsw m2, m6 |

266 |
paddsw m4, m2 |

267 |
mova m2, m3 |

268 |
paddsw m4, m7 |

269 |
psraw m4, 7 |

270 |
packuswb m4, m4 |

271 |
movh [r0], m4 |

272 | |

273 |
; go to next line |

274 |
add r0, r1 |

275 |
add r2, r3 |

276 |
dec r4 ; next row |

277 |
jg .nextrow |

278 |
REP_RET |

279 | |

280 |
cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2 |

281 |
lea r6d, [r6*3] |

282 |
%ifdef PIC |

283 |
lea r11, [sixtap_filter_hb_m] |

284 |
%endif |

285 |
lea r6, [sixtap_filter_hb+r6*8] |

286 | |

287 |
; read 5 lines |

288 |
sub r2, r3 |

289 |
sub r2, r3 |

290 |
movh m0, [r2] |

291 |
movh m1, [r2+r3] |

292 |
movh m2, [r2+r3*2] |

293 |
lea r2, [r2+r3*2] |

294 |
add r2, r3 |

295 |
movh m3, [r2] |

296 |
movh m4, [r2+r3] |

297 | |

298 |
.nextrow |

299 |
movh m5, [r2+2*r3] ; read new row |

300 |
mova m6, m0 |

301 |
punpcklbw m6, m5 |

302 |
mova m0, m1 |

303 |
punpcklbw m1, m2 |

304 |
mova m7, m3 |

305 |
punpcklbw m7, m4 |

306 |
pmaddubsw m6, [r6-48] |

307 |
pmaddubsw m1, [r6-32] |

308 |
pmaddubsw m7, [r6-16] |

309 |
paddsw m6, m1 |

310 |
paddsw m6, m7 |

311 |
mova m1, m2 |

312 |
paddsw m6, [pw_64] |

313 |
mova m2, m3 |

314 |
psraw m6, 7 |

315 |
mova m3, m4 |

316 |
packuswb m6, m6 |

317 |
mova m4, m5 |

318 |
movh [r0], m6 |

319 | |

320 |
; go to next line |

321 |
add r0, r1 |

322 |
add r2, r3 |

323 |
dec r4 ; next row |

324 |
jg .nextrow |

325 |
REP_RET |

326 |
%endmacro |

327 | |

328 |
INIT_MMX |

329 |
FILTER_SSSE3 4, 0, 0 |

330 |
INIT_XMM |

331 |
FILTER_SSSE3 8, 8, 7 |

332 | |

333 |
; 4x4 block, H-only 4-tap filter |

334 |
cglobal put_vp8_epel4_h4_mmxext, 6, 6 |

335 |
shl r5d, 4 |

336 |
%ifdef PIC |

337 |
lea r11, [fourtap_filter_hw_m] |

338 |
%endif |

339 |
movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words |

340 |
movq mm5, [fourtap_filter_hw+r5] |

341 |
movq mm7, [pw_64] |

342 |
pxor mm6, mm6 |

343 | |

344 |
.nextrow |

345 |
movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels |

346 | |

347 |
; first set of 2 pixels |

348 |
movq mm2, mm1 ; byte ABCD.. |

349 |
punpcklbw mm1, mm6 ; byte->word ABCD |

350 |
pshufw mm0, mm2, 9 ; byte CDEF.. |

351 |
punpcklbw mm0, mm6 ; byte->word CDEF |

352 |
pshufw mm3, mm1, 0x94 ; word ABBC |

353 |
pshufw mm1, mm0, 0x94 ; word CDDE |

354 |
pmaddwd mm3, mm4 ; multiply 2px with F0/F1 |

355 |
movq mm0, mm1 ; backup for second set of pixels |

356 |
pmaddwd mm1, mm5 ; multiply 2px with F2/F3 |

357 |
paddd mm3, mm1 ; finish 1st 2px |

358 | |

359 |
; second set of 2 pixels, use backup of above |

360 |
punpckhbw mm2, mm6 ; byte->word EFGH |

361 |
pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1 |

362 |
pshufw mm1, mm2, 0x94 ; word EFFG |

363 |
pmaddwd mm1, mm5 ; multiply 2px with F2/F3 |

364 |
paddd mm0, mm1 ; finish 2nd 2px |

365 | |

366 |
; merge two sets of 2 pixels into one set of 4, round/clip/store |

367 |
packssdw mm3, mm0 ; merge dword->word (4px) |

368 |
paddsw mm3, mm7 ; rounding |

369 |
psraw mm3, 7 |

370 |
packuswb mm3, mm6 ; clip and word->bytes |

371 |
movd [r0], mm3 ; store |

372 | |

373 |
; go to next line |

374 |
add r0, r1 |

375 |
add r2, r3 |

376 |
dec r4 ; next row |

377 |
jg .nextrow |

378 |
REP_RET |

379 | |

380 |
; 4x4 block, H-only 6-tap filter |

381 |
cglobal put_vp8_epel4_h6_mmxext, 6, 6 |

382 |
lea r5d, [r5*3] |

383 |
%ifdef PIC |

384 |
lea r11, [sixtap_filter_hw_m] |

385 |
%endif |

386 |
movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words |

387 |
movq mm5, [sixtap_filter_hw+r5*8-32] |

388 |
movq mm6, [sixtap_filter_hw+r5*8-16] |

389 |
movq mm7, [pw_64] |

390 |
pxor mm3, mm3 |

391 | |

392 |
.nextrow |

393 |
movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels |

394 | |

395 |
; first set of 2 pixels |

396 |
movq mm2, mm1 ; byte ABCD.. |

397 |
punpcklbw mm1, mm3 ; byte->word ABCD |

398 |
pshufw mm0, mm2, 0x9 ; byte CDEF.. |

399 |
punpckhbw mm2, mm3 ; byte->word EFGH |

400 |
punpcklbw mm0, mm3 ; byte->word CDEF |

401 |
pshufw mm1, mm1, 0x94 ; word ABBC |

402 |
pshufw mm2, mm2, 0x94 ; word EFFG |

403 |
pmaddwd mm1, mm4 ; multiply 2px with F0/F1 |

404 |
pshufw mm3, mm0, 0x94 ; word CDDE |

405 |
movq mm0, mm3 ; backup for second set of pixels |

406 |
pmaddwd mm3, mm5 ; multiply 2px with F2/F3 |

407 |
paddd mm1, mm3 ; add to 1st 2px cache |

408 |
movq mm3, mm2 ; backup for second set of pixels |

409 |
pmaddwd mm2, mm6 ; multiply 2px with F4/F5 |

410 |
paddd mm1, mm2 ; finish 1st 2px |

411 | |

412 |
; second set of 2 pixels, use backup of above |

413 |
movd mm2, [r2+3] ; byte FGHI (prevent overreads) |

414 |
pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1 |

415 |
pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3 |

416 |
paddd mm0, mm3 ; add to 2nd 2px cache |

417 |
pxor mm3, mm3 |

418 |
punpcklbw mm2, mm3 ; byte->word FGHI |

419 |
pshufw mm2, mm2, 0xE9 ; word GHHI |

420 |
pmaddwd mm2, mm6 ; multiply 2px with F4/F5 |

421 |
paddd mm0, mm2 ; finish 2nd 2px |

422 | |

423 |
; merge two sets of 2 pixels into one set of 4, round/clip/store |

424 |
packssdw mm1, mm0 ; merge dword->word (4px) |

425 |
paddsw mm1, mm7 ; rounding |

426 |
psraw mm1, 7 |

427 |
packuswb mm1, mm3 ; clip and word->bytes |

428 |
movd [r0], mm1 ; store |

429 | |

430 |
; go to next line |

431 |
add r0, r1 |

432 |
add r2, r3 |

433 |
dec r4 ; next row |

434 |
jg .nextrow |

435 |
REP_RET |

436 | |

437 |
; 8-pixel-wide block, H-only 4-tap filter |

438 |
INIT_XMM |

439 |
cglobal put_vp8_epel8_h4_sse2, 6, 6, 8 |

440 |
shl r5d, 4 |

441 |
%ifdef PIC |

442 |
lea r11, [fourtap_filter_hw_m] |

443 |
%endif |

444 |
mova m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words |

445 |
mova m6, [fourtap_filter_hw+r5] |

446 |
pxor m7, m7 |

447 | |

448 |
.nextrow |

449 |
movh m0, [r2-1] |

450 |
punpcklbw m0, m7 ; ABCDEFGH |

451 |
mova m1, m0 |

452 |
mova m2, m0 |

453 |
mova m3, m0 |

454 |
psrldq m1, 2 ; BCDEFGH |

455 |
psrldq m2, 4 ; CDEFGH |

456 |
psrldq m3, 6 ; DEFGH |

457 |
punpcklwd m0, m1 ; ABBCCDDE |

458 |
punpcklwd m2, m3 ; CDDEEFFG |

459 |
pmaddwd m0, m5 |

460 |
pmaddwd m2, m6 |

461 |
paddd m0, m2 |

462 | |

463 |
movh m1, [r2+3] |

464 |
punpcklbw m1, m7 ; ABCDEFGH |

465 |
mova m2, m1 |

466 |
mova m3, m1 |

467 |
mova m4, m1 |

468 |
psrldq m2, 2 ; BCDEFGH |

469 |
psrldq m3, 4 ; CDEFGH |

470 |
psrldq m4, 6 ; DEFGH |

471 |
punpcklwd m1, m2 ; ABBCCDDE |

472 |
punpcklwd m3, m4 ; CDDEEFFG |

473 |
pmaddwd m1, m5 |

474 |
pmaddwd m3, m6 |

475 |
paddd m1, m3 |

476 | |

477 |
packssdw m0, m1 |

478 |
paddsw m0, [pw_64] |

479 |
psraw m0, 7 |

480 |
packuswb m0, m7 |

481 |
movh [r0], m0 ; store |

482 | |

483 |
; go to next line |

484 |
add r0, r1 |

485 |
add r2, r3 |

486 |
dec r4 ; next row |

487 |
jg .nextrow |

488 |
REP_RET |

489 | |

490 |
cglobal put_vp8_epel8_h6_sse2, 6, 6, 8 |

491 |
lea r5d, [r5*3] |

492 |
%ifdef PIC |

493 |
lea r11, [sixtap_filter_hw_m] |

494 |
%endif |

495 |
lea r5, [sixtap_filter_hw+r5*8] |

496 |
pxor m7, m7 |

497 | |

498 |
.nextrow |

499 |
movu m0, [r2-2] |

500 |
mova m6, m0 |

501 |
mova m4, m0 |

502 |
punpcklbw m0, m7 ; ABCDEFGHI |

503 |
mova m1, m0 |

504 |
mova m2, m0 |

505 |
mova m3, m0 |

506 |
psrldq m1, 2 ; BCDEFGH |

507 |
psrldq m2, 4 ; CDEFGH |

508 |
psrldq m3, 6 ; DEFGH |

509 |
psrldq m4, 4 |

510 |
punpcklbw m4, m7 ; EFGH |

511 |
mova m5, m4 |

512 |
psrldq m5, 2 ; FGH |

513 |
punpcklwd m0, m1 ; ABBCCDDE |

514 |
punpcklwd m2, m3 ; CDDEEFFG |

515 |
punpcklwd m4, m5 ; EFFGGHHI |

516 |
pmaddwd m0, [r5-48] |

517 |
pmaddwd m2, [r5-32] |

518 |
pmaddwd m4, [r5-16] |

519 |
paddd m0, m2 |

520 |
paddd m0, m4 |

521 | |

522 |
psrldq m6, 4 |

523 |
mova m4, m6 |

524 |
punpcklbw m6, m7 ; ABCDEFGHI |

525 |
mova m1, m6 |

526 |
mova m2, m6 |

527 |
mova m3, m6 |

528 |
psrldq m1, 2 ; BCDEFGH |

529 |
psrldq m2, 4 ; CDEFGH |

530 |
psrldq m3, 6 ; DEFGH |

531 |
psrldq m4, 4 |

532 |
punpcklbw m4, m7 ; EFGH |

533 |
mova m5, m4 |

534 |
psrldq m5, 2 ; FGH |

535 |
punpcklwd m6, m1 ; ABBCCDDE |

536 |
punpcklwd m2, m3 ; CDDEEFFG |

537 |
punpcklwd m4, m5 ; EFFGGHHI |

538 |
pmaddwd m6, [r5-48] |

539 |
pmaddwd m2, [r5-32] |

540 |
pmaddwd m4, [r5-16] |

541 |
paddd m6, m2 |

542 |
paddd m6, m4 |

543 | |

544 |
packssdw m0, m6 |

545 |
paddsw m0, [pw_64] |

546 |
psraw m0, 7 |

547 |
packuswb m0, m7 |

548 |
movh [r0], m0 ; store |

549 | |

550 |
; go to next line |

551 |
add r0, r1 |

552 |
add r2, r3 |

553 |
dec r4 ; next row |

554 |
jg .nextrow |

555 |
REP_RET |

556 | |

557 |
%macro FILTER_V 3 |

558 |
; 4/8-pixel-wide block, V-only 4-tap filter |

559 |
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 |

560 |
shl r6d, 5 |

561 |
%ifdef PIC |

562 |
lea r11, [fourtap_filter_v_m] |

563 |
%endif |

564 |
lea r6, [fourtap_filter_v+r6-32] |

565 |
mova m6, [pw_64] |

566 |
pxor m7, m7 |

567 |
mova m5, [r6+48] |

568 | |

569 |
; read 3 lines |

570 |
sub r2, r3 |

571 |
movh m0, [r2] |

572 |
movh m1, [r2+ r3] |

573 |
movh m2, [r2+2*r3] |

574 |
add r2, r3 |

575 |
punpcklbw m0, m7 |

576 |
punpcklbw m1, m7 |

577 |
punpcklbw m2, m7 |

578 | |

579 |
.nextrow |

580 |
; first calculate negative taps (to prevent losing positive overflows) |

581 |
movh m4, [r2+2*r3] ; read new row |

582 |
punpcklbw m4, m7 |

583 |
mova m3, m4 |

584 |
pmullw m0, [r6+0] |

585 |
pmullw m4, m5 |

586 |
paddsw m4, m0 |

587 | |

588 |
; then calculate positive taps |

589 |
mova m0, m1 |

590 |
pmullw m1, [r6+16] |

591 |
paddsw m4, m1 |

592 |
mova m1, m2 |

593 |
pmullw m2, [r6+32] |

594 |
paddsw m4, m2 |

595 |
mova m2, m3 |

596 | |

597 |
; round/clip/store |

598 |
paddsw m4, m6 |

599 |
psraw m4, 7 |

600 |
packuswb m4, m7 |

601 |
movh [r0], m4 |

602 | |

603 |
; go to next line |

604 |
add r0, r1 |

605 |
add r2, r3 |

606 |
dec r4 ; next row |

607 |
jg .nextrow |

608 |
REP_RET |

609 | |

610 | |

611 |
; 4/8-pixel-wide block, V-only 6-tap filter |

612 |
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3 |

613 |
shl r6d, 4 |

614 |
lea r6, [r6*3] |

615 |
%ifdef PIC |

616 |
lea r11, [sixtap_filter_v_m] |

617 |
%endif |

618 |
lea r6, [sixtap_filter_v+r6-96] |

619 |
pxor m7, m7 |

620 | |

621 |
; read 5 lines |

622 |
sub r2, r3 |

623 |
sub r2, r3 |

624 |
movh m0, [r2] |

625 |
movh m1, [r2+r3] |

626 |
movh m2, [r2+r3*2] |

627 |
lea r2, [r2+r3*2] |

628 |
add r2, r3 |

629 |
movh m3, [r2] |

630 |
movh m4, [r2+r3] |

631 |
punpcklbw m0, m7 |

632 |
punpcklbw m1, m7 |

633 |
punpcklbw m2, m7 |

634 |
punpcklbw m3, m7 |

635 |
punpcklbw m4, m7 |

636 | |

637 |
.nextrow |

638 |
; first calculate negative taps (to prevent losing positive overflows) |

639 |
mova m5, m1 |

640 |
pmullw m5, [r6+16] |

641 |
mova m6, m4 |

642 |
pmullw m6, [r6+64] |

643 |
paddsw m6, m5 |

644 | |

645 |
; then calculate positive taps |

646 |
movh m5, [r2+2*r3] ; read new row |

647 |
punpcklbw m5, m7 |

648 |
pmullw m0, [r6+0] |

649 |
paddsw m6, m0 |

650 |
mova m0, m1 |

651 |
mova m1, m2 |

652 |
pmullw m2, [r6+32] |

653 |
paddsw m6, m2 |

654 |
mova m2, m3 |

655 |
pmullw m3, [r6+48] |

656 |
paddsw m6, m3 |

657 |
mova m3, m4 |

658 |
mova m4, m5 |

659 |
pmullw m5, [r6+80] |

660 |
paddsw m6, m5 |

661 | |

662 |
; round/clip/store |

663 |
paddsw m6, [pw_64] |

664 |
psraw m6, 7 |

665 |
packuswb m6, m7 |

666 |
movh [r0], m6 |

667 | |

668 |
; go to next line |

669 |
add r0, r1 |

670 |
add r2, r3 |

671 |
dec r4 ; next row |

672 |
jg .nextrow |

673 |
REP_RET |

674 |
%endmacro |

675 | |

676 |
INIT_MMX |

677 |
FILTER_V mmxext, 4, 0 |

678 |
INIT_XMM |

679 |
FILTER_V sse2, 8, 8 |

680 | |

681 |
%macro FILTER_BILINEAR 3 |

682 |
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 |

683 |
mov r5d, 8*16 |

684 |
shl r6d, 4 |

685 |
sub r5d, r6d |

686 |
%ifdef PIC |

687 |
lea r11, [bilinear_filter_vw_m] |

688 |
%endif |

689 |
pxor m6, m6 |

690 |
mova m4, [bilinear_filter_vw+r5-16] |

691 |
mova m5, [bilinear_filter_vw+r6-16] |

692 |
.nextrow |

693 |
movh m0, [r2+r3*0] |

694 |
movh m1, [r2+r3*1] |

695 |
movh m3, [r2+r3*2] |

696 |
punpcklbw m0, m6 |

697 |
punpcklbw m1, m6 |

698 |
punpcklbw m3, m6 |

699 |
mova m2, m1 |

700 |
pmullw m0, m4 |

701 |
pmullw m1, m5 |

702 |
pmullw m2, m4 |

703 |
pmullw m3, m5 |

704 |
paddsw m0, m1 |

705 |
paddsw m2, m3 |

706 |
psraw m0, 2 |

707 |
psraw m2, 2 |

708 |
pavgw m0, m6 |

709 |
pavgw m2, m6 |

710 |
%ifidn %1, mmxext |

711 |
packuswb m0, m0 |

712 |
packuswb m2, m2 |

713 |
movh [r0+r1*0], m0 |

714 |
movh [r0+r1*1], m2 |

715 |
%else |

716 |
packuswb m0, m2 |

717 |
movh [r0+r1*0], m0 |

718 |
movhps [r0+r1*1], m0 |

719 |
%endif |

720 | |

721 |
lea r0, [r0+r1*2] |

722 |
lea r2, [r2+r3*2] |

723 |
sub r4, 2 |

724 |
jg .nextrow |

725 |
REP_RET |

726 | |

727 |
cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 |

728 |
mov r6d, 8*16 |

729 |
shl r5d, 4 |

730 |
sub r6d, r5d |

731 |
%ifdef PIC |

732 |
lea r11, [bilinear_filter_vw_m] |

733 |
%endif |

734 |
pxor m6, m6 |

735 |
mova m4, [bilinear_filter_vw+r6-16] |

736 |
mova m5, [bilinear_filter_vw+r5-16] |

737 |
.nextrow |

738 |
movh m0, [r2+r3*0+0] |

739 |
movh m1, [r2+r3*0+1] |

740 |
movh m2, [r2+r3*1+0] |

741 |
movh m3, [r2+r3*1+1] |

742 |
punpcklbw m0, m6 |

743 |
punpcklbw m1, m6 |

744 |
punpcklbw m2, m6 |

745 |
punpcklbw m3, m6 |

746 |
pmullw m0, m4 |

747 |
pmullw m1, m5 |

748 |
pmullw m2, m4 |

749 |
pmullw m3, m5 |

750 |
paddsw m0, m1 |

751 |
paddsw m2, m3 |

752 |
psraw m0, 2 |

753 |
psraw m2, 2 |

754 |
pavgw m0, m6 |

755 |
pavgw m2, m6 |

756 |
%ifidn %1, mmxext |

757 |
packuswb m0, m0 |

758 |
packuswb m2, m2 |

759 |
movh [r0+r1*0], m0 |

760 |
movh [r0+r1*1], m2 |

761 |
%else |

762 |
packuswb m0, m2 |

763 |
movh [r0+r1*0], m0 |

764 |
movhps [r0+r1*1], m0 |

765 |
%endif |

766 | |

767 |
lea r0, [r0+r1*2] |

768 |
lea r2, [r2+r3*2] |

769 |
sub r4, 2 |

770 |
jg .nextrow |

771 |
REP_RET |

772 |
%endmacro |

773 | |

774 |
INIT_MMX |

775 |
FILTER_BILINEAR mmxext, 4, 0 |

776 |
INIT_XMM |

777 |
FILTER_BILINEAR sse2, 8, 7 |

778 | |

779 |
%macro FILTER_BILINEAR_SSSE3 1 |

780 |
cglobal put_vp8_bilinear%1_v_ssse3, 7,7 |

781 |
shl r6d, 4 |

782 |
%ifdef PIC |

783 |
lea r11, [bilinear_filter_vb_m] |

784 |
%endif |

785 |
pxor m4, m4 |

786 |
mova m3, [bilinear_filter_vb+r6-16] |

787 |
.nextrow |

788 |
movh m0, [r2+r3*0] |

789 |
movh m1, [r2+r3*1] |

790 |
movh m2, [r2+r3*2] |

791 |
punpcklbw m0, m1 |

792 |
punpcklbw m1, m2 |

793 |
pmaddubsw m0, m3 |

794 |
pmaddubsw m1, m3 |

795 |
psraw m0, 2 |

796 |
psraw m1, 2 |

797 |
pavgw m0, m4 |

798 |
pavgw m1, m4 |

799 |
%if mmsize==8 |

800 |
packuswb m0, m0 |

801 |
packuswb m1, m1 |

802 |
movh [r0+r1*0], m0 |

803 |
movh [r0+r1*1], m1 |

804 |
%else |

805 |
packuswb m0, m1 |

806 |
movh [r0+r1*0], m0 |

807 |
movhps [r0+r1*1], m0 |

808 |
%endif |

809 | |

810 |
lea r0, [r0+r1*2] |

811 |
lea r2, [r2+r3*2] |

812 |
sub r4, 2 |

813 |
jg .nextrow |

814 |
REP_RET |

815 | |

816 |
cglobal put_vp8_bilinear%1_h_ssse3, 7,7 |

817 |
shl r5d, 4 |

818 |
%ifdef PIC |

819 |
lea r11, [bilinear_filter_vb_m] |

820 |
%endif |

821 |
pxor m4, m4 |

822 |
mova m2, [filter_h2_shuf] |

823 |
mova m3, [bilinear_filter_vb+r5-16] |

824 |
.nextrow |

825 |
movu m0, [r2+r3*0] |

826 |
movu m1, [r2+r3*1] |

827 |
pshufb m0, m2 |

828 |
pshufb m1, m2 |

829 |
pmaddubsw m0, m3 |

830 |
pmaddubsw m1, m3 |

831 |
psraw m0, 2 |

832 |
psraw m1, 2 |

833 |
pavgw m0, m4 |

834 |
pavgw m1, m4 |

835 |
%if mmsize==8 |

836 |
packuswb m0, m0 |

837 |
packuswb m1, m1 |

838 |
movh [r0+r1*0], m0 |

839 |
movh [r0+r1*1], m1 |

840 |
%else |

841 |
packuswb m0, m1 |

842 |
movh [r0+r1*0], m0 |

843 |
movhps [r0+r1*1], m0 |

844 |
%endif |

845 | |

846 |
lea r0, [r0+r1*2] |

847 |
lea r2, [r2+r3*2] |

848 |
sub r4, 2 |

849 |
jg .nextrow |

850 |
REP_RET |

851 |
%endmacro |

852 | |

853 |
INIT_MMX |

854 |
FILTER_BILINEAR_SSSE3 4 |

855 |
INIT_XMM |

856 |
FILTER_BILINEAR_SSSE3 8 |

857 | |

858 |
cglobal put_vp8_pixels8_mmx, 5,5 |

859 |
.nextrow: |

860 |
movq mm0, [r2+r3*0] |

861 |
movq mm1, [r2+r3*1] |

862 |
lea r2, [r2+r3*2] |

863 |
movq [r0+r1*0], mm0 |

864 |
movq [r0+r1*1], mm1 |

865 |
lea r0, [r0+r1*2] |

866 |
sub r4d, 2 |

867 |
jg .nextrow |

868 |
REP_RET |

869 | |

870 |
cglobal put_vp8_pixels16_mmx, 5,5 |

871 |
.nextrow: |

872 |
movq mm0, [r2+r3*0+0] |

873 |
movq mm1, [r2+r3*0+8] |

874 |
movq mm2, [r2+r3*1+0] |

875 |
movq mm3, [r2+r3*1+8] |

876 |
lea r2, [r2+r3*2] |

877 |
movq [r0+r1*0+0], mm0 |

878 |
movq [r0+r1*0+8], mm1 |

879 |
movq [r0+r1*1+0], mm2 |

880 |
movq [r0+r1*1+8], mm3 |

881 |
lea r0, [r0+r1*2] |

882 |
sub r4d, 2 |

883 |
jg .nextrow |

884 |
REP_RET |

885 | |

886 |
cglobal put_vp8_pixels16_sse, 5,5,2 |

887 |
.nextrow: |

888 |
movups xmm0, [r2+r3*0] |

889 |
movups xmm1, [r2+r3*1] |

890 |
lea r2, [r2+r3*2] |

891 |
movaps [r0+r1*0], xmm0 |

892 |
movaps [r0+r1*1], xmm1 |

893 |
lea r0, [r0+r1*2] |

894 |
sub r4d, 2 |

895 |
jg .nextrow |

896 |
REP_RET |

897 | |

898 |
;----------------------------------------------------------------------------- |

899 |
; IDCT functions: |

900 |
; |

901 |
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |

902 |
;----------------------------------------------------------------------------- |
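; note: the DC coefficient is rounded as (dc + 4) >> 3 and then added, with
; unsigned saturation, to all 16 pixels of the 4x4 destination block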

903 | |

904 |
cglobal vp8_idct_dc_add_mmx, 3, 3 |

905 |
; load data |

906 |
movd mm0, [r1] |

907 | |

908 |
; calculate DC |

909 |
paddw mm0, [pw_4] |

910 |
pxor mm1, mm1 |

911 |
psraw mm0, 3 |

912 |
psubw mm1, mm0 |

913 |
packuswb mm0, mm0 |

914 |
packuswb mm1, mm1 |

915 |
punpcklbw mm0, mm0 |

916 |
punpcklbw mm1, mm1 |

917 |
punpcklwd mm0, mm0 |

918 |
punpcklwd mm1, mm1 |

919 | |

920 |
; add DC |

921 |
lea r1, [r0+r2*2] |

922 |
movd mm2, [r0] |

923 |
movd mm3, [r0+r2] |

924 |
movd mm4, [r1] |

925 |
movd mm5, [r1+r2] |

926 |
paddusb mm2, mm0 |

927 |
paddusb mm3, mm0 |

928 |
paddusb mm4, mm0 |

929 |
paddusb mm5, mm0 |

930 |
psubusb mm2, mm1 |

931 |
psubusb mm3, mm1 |

932 |
psubusb mm4, mm1 |

933 |
psubusb mm5, mm1 |

934 |
movd [r0], mm2 |

935 |
movd [r0+r2], mm3 |

936 |
movd [r1], mm4 |

937 |
movd [r1+r2], mm5 |

938 |
RET |

939 | |

940 |
cglobal vp8_idct_dc_add_sse4, 3, 3, 6 |

941 |
; load data |

942 |
movd xmm0, [r1] |

943 |
lea r1, [r0+r2*2] |

944 |
pxor xmm1, xmm1 |

945 |
movq xmm2, [pw_4] |

946 | |

947 |
; calculate DC |

948 |
paddw xmm0, xmm2 |

949 |
movd xmm2, [r0] |

950 |
movd xmm3, [r0+r2] |

951 |
movd xmm4, [r1] |

952 |
movd xmm5, [r1+r2] |

953 |
psraw xmm0, 3 |

954 |
pshuflw xmm0, xmm0, 0 |

955 |
punpcklqdq xmm0, xmm0 |

956 |
punpckldq xmm2, xmm3 |

957 |
punpckldq xmm4, xmm5 |

958 |
punpcklbw xmm2, xmm1 |

959 |
punpcklbw xmm4, xmm1 |

960 |
paddw xmm2, xmm0 |

961 |
paddw xmm4, xmm0 |

962 |
packuswb xmm2, xmm4 |

963 |
movd [r0], xmm2 |

964 |
pextrd [r0+r2], xmm2, 1 |

965 |
pextrd [r1], xmm2, 2 |

966 |
pextrd [r1+r2], xmm2, 3 |

967 |
RET |

968 | |

969 |
;----------------------------------------------------------------------------- |

970 |
; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |

971 |
;----------------------------------------------------------------------------- |
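; note: two 1D passes with a transpose in between; the +4 rounding bias is
; folded into the first row between the passes, and STORE_DIFFx2 shifts the
; results right by 3 while adding them to the destination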

972 | |

973 |
; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2) |

974 |
; this macro assumes that m6/m7 have words for 20091/17734 loaded |
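; note: 35468 does not fit in a signed 16-bit word for pmulhw, so the operand
; is doubled (paddw) and multiplied by 17734 (= 35468/2) instead, which gives
; the same mul_35468() result without overflowing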

975 |
%macro VP8_MULTIPLY_SUMSUB 4 |

976 |
mova %3, %1 |

977 |
mova %4, %2 |

978 |
pmulhw %3, m6 ;20091(1) |

979 |
pmulhw %4, m6 ;20091(2) |

980 |
paddw %3, %1 |

981 |
paddw %4, %2 |

982 |
paddw %1, %1 |

983 |
paddw %2, %2 |

984 |
pmulhw %1, m7 ;35468(1) |

985 |
pmulhw %2, m7 ;35468(2) |

986 |
psubw %1, %4 |

987 |
paddw %2, %3 |

988 |
%endmacro |

989 | |

990 |
; calculate x0=%1+%3; x1=%1-%3 |

991 |
; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4) |

992 |
; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3) |

993 |
; %5/%6 are temporary registers |

994 |
; we assume m6/m7 have constant words 20091/17734 loaded in them |

995 |
%macro VP8_IDCT_TRANSFORM4x4_1D 6 |

996 |
SUMSUB_BA m%3, m%1, m%5 ;t0, t1 |

997 |
VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3 |

998 |
SUMSUB_BA m%4, m%3, m%5 ;tmp0, tmp3 |

999 |
SUMSUB_BA m%2, m%1, m%5 ;tmp1, tmp2 |

1000 |
SWAP %4, %1 |

1001 |
SWAP %4, %3 |

1002 |
%endmacro |

1003 | |

1004 |
INIT_MMX |

1005 |
cglobal vp8_idct_add_mmx, 3, 3 |

1006 |
; load block data |

1007 |
movq m0, [r1] |

1008 |
movq m1, [r1+8] |

1009 |
movq m2, [r1+16] |

1010 |
movq m3, [r1+24] |

1011 |
movq m6, [pw_20091] |

1012 |
movq m7, [pw_17734] |

1013 | |

1014 |
; actual IDCT |

1015 |
VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 |

1016 |
TRANSPOSE4x4W 0, 1, 2, 3, 4 |

1017 |
paddw m0, [pw_4] |

1018 |
VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 |

1019 |
TRANSPOSE4x4W 0, 1, 2, 3, 4 |

1020 | |

1021 |
; store |

1022 |
pxor m4, m4 |

1023 |
lea r1, [r0+2*r2] |

1024 |
STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2 |

1025 |
STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2 |

1026 | |

1027 |
RET |

1028 | |

1029 |
;----------------------------------------------------------------------------- |

1030 |
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) |

1031 |
;----------------------------------------------------------------------------- |
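; note: inverse Walsh-Hadamard transform of the 16 luma DC values; each result
; is written to the DC position (first coefficient) of one of the 16 4x4 luma
; blocks, hence the 2*16-byte strides used by SCATTER_WHT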

1032 | |

1033 |
%macro SCATTER_WHT 1 |

1034 |
pextrw r1d, m0, %1 |

1035 |
pextrw r2d, m1, %1 |

1036 |
mov [r0+2*16*0], r1w |

1037 |
mov [r0+2*16*1], r2w |

1038 |
pextrw r1d, m2, %1 |

1039 |
pextrw r2d, m3, %1 |

1040 |
mov [r0+2*16*2], r1w |

1041 |
mov [r0+2*16*3], r2w |

1042 |
%endmacro |

1043 | |

1044 |
%macro HADAMARD4_1D 4 |

1045 |
SUMSUB_BADC m%2, m%1, m%4, m%3 |

1046 |
SUMSUB_BADC m%4, m%2, m%3, m%1 |

1047 |
SWAP %1, %4, %3 |

1048 |
%endmacro |

1049 | |

1050 |
INIT_MMX |

1051 |
cglobal vp8_luma_dc_wht_mmxext, 2,3 |

1052 |
movq m0, [r1] |

1053 |
movq m1, [r1+8] |

1054 |
movq m2, [r1+16] |

1055 |
movq m3, [r1+24] |

1056 |
HADAMARD4_1D 0, 1, 2, 3 |

1057 |
TRANSPOSE4x4W 0, 1, 2, 3, 4 |

1058 |
paddw m0, [pw_3] |

1059 |
HADAMARD4_1D 0, 1, 2, 3 |

1060 |
psraw m0, 3 |

1061 |
psraw m1, 3 |

1062 |
psraw m2, 3 |

1063 |
psraw m3, 3 |

1064 |
SCATTER_WHT 0 |

1065 |
add r0, 2*16*4 |

1066 |
SCATTER_WHT 1 |

1067 |
add r0, 2*16*4 |

1068 |
SCATTER_WHT 2 |

1069 |
add r0, 2*16*4 |

1070 |
SCATTER_WHT 3 |

1071 |
RET |

1072 | |

1073 |
;----------------------------------------------------------------------------- |

1074 |
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim); |

1075 |
;----------------------------------------------------------------------------- |
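; note: the simple filter only adjusts the two pixels adjacent to the edge
; (p0/q0), and only where abs(p0-q0)*2 + abs(p1-q1)/2 <= flim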

1076 | |

1077 |
; macro called with 7 mm register indexes as argument, and 4 regular registers |

1078 |
; |

1079 |
; first 4 mm registers will carry the transposed pixel data |

1080 |
; the other three are scratchspace (one would be sufficient, but this allows |

1081 |
; for more spreading/pipelining and thus faster execution on OOE CPUs) |

1082 |
; |

1083 |
; first two regular registers are buf+4*stride and buf+5*stride |

1084 |
; third is -stride, fourth is +stride |

1085 |
%macro READ_8x4_INTERLEAVED 11 |

1086 |
; interleave 8 (A-H) rows of 4 pixels each |

1087 |
movd m%1, [%8+%10*4] ; A0-3 |

1088 |
movd m%5, [%9+%10*4] ; B0-3 |

1089 |
movd m%2, [%8+%10*2] ; C0-3 |

1090 |
movd m%6, [%8+%10] ; D0-3 |

1091 |
movd m%3, [%8] ; E0-3 |

1092 |
movd m%7, [%9] ; F0-3 |

1093 |
movd m%4, [%9+%11] ; G0-3 |

1094 |
punpcklbw m%1, m%5 ; A/B interleaved |

1095 |
movd m%5, [%9+%11*2] ; H0-3 |

1096 |
punpcklbw m%2, m%6 ; C/D interleaved |

1097 |
punpcklbw m%3, m%7 ; E/F interleaved |

1098 |
punpcklbw m%4, m%5 ; G/H interleaved |

1099 |
%endmacro |

1100 | |

1101 |
; macro called with 7 mm register indexes as argument, and 5 regular registers |

1102 |
; first 11 mean the same as READ_8x4_INTERLEAVED above |

1103 |
; fifth regular register is scratchspace to reach the bottom 8 rows, it |

1104 |
; will be set to second regular register + 8*stride at the end |

1105 |
%macro READ_16x4_INTERLEAVED 12 |

1106 |
; transpose 16 (A-P) rows of 4 pixels each |

1107 |
lea %12, [r0+8*r2] |

1108 | |

1109 |
; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M |

1110 |
movd m%1, [%8+%10*4] ; A0-3 |

1111 |
movd m%3, [%12+%10*4] ; I0-3 |

1112 |
movd m%2, [%8+%10*2] ; C0-3 |

1113 |
movd m%4, [%12+%10*2] ; K0-3 |

1114 |
movd m%6, [%8+%10] ; D0-3 |

1115 |
movd m%5, [%12+%10] ; L0-3 |

1116 |
movd m%7, [%12] ; M0-3 |

1117 |
add %12, %11 |

1118 |
punpcklbw m%1, m%3 ; A/I |

1119 |
movd m%3, [%8] ; E0-3 |

1120 |
punpcklbw m%2, m%4 ; C/K |

1121 |
punpcklbw m%6, m%5 ; D/L |

1122 |
punpcklbw m%3, m%7 ; E/M |

1123 |
punpcklbw m%2, m%6 ; C/D/K/L interleaved |

1124 | |

1125 |
; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P |

1126 |
movd m%5, [%9+%10*4] ; B0-3 |

1127 |
movd m%4, [%12+%10*4] ; J0-3 |

1128 |
movd m%7, [%9] ; F0-3 |

1129 |
movd m%6, [%12] ; N0-3 |

1130 |
punpcklbw m%5, m%4 ; B/J |

1131 |
punpcklbw m%7, m%6 ; F/N |

1132 |
punpcklbw m%1, m%5 ; A/B/I/J interleaved |

1133 |
punpcklbw m%3, m%7 ; E/F/M/N interleaved |

1134 |
movd m%4, [%9+%11] ; G0-3 |

1135 |
movd m%6, [%12+%11] ; O0-3 |

1136 |
movd m%5, [%9+%11*2] ; H0-3 |

1137 |
movd m%7, [%12+%11*2] ; P0-3 |

1138 |
punpcklbw m%4, m%6 ; G/O |

1139 |
punpcklbw m%5, m%7 ; H/P |

1140 |
punpcklbw m%4, m%5 ; G/H/O/P interleaved |

1141 |
%endmacro |

1142 | |

1143 |
; write 4 mm registers of 2 dwords each |

1144 |
; first four arguments are mm register indexes containing source data |

1145 |
; last four are registers containing buf+4*stride, buf+5*stride, |

1146 |
; -stride and +stride |

1147 |
%macro WRITE_4x2D 8 |

1148 |
; write out (2 dwords per register) |

1149 |
movd [%5+%7*4], m%1 |

1150 |
movd [%5+%7*2], m%2 |

1151 |
movd [%5], m%3 |

1152 |
movd [%6+%8], m%4 |

1153 |
punpckhdq m%1, m%1 |

1154 |
punpckhdq m%2, m%2 |

1155 |
punpckhdq m%3, m%3 |

1156 |
punpckhdq m%4, m%4 |

1157 |
movd [%6+%7*4], m%1 |

1158 |
movd [%5+%7], m%2 |

1159 |
movd [%6], m%3 |

1160 |
movd [%6+%8*2], m%4 |

1161 |
%endmacro |

1162 | |

1163 |
; write 4 xmm registers of 4 dwords each |

1164 |
; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular |

1165 |
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride |

1166 |
; we add 1*stride to the third regular register in the process |

1167 |
; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the |

1168 |
; same memory region), or 8 if they cover two separate buffers (third one points to |

1169 |
; a different memory region than the first two), allowing for more optimal code for |

1170 |
; the 16-width case |

1171 |
%macro WRITE_4x4D 10 |

1172 |
; write out (4 dwords per register), start with dwords zero |

1173 |
movd [%5+%8*4], m%1 |

1174 |
movd [%5], m%2 |

1175 |
movd [%7+%8*4], m%3 |

1176 |
movd [%7], m%4 |

1177 | |

1178 |
; store dwords 1 |

1179 |
psrldq m%1, 4 |

1180 |
psrldq m%2, 4 |

1181 |
psrldq m%3, 4 |

1182 |
psrldq m%4, 4 |

1183 |
movd [%6+%8*4], m%1 |

1184 |
movd [%6], m%2 |

1185 |
%if %10 == 16 |

1186 |
movd [%6+%9*4], m%3 |

1187 |
%endif |

1188 |
movd [%7+%9], m%4 |

1189 | |

1190 |
; write dwords 2 |

1191 |
psrldq m%1, 4 |

1192 |
psrldq m%2, 4 |

1193 |
%if %10 == 8 |

1194 |
movd [%5+%8*2], m%1 |

1195 |
movd %5, m%3 |

1196 |
%endif |

1197 |
psrldq m%3, 4 |

1198 |
psrldq m%4, 4 |

1199 |
%if %10 == 16 |

1200 |
movd [%5+%8*2], m%1 |

1201 |
%endif |

1202 |
movd [%6+%9], m%2 |

1203 |
movd [%7+%8*2], m%3 |

1204 |
movd [%7+%9*2], m%4 |

1205 |
add %7, %9 |

1206 | |

1207 |
; store dwords 3 |

1208 |
psrldq m%1, 4 |

1209 |
psrldq m%2, 4 |

1210 |
psrldq m%3, 4 |

1211 |
psrldq m%4, 4 |

1212 |
%if %10 == 8 |

1213 |
mov [%7+%8*4], %5d |

1214 |
movd [%6+%8*2], m%1 |

1215 |
%else |

1216 |
movd [%5+%8], m%1 |

1217 |
%endif |

1218 |
movd [%6+%9*2], m%2 |

1219 |
movd [%7+%8*2], m%3 |

1220 |
movd [%7+%9*2], m%4 |

1221 |
%endmacro |

1222 | |
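; splat the low byte of GPR %2 into every byte lane of mm/xmm register %1
; (%3 names the cpu flavor, used to pick the mmx vs mmxext code path)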

1223 |
%macro SPLATB_REG 3 |

1224 |
movd %1, %2 |

1225 |
punpcklbw %1, %1 |

1226 |
%if mmsize == 16 ; sse2 |

1227 |
punpcklwd %1, %1 |

1228 |
pshufd %1, %1, 0x0 |

1229 |
%elifidn %3, mmx |

1230 |
punpcklwd %1, %1 |

1231 |
punpckldq %1, %1 |

1232 |
%else ; mmxext |

1233 |
pshufw %1, %1, 0x0 |

1234 |
%endif |

1235 |
%endmacro |

1236 | |

1237 |
%macro SIMPLE_LOOPFILTER 3 |

1238 |
cglobal vp8_%2_loop_filter_simple_%1, 3, %3 |

1239 |
%ifidn %2, h |

1240 |
mov r5, rsp ; backup stack pointer |

1241 |
and rsp, ~(mmsize-1) ; align stack |

1242 |
%endif |

1243 |
%if mmsize == 8 ; mmx/mmxext |

1244 |
mov r3, 2 |

1245 |
%endif |

1246 |
SPLATB_REG m7, r2, %1 ; splat "flim" into register |

1247 | |

1248 |
; set up indexes to address 4 rows |

1249 |
mov r2, r1 |

1250 |
neg r1 |

1251 |
%ifidn %2, h |

1252 |
lea r0, [r0+4*r2-2] |

1253 |
sub rsp, mmsize*2 ; (aligned) storage space for saving p1/q1 |

1254 |
%endif |

1255 | |

1256 |
%if mmsize == 8 ; mmx / mmxext |

1257 |
.next8px |

1258 |
%endif |

1259 |
%ifidn %2, v |

1260 |
; read 4 half/full rows of pixels |

1261 |
mova m0, [r0+r1*2] ; p1 |

1262 |
mova m1, [r0+r1] ; p0 |

1263 |
mova m2, [r0] ; q0 |

1264 |
mova m3, [r0+r2] ; q1 |

1265 |
%else ; h |

1266 |
lea r4, [r0+r2] |

1267 | |

1268 |
%if mmsize == 8 ; mmx/mmxext |

1269 |
READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2 |

1270 |
%else ; sse2 |

1271 |
READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3 |

1272 |
%endif |

1273 |
TRANSPOSE4x4W 0, 1, 2, 3, 4 |

1274 | |

1275 |
mova [rsp], m0 ; store p1 |

1276 |
mova [rsp+mmsize], m3 ; store q1 |

1277 |
%endif |

1278 | |

1279 |
; simple_limit |

1280 |
mova m5, m2 ; m5=backup of q0 |

1281 |
mova m6, m1 ; m6=backup of p0 |

1282 |
psubusb m1, m2 ; p0-q0 |

1283 |
psubusb m2, m6 ; q0-p0 |

1284 |
por m1, m2 ; FFABS(p0-q0) |

1285 |
paddusb m1, m1 ; m1=FFABS(p0-q0)*2 |

1286 | |

1287 |
mova m4, m3 |

1288 |
mova m2, m0 |

1289 |
psubusb m3, m0 ; q1-p1 |

1290 |
psubusb m0, m4 ; p1-q1 |

1291 |
por m3, m0 ; FFABS(p1-q1) |

1292 |
mova m0, [pb_80] |

1293 |
pxor m2, m0 |

1294 |
pxor m4, m0 |

1295 |
psubsb m2, m4 ; m2=p1-q1 (signed) backup for below |

1296 |
pand m3, [pb_FE] |

1297 |
psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed |

1298 |
paddusb m3, m1 |

1299 |
psubusb m3, m7 |

1300 |
pxor m1, m1 |

1301 |
pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0) |

1302 | |

1303 |
; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask) |

1304 |
mova m4, m5 |

1305 |
pxor m5, m0 |

1306 |
pxor m0, m6 |

1307 |
psubsb m5, m0 ; q0-p0 (signed) |

1308 |
paddsb m2, m5 |

1309 |
paddsb m2, m5 |

1310 |
paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0) |

1311 |
pand m2, m3 ; apply filter mask (m3) |

1312 | |

1313 |
mova m3, [pb_F8] |

1314 |
mova m1, m2 |

1315 |
paddsb m2, [pb_4] ; f1<<3=a+4 |

1316 |
paddsb m1, [pb_3] ; f2<<3=a+3 |

1317 |
pand m2, m3 |

1318 |
pand m1, m3 ; cache f2<<3 |

1319 | |

1320 |
pxor m0, m0 |

1321 |
pxor m3, m3 |

1322 |
pcmpgtb m0, m2 ; which values are <0? |

1323 |
psubb m3, m2 ; -f1<<3 |

1324 |
psrlq m2, 3 ; +f1 |

1325 |
psrlq m3, 3 ; -f1 |

1326 |
pand m3, m0 |

1327 |
pandn m0, m2 |

1328 |
psubusb m4, m0 |

1329 |
paddusb m4, m3 ; q0-f1 |

1330 | |

1331 |
pxor m0, m0 |

1332 |
pxor m3, m3 |

1333 |
pcmpgtb m0, m1 ; which values are <0? |

1334 |
psubb m3, m1 ; -f2<<3 |

1335 |
psrlq m1, 3 ; +f2 |

1336 |
psrlq m3, 3 ; -f2 |

1337 |
pand m3, m0 |

1338 |
pandn m0, m1 |

1339 |
paddusb m6, m0 |

1340 |
psubusb m6, m3 ; p0+f2 |

1341 | |

1342 |
; store |

1343 |
%ifidn %2, v |

1344 |
mova [r0], m4 |

1345 |
mova [r0+r1], m6 |

1346 |
%else ; h |

1347 |
mova m0, [rsp] ; p1 |

1348 |
SWAP 2, 4 ; p0 |

1349 |
SWAP 1, 6 ; q0 |

1350 |
mova m3, [rsp+mmsize] ; q1 |

1351 | |

1352 |
TRANSPOSE4x4B 0, 1, 2, 3, 4 |

1353 |
%if mmsize == 16 ; sse2 |

1354 |
add r3, r1 ; change from r4*8*stride to r0+8*stride |

1355 |
WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2, 16 |

1356 |
%else ; mmx/mmxext |

1357 |
WRITE_4x2D 0, 1, 2, 3, r0, r4, r1, r2 |

1358 |
%endif |

1359 |
%endif |

1360 | |

1361 |
%if mmsize == 8 ; mmx/mmxext |

1362 |
; next 8 pixels |

1363 |
%ifidn %2, v |

1364 |
add r0, 8 ; advance 8 cols = pixels |

1365 |
%else ; h |

1366 |
lea r0, [r0+r2*8] ; advance 8 rows = lines |

1367 |
%endif |

1368 |
dec r3 |

1369 |
jg .next8px |

1370 |
%ifidn %2, v |

1371 |
REP_RET |

1372 |
%else ; h |

1373 |
mov rsp, r5 ; restore stack pointer |

1374 |
RET |

1375 |
%endif |

1376 |
%else ; sse2 |

1377 |
%ifidn %2, h |

1378 |
mov rsp, r5 ; restore stack pointer |

1379 |
%endif |

1380 |
RET |

1381 |
%endif |

1382 |
%endmacro |

1383 | |

1384 |
INIT_MMX |

1385 |
SIMPLE_LOOPFILTER mmx, v, 4 |

1386 |
SIMPLE_LOOPFILTER mmx, h, 6 |

1387 |
SIMPLE_LOOPFILTER mmxext, v, 4 |

1388 |
SIMPLE_LOOPFILTER mmxext, h, 6 |

1389 |
INIT_XMM |

1390 |
SIMPLE_LOOPFILTER sse2, v, 3 |

1391 |
SIMPLE_LOOPFILTER sse2, h, 6 |

1392 | |

1393 |
;----------------------------------------------------------------------------- |

1394 |
; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride, |

1395 |
; int flimE, int flimI, int hev_thr); |

1396 |
;----------------------------------------------------------------------------- |
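; note: flimE is the edge limit, flimI the interior limit and hev_thr the
; high-edge-variance threshold; p0/q0 are always filtered where the mask is
; set, while p1/q1 are only adjusted when no high edge variance is detected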

1397 | |

1398 |
%macro INNER_LOOPFILTER 5 |

1399 |
%if %4 == 8 ; chroma |

1400 |
cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5 |

1401 |
%define dst8_reg r1 |

1402 |
%define mstride_reg r2 |

1403 |
%define E_reg r3 |

1404 |
%define I_reg r4 |

1405 |
%define hev_thr_reg r5 |

1406 |
%else ; luma |

1407 |
cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 |

1408 |
%define mstride_reg r1 |

1409 |
%define E_reg r2 |

1410 |
%define I_reg r3 |

1411 |
%define hev_thr_reg r4 |

1412 |
%ifdef m8 ; x86-64, sse2 |

1413 |
%define dst8_reg r4 |

1414 |
%elif mmsize == 16 ; x86-32, sse2 |

1415 |
%define dst8_reg r5 |

1416 |
%else ; x86-32, mmx/mmxext |

1417 |
%define cnt_reg r5 |

1418 |
%endif |

1419 |
%endif |

1420 |
%define dst_reg r0 |

1421 |
%define stride_reg E_reg |

1422 |
%define dst2_reg I_reg |

1423 |
%ifndef m8 |

1424 |
%define stack_reg hev_thr_reg |

1425 |
%endif |

1426 | |

1427 |
%ifndef m8 ; mmx/mmxext or sse2 on x86-32 |

1428 |
; splat function arguments |

1429 |
SPLATB_REG m0, E_reg, %1 ; E |

1430 |
SPLATB_REG m1, I_reg, %1 ; I |

1431 |
SPLATB_REG m2, hev_thr_reg, %1 ; hev_thresh |

1432 | |

1433 |
; align stack |

1434 |
mov stack_reg, rsp ; backup stack pointer |

1435 |
and rsp, ~(mmsize-1) ; align stack |

1436 |
%ifidn %2, v |

1437 |
sub rsp, mmsize * 4 ; stack layout: [0]=E, [1]=I, [2]=hev_thr |

1438 |
; [3]=hev() result |

1439 |
%else ; h |

1440 |
sub rsp, mmsize * 5 ; extra storage space for transposes |

1441 |
%endif |

1442 | |

1443 |
%define flim_E [rsp] |

1444 |
%define flim_I [rsp+mmsize] |

1445 |
%define hev_thr [rsp+mmsize*2] |

1446 |
%define mask_res [rsp+mmsize*3] |

1447 |
%define p0backup [rsp+mmsize*3] |

1448 |
%define q0backup [rsp+mmsize*4] |

1449 | |

1450 |
mova flim_E, m0 |

1451 |
mova flim_I, m1 |

1452 |
mova hev_thr, m2 |

1453 | |

1454 |
%else ; sse2 on x86-64 |

1455 | |

1456 |
%define flim_E m9 |

1457 |
%define flim_I m10 |

1458 |
%define hev_thr m11 |

1459 |
%define mask_res m12 |

1460 |
%define p0backup m12 |

1461 |
%define q0backup m8 |

1462 | |

1463 |
; splat function arguments |

1464 |
SPLATB_REG flim_E, E_reg, %1 ; E |

1465 |
SPLATB_REG flim_I, I_reg, %1 ; I |

1466 |
SPLATB_REG hev_thr, hev_thr_reg, %1 ; hev_thresh |

1467 |
%endif |

1468 | |

1469 |
%if mmsize == 8 && %4 == 16 ; mmx/mmxext |

1470 |
mov cnt_reg, 2 |

1471 |
%endif |

1472 |
mov stride_reg, mstride_reg |

1473 |
neg mstride_reg |

1474 |
%ifidn %2, h |

1475 |
lea dst_reg, [dst_reg + stride_reg*4-4] |

1476 |
%if %4 == 8 |

1477 |
lea dst8_reg, [dst8_reg+ stride_reg*4-4] |

1478 |
%endif |

1479 |
%endif |

1480 | |

1481 |
%if mmsize == 8 |

1482 |
.next8px |

1483 |
%endif |

1484 |
; read |

1485 |
lea dst2_reg, [dst_reg + stride_reg] |

1486 |
%ifidn %2, v |

1487 |
%if %4 == 8 && mmsize == 16 |

1488 |
%define movrow movh |

1489 |
%else |

1490 |
%define movrow mova |

1491 |
%endif |

1492 |
movrow m0, [dst_reg +mstride_reg*4] ; p3 |

1493 |
movrow m1, [dst2_reg+mstride_reg*4] ; p2 |

1494 |
movrow m2, [dst_reg +mstride_reg*2] ; p1 |

1495 |
movrow m5, [dst2_reg] ; q1 |

1496 |
movrow m6, [dst2_reg+ stride_reg] ; q2 |

1497 |
movrow m7, [dst2_reg+ stride_reg*2] ; q3 |

1498 |
%if mmsize == 16 && %4 == 8 |

1499 |
movhps m0, [dst8_reg+mstride_reg*4] |

1500 |
movhps m2, [dst8_reg+mstride_reg*2] |

1501 |
add dst8_reg, stride_reg |

1502 |
movhps m1, [dst8_reg+mstride_reg*4] |

1503 |
movhps m5, [dst8_reg] |

1504 |
movhps m6, [dst8_reg+ stride_reg] |

1505 |
movhps m7, [dst8_reg+ stride_reg*2] |

1506 |
add dst8_reg, mstride_reg |

1507 |
%endif |

1508 |
%elif mmsize == 8 ; mmx/mmxext (h) |

1509 |
; read 8 rows of 8px each |

1510 |
movu m0, [dst_reg +mstride_reg*4] |

1511 |
movu m1, [dst2_reg+mstride_reg*4] |

1512 |
movu m2, [dst_reg +mstride_reg*2] |

1513 |
movu m3, [dst_reg +mstride_reg] |

1514 |
movu m4, [dst_reg] |

1515 |
movu m5, [dst2_reg] |

1516 |
movu m6, [dst2_reg+ stride_reg] |

1517 | |

1518 |
; 8x8 transpose |

1519 |
TRANSPOSE4x4B 0, 1, 2, 3, 7 |

1520 |
mova q0backup, m1 |

1521 |
movu m7, [dst2_reg+ stride_reg*2] |

1522 |
TRANSPOSE4x4B 4, 5, 6, 7, 1 |

1523 |
SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |

1524 |
SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |

1525 |
SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |

1526 |
mova m1, q0backup |

1527 |
mova q0backup, m2 ; store q0 |

1528 |
SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |

1529 |
mova p0backup, m5 ; store p0 |

1530 |
SWAP 1, 4 |

1531 |
SWAP 2, 4 |

1532 |
SWAP 6, 3 |

1533 |
SWAP 5, 3 |

1534 |
%else ; sse2 (h) |

1535 |
%if %4 == 16 |

1536 |
lea dst8_reg, [dst_reg + stride_reg*8] |

1537 |
%endif |

1538 | |

1539 |
; read 16 rows of 8px each, interleave |

1540 |
movh m0, [dst_reg +mstride_reg*4] |

1541 |
movh m1, [dst8_reg+mstride_reg*4] |

1542 |
movh m2, [dst_reg +mstride_reg*2] |

1543 |
movh m5, [dst8_reg+mstride_reg*2] |

1544 |
movh m3, [dst_reg +mstride_reg] |

1545 |
movh m6, [dst8_reg+mstride_reg] |

1546 |
movh m4, [dst_reg] |

1547 |
movh m7, [dst8_reg] |

1548 |
punpcklbw m0, m1 ; A/I |

1549 |
punpcklbw m2, m5 ; C/K |

1550 |
punpcklbw m3, m6 ; D/L |

1551 |
punpcklbw m4, m7 ; E/M |

1552 | |

1553 |
add dst8_reg, stride_reg |

1554 |
movh m1, [dst2_reg+mstride_reg*4] |

1555 |
movh m6, [dst8_reg+mstride_reg*4] |

1556 |
movh m5, [dst2_reg] |

1557 |
movh m7, [dst8_reg] |

1558 |
punpcklbw m1, m6 ; B/J |

1559 |
punpcklbw m5, m7 ; F/N |

1560 |
movh m6, [dst2_reg+ stride_reg] |

1561 |
movh m7, [dst8_reg+ stride_reg] |

1562 |
punpcklbw m6, m7 ; G/O |

1563 | |

1564 |
; 8x16 transpose |

1565 |
TRANSPOSE4x4B 0, 1, 2, 3, 7 |

1566 |
%ifdef m8 |

1567 |
SWAP 1, 8 |

1568 |
%else |

1569 |
mova q0backup, m1 |

1570 |
%endif |

1571 |
movh m7, [dst2_reg+ stride_reg*2] |

1572 |
movh m1, [dst8_reg+ stride_reg*2] |

1573 |
punpcklbw m7, m1 ; H/P |

1574 |
TRANSPOSE4x4B 4, 5, 6, 7, 1 |

1575 |
SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |

1576 |
SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |

1577 |
SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |

1578 |
%ifdef m8 |

1579 |
SWAP 1, 8 |

1580 |
SWAP 2, 8 |

1581 |
%else |

1582 |
mova m1, q0backup |

1583 |
mova q0backup, m2 ; store q0 |

1584 |
%endif |

1585 |
SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |

1586 |
%ifdef m12 |

1587 |
SWAP 5, 12 |

1588 |
%else |

1589 |
mova p0backup, m5 ; store p0 |

1590 |
%endif |

1591 |
SWAP 1, 4 |

1592 |
SWAP 2, 4 |

1593 |
SWAP 6, 3 |

1594 |
SWAP 5, 3 |

1595 |
%endif |

1596 | |

1597 |
; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 |

1598 |
mova m4, m1 |

1599 |
SWAP 4, 1 |

1600 |
psubusb m4, m0 ; p2-p3 |

1601 |
psubusb m0, m1 ; p3-p2 |

1602 |
por m0, m4 ; abs(p3-p2) |

1603 | |

1604 |
mova m4, m2 |

1605 |
SWAP 4, 2 |

1606 |
psubusb m4, m1 ; p1-p2 |

1607 |
psubusb m1, m2 ; p2-p1 |

1608 |
por m1, m4 ; abs(p2-p1) |

1609 | |

1610 |
mova m4, m6 |

1611 |
SWAP 4, 6 |

1612 |
psubusb m4, m7 ; q2-q3 |

1613 |
psubusb m7, m6 ; q3-q2 |

1614 |
por m7, m4 ; abs(q3-q2) |

1615 | |

1616 |
mova m4, m5 |

1617 |
SWAP 4, 5 |

1618 |
psubusb m4, m6 ; q1-q2 |

1619 |
psubusb m6, m5 ; q2-q1 |

1620 |
por m6, m4 ; abs(q2-q1) |

1621 | |

1622 |
%ifidn %1, mmx |

1623 |
mova m4, flim_I |

1624 |
pxor m3, m3 |

1625 |
psubusb m0, m4 |

1626 |
psubusb m1, m4 |

1627 |
psubusb m7, m4 |

1628 |
psubusb m6, m4 |

1629 |
pcmpeqb m0, m3 ; abs(p3-p2) <= I |

1630 |
pcmpeqb m1, m3 ; abs(p2-p1) <= I |

1631 |
pcmpeqb m7, m3 ; abs(q3-q2) <= I |

1632 |
pcmpeqb m6, m3 ; abs(q2-q1) <= I |

1633 |
pand m0, m1 |

1634 |
pand m7, m6 |

1635 |
pand m0, m7 |

1636 |
%else ; mmxext/sse2 |

1637 |
pmaxub m0, m1 |

1638 |
pmaxub m6, m7 |

1639 |
pmaxub m0, m6 |

1640 |
%endif |

1641 | |

1642 |
; normal_limit and high_edge_variance for p1-p0, q1-q0 |

1643 |
SWAP 7, 3 ; now m7 is zero |

1644 |
%ifidn %2, v |

1645 |
movrow m3, [dst_reg +mstride_reg] ; p0 |

1646 |
%if mmsize == 16 && %4 == 8 |

1647 |
movhps m3, [dst8_reg+mstride_reg] |

1648 |
%endif |

1649 |
%elifdef m12 |

1650 |
SWAP 3, 12 |

1651 |
%else |

1652 |
mova m3, p0backup |

1653 |
%endif |

1654 | |

1655 |
mova m1, m2 |

1656 |
SWAP 1, 2 |

1657 |
mova m6, m3 |

1658 |
SWAP 3, 6 |

1659 |
psubusb m1, m3 ; p1-p0 |

1660 |
psubusb m6, m2 ; p0-p1 |

1661 |
por m1, m6 ; abs(p1-p0) |

1662 |
%ifidn %1, mmx |

1663 |
mova m6, m1 |

1664 |
psubusb m1, m4 |

1665 |
psubusb m6, hev_thr |

1666 |
pcmpeqb m1, m7 ; abs(p1-p0) <= I |

1667 |
pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh |

1668 |
pand m0, m1 |

1669 |
mova mask_res, m6 |

1670 |
%else ; mmxext/sse2 |

1671 |
pmaxub m0, m1 ; max_I |

1672 |
SWAP 1, 4 ; max_hev_thresh |

1673 |
%endif |

1674 | |

1675 |
SWAP 6, 4 ; now m6 is I |

1676 |
%ifidn %2, v |

1677 |
movrow m4, [dst_reg] ; q0 |

1678 |
%if mmsize == 16 && %4 == 8 |

1679 |
movhps m4, [dst8_reg] |

1680 |
%endif |

1681 |
%elifdef m8 |

1682 |
SWAP 4, 8 |

1683 |
%else |

1684 |
mova m4, q0backup |

1685 |
%endif |

1686 |
mova m1, m4 |

1687 |
SWAP 1, 4 |

1688 |
mova m7, m5 |

1689 |
SWAP 7, 5 |

1690 |
psubusb m1, m5 ; q0-q1 |

1691 |
psubusb m7, m4 ; q1-q0 |

1692 |
por m1, m7 ; abs(q1-q0) |

1693 |
%ifidn %1, mmx |

1694 |
mova m7, m1 |

1695 |
psubusb m1, m6 |

1696 |
psubusb m7, hev_thr |

1697 |
pxor m6, m6 |

1698 |
pcmpeqb m1, m6 ; abs(q1-q0) <= I |

1699 |
pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh |

1700 |
mova m6, mask_res |

1701 |
pand m0, m1 ; abs([pq][321]-[pq][210]) <= I |

1702 |
pand m6, m7 |

1703 |
%else ; mmxext/sse2 |

1704 |
pxor m7, m7 |

1705 |
pmaxub m0, m1 |

1706 |
pmaxub m6, m1 |

1707 |
psubusb m0, flim_I |

1708 |
psubusb m6, hev_thr |

1709 |
pcmpeqb m0, m7 ; max(abs(..)) <= I |

1710 |
pcmpeqb m6, m7 ; !(max(abs..) > thresh) |

1711 |
%endif |

1712 |
%ifdef m12 |

1713 |
SWAP 6, 12 |

1714 |
%else |

1715 |
mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) |

1716 |
%endif |

1717 | |

1718 |
; simple_limit |

1719 |
mova m1, m3 |

1720 |
SWAP 1, 3 |

1721 |
mova m6, m4 ; keep copies of p0/q0 around for later use |

1722 |
SWAP 6, 4 |

1723 |
psubusb m1, m4 ; p0-q0 |

1724 |
psubusb m6, m3 ; q0-p0 |

1725 |
por m1, m6 ; abs(q0-p0) |

1726 |
paddusb m1, m1 ; m1=2*abs(q0-p0) |

1727 | |

1728 |
mova m7, m2 |

1729 |
SWAP 7, 2 |

1730 |
mova m6, m5 |

1731 |
SWAP 6, 5 |

1732 |
psubusb m7, m5 ; p1-q1 |

1733 |
psubusb m6, m2 ; q1-p1 |

1734 |
por m7, m6 ; abs(q1-p1) |

1735 |
pxor m6, m6 |

1736 |
pand m7, [pb_FE] |

1737 |
psrlq m7, 1 ; abs(q1-p1)/2 |

1738 |
paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 |

1739 |
psubusb m7, flim_E |

1740 |
pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E |

1741 |
pand m0, m7 ; normal_limit result |

1742 | |

1743 |
; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask |

1744 |
%ifdef m8 ; x86-64 && sse2 |

1745 |
mova m8, [pb_80] |

1746 |
%define pb_80_var m8 |

1747 |
%else ; x86-32 or mmx/mmxext |

1748 |
%define pb_80_var [pb_80] |

1749 |
%endif |

1750 |
mova m1, m4 |

1751 |
mova m7, m3 |

1752 |
pxor m1, pb_80_var |

1753 |
pxor m7, pb_80_var |

1754 |
psubsb m1, m7 ; (signed) q0-p0 |

1755 |
mova m6, m2 |

1756 |
mova m7, m5 |

1757 |
pxor m6, pb_80_var |

1758 |
pxor m7, pb_80_var |

1759 |
psubsb m6, m7 ; (signed) p1-q1 |

1760 |
mova m7, mask_res |

1761 |
pandn m7, m6 |

1762 |
paddsb m7, m1 |

1763 |
paddsb m7, m1 |

1764 |
paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1) |

1765 | |

1766 |
pand m7, m0 |

1767 |
mova m1, [pb_F8] |

1768 |
mova m6, m7 |

1769 |
paddsb m7, [pb_3] |

1770 |
paddsb m6, [pb_4] |

1771 |
pand m7, m1 |

1772 |
pand m6, m1 |

1773 | |

1774 |
pxor m1, m1 |

1775 |
pxor m0, m0 |

1776 |
pcmpgtb m1, m7 |

1777 |
psubb m0, m7 |

1778 |
psrlq m7, 3 ; +f2 |

1779 |
psrlq m0, 3 ; -f2 |

1780 |
pand m0, m1 |

1781 |
pandn m1, m7 |

1782 |
psubusb m3, m0 |

1783 |
paddusb m3, m1 ; p0+f2 |

1784 | |

1785 |
pxor m1, m1 |

1786 |
pxor m0, m0 |

1787 |
pcmpgtb m0, m6 |

1788 |
psubb m1, m6 |

1789 |
psrlq m6, 3 ; +f1 |

1790 |
psrlq m1, 3 ; -f1 |

1791 |
pand m1, m0 |

1792 |
pandn m0, m6 |

1793 |
psubusb m4, m0 |

1794 |
paddusb m4, m1 ; q0-f1 |

1795 | |

1796 |
%ifdef m12 |

1797 |
SWAP 6, 12 |

1798 |
%else |

1799 |
mova m6, mask_res |

1800 |
%endif |

1801 |
%ifidn %1, mmx |

1802 |
mova m7, [pb_1] |

1803 |
%else ; mmxext/sse2 |

1804 |
pxor m7, m7 |

1805 |
%endif |

1806 |
pand m0, m6 |

1807 |
pand m1, m6 |

1808 |
%ifidn %1, mmx |

1809 |
paddusb m0, m7 |

1810 |
pand m1, [pb_FE] |

1811 |
pandn m7, m0 |

1812 |
psrlq m1, 1 |

1813 |
psrlq m7, 1 |

1814 |
SWAP 0, 7 |

1815 |
%else ; mmxext/sse2 |

1816 |
psubusb m1, [pb_1] |

1817 |
pavgb m0, m7 ; a |

1818 |
pavgb m1, m7 ; -a |

1819 |
%endif |

1820 |
psubusb m5, m0 |

1821 |
psubusb m2, m1 |

1822 |
paddusb m5, m1 ; q1-a |

1823 |
paddusb m2, m0 ; p1+a |

1824 | |

1825 |
; store |

1826 |
%ifidn %2, v |

1827 |
movrow [dst_reg +mstride_reg*2], m2 |

1828 |
movrow [dst_reg +mstride_reg ], m3 |

1829 |
movrow [dst_reg], m4 |

1830 |
movrow [dst_reg + stride_reg ], m5 |

1831 |
%if mmsize == 16 && %4 == 8 |

1832 |
movhps [dst8_reg+mstride_reg*2], m2 |

1833 |
movhps [dst8_reg+mstride_reg ], m3 |

1834 |
movhps [dst8_reg], m4 |

1835 |
movhps [dst8_reg+ stride_reg ], m5 |

1836 |
%endif |

1837 |
%else ; h |

1838 |
add dst_reg, 2 |

1839 |
add dst2_reg, 2 |

1840 | |

1841 |
; 4x8/16 transpose |

1842 |
TRANSPOSE4x4B 2, 3, 4, 5, 6 |

1843 | |

1844 |
%if mmsize == 8 ; mmx/mmxext (h) |

1845 |
WRITE_4x2D 2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg |

1846 |
%else ; sse2 (h) |

1847 |
lea dst8_reg, [dst8_reg+mstride_reg+2] |

1848 |
WRITE_4x4D 2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4 |

1849 |
%endif |

1850 |
%endif |

1851 | |

1852 |
%if mmsize == 8 |

1853 |
%if %4 == 8 ; chroma |

1854 |
%ifidn %2, h |

1855 |
sub dst_reg, 2 |

1856 |
%endif |

1857 |
cmp dst_reg, dst8_reg |

1858 |
mov dst_reg, dst8_reg |

1859 |
jnz .next8px |

1860 |
%else |

1861 |
%ifidn %2, h |

1862 |
lea dst_reg, [dst_reg + stride_reg*8-2] |

1863 |
%else ; v |

1864 |
add dst_reg, 8 |

1865 |
%endif |

1866 |
dec cnt_reg |

1867 |
jg .next8px |

1868 |
%endif |

1869 |
%endif |

1870 | |

1871 |
%ifndef m8 ; sse2 on x86-32 or mmx/mmxext |

1872 |
mov rsp, stack_reg ; restore stack pointer |

1873 |
%endif |

1874 |
RET |

1875 |
%endmacro |

1876 | |

1877 |
INIT_MMX |

1878 |
INNER_LOOPFILTER mmx, v, 6, 16, 8 |

1879 |
INNER_LOOPFILTER mmx, h, 6, 16, 8 |

1880 |
INNER_LOOPFILTER mmxext, v, 6, 16, 8 |

1881 |
INNER_LOOPFILTER mmxext, h, 6, 16, 8 |

1882 | |

1883 |
INNER_LOOPFILTER mmx, v, 6, 8, 8 |

1884 |
INNER_LOOPFILTER mmx, h, 6, 8, 8 |

1885 |
INNER_LOOPFILTER mmxext, v, 6, 8, 8 |

1886 |
INNER_LOOPFILTER mmxext, h, 6, 8, 8 |

1887 | |

1888 |
INIT_XMM |

1889 |
INNER_LOOPFILTER sse2, v, 5, 16, 13 |

1890 |
%ifdef m8 |

1891 |
INNER_LOOPFILTER sse2, h, 5, 16, 13 |

1892 |
%else |

1893 |
INNER_LOOPFILTER sse2, h, 6, 16, 13 |

1894 |
%endif |

1895 |
INNER_LOOPFILTER sse2, v, 6, 8, 13 |

1896 |
INNER_LOOPFILTER sse2, h, 6, 8, 13 |