## ffmpeg / libavcodec / x86 / vp8dsp.asm @ 2912e87a

History | View | Annotate | Download (78.4 KB)

1 |
;****************************************************************************** |
---|---|

2 |
;* VP8 MMXEXT optimizations |

3 |
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> |

4 |
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> |

5 |
;* |

6 |
;* This file is part of Libav. |

7 |
;* |

8 |
;* Libav is free software; you can redistribute it and/or |

9 |
;* modify it under the terms of the GNU Lesser General Public |

10 |
;* License as published by the Free Software Foundation; either |

11 |
;* version 2.1 of the License, or (at your option) any later version. |

12 |
;* |

13 |
;* Libav is distributed in the hope that it will be useful, |

14 |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |

15 |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |

16 |
;* Lesser General Public License for more details. |

17 |
;* |

18 |
;* You should have received a copy of the GNU Lesser General Public |

19 |
;* License along with Libav; if not, write to the Free Software |

20 |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |

21 |
;****************************************************************************** |

22 | |

23 |
%include "x86inc.asm" |

24 |
%include "x86util.asm" |

25 | |

26 |
SECTION_RODATA |

27 | |

28 |
; Word (16-bit) coefficient pairs used with pmaddwd by the MMXEXT horizontal
; filters. Every row repeats one coefficient pair four times (16 bytes).
; 4-tap table: two consecutive rows per filter, (F0,F1) then (F2,F3).
fourtap_filter_hw_m: times 4 dw -6, 123 |

29 |
times 4 dw 12, -1 |

30 |
times 4 dw -9, 93 |

31 |
times 4 dw 50, -6 |

32 |
times 4 dw -6, 50 |

33 |
times 4 dw 93, -9 |

34 |
times 4 dw -1, 12 |

35 |
times 4 dw 123, -6 |

36 | |

37 |
; 6-tap table: three consecutive rows per filter, (F0,F1), (F2,F3), (F4,F5).
sixtap_filter_hw_m: times 4 dw 2, -11 |

38 |
times 4 dw 108, 36 |

39 |
times 4 dw -8, 1 |

40 |
times 4 dw 3, -16 |

41 |
times 4 dw 77, 77 |

42 |
times 4 dw -16, 3 |

43 |
times 4 dw 1, -8 |

44 |
times 4 dw 36, 108 |

45 |
times 4 dw -11, 2 |

46 | |

47 |
; Byte coefficient pairs used with pmaddubsw by the SSSE3 filters. Every row
; repeats one coefficient pair eight times (16 bytes).
; 4-tap table: two consecutive rows per filter, (F0,F1) then (F2,F3).
fourtap_filter_hb_m: times 8 db -6, 123 |

48 |
times 8 db 12, -1 |

49 |
times 8 db -9, 93 |

50 |
times 8 db 50, -6 |

51 |
times 8 db -6, 50 |

52 |
times 8 db 93, -9 |

53 |
times 8 db -1, 12 |

54 |
times 8 db 123, -6 |

55 | |

56 |
; 6-tap table: three consecutive rows per filter. The first row pairs the two
; outermost taps (F0,F5), matching the 0/5 pixel pairing of filter_h6_shuf1;
; the following rows hold (F1,F2) and (F3,F4).
sixtap_filter_hb_m: times 8 db 2, 1 |

57 |
times 8 db -11, 108 |

58 |
times 8 db 36, -8 |

59 |
times 8 db 3, 3 |

60 |
times 8 db -16, 77 |

61 |
times 8 db 77, -16 |

62 |
times 8 db 1, 2 |

63 |
times 8 db -8, 36 |

64 |
times 8 db 108, -11 |

65 | |

66 |
; Broadcast word coefficients used with pmullw: every row is one tap repeated
; eight times (16 bytes).
; 4-tap table: four consecutive rows (F0..F3) per filter, 64 bytes per filter.
fourtap_filter_v_m: times 8 dw -6 |

67 |
times 8 dw 123 |

68 |
times 8 dw 12 |

69 |
times 8 dw -1 |

70 |
times 8 dw -9 |

71 |
times 8 dw 93 |

72 |
times 8 dw 50 |

73 |
times 8 dw -6 |

74 |
times 8 dw -6 |

75 |
times 8 dw 50 |

76 |
times 8 dw 93 |

77 |
times 8 dw -9 |

78 |
times 8 dw -1 |

79 |
times 8 dw 12 |

80 |
times 8 dw 123 |

81 |
times 8 dw -6 |

82 | |

83 |
; 6-tap table: six consecutive rows (F0..F5) per filter, 96 bytes per filter.
sixtap_filter_v_m: times 8 dw 2 |

84 |
times 8 dw -11 |

85 |
times 8 dw 108 |

86 |
times 8 dw 36 |

87 |
times 8 dw -8 |

88 |
times 8 dw 1 |

89 |
times 8 dw 3 |

90 |
times 8 dw -16 |

91 |
times 8 dw 77 |

92 |
times 8 dw 77 |

93 |
times 8 dw -16 |

94 |
times 8 dw 3 |

95 |
times 8 dw 1 |

96 |
times 8 dw -8 |

97 |
times 8 dw 36 |

98 |
times 8 dw 108 |

99 |
times 8 dw -11 |

100 |
times 8 dw 2 |

101 | |

102 |
; Bilinear weights as broadcast words for pmullw: row k (k = 1..7) holds the
; value k eight times. The word-based bilinear functions index it with both
; the fraction and (8 - fraction).
bilinear_filter_vw_m: times 8 dw 1 |

103 |
times 8 dw 2 |

104 |
times 8 dw 3 |

105 |
times 8 dw 4 |

106 |
times 8 dw 5 |

107 |
times 8 dw 6 |

108 |
times 8 dw 7 |

109 | |

110 |
; Bilinear weights as byte pairs for pmaddubsw: row k (k = 1..7) holds the
; pair (8 - k, k) eight times.
bilinear_filter_vb_m: times 8 db 7, 1 |

111 |
times 8 db 6, 2 |

112 |
times 8 db 5, 3 |

113 |
times 8 db 4, 4 |

114 |
times 8 db 3, 5 |

115 |
times 8 db 2, 6 |

116 |
times 8 db 1, 7 |

117 | |

118 |
; Table name aliases used in addressing expressions.
; With PIC defined, every table alias expands to r11; each function loads r11
; with the address of the table it needs (lea r11, [<table>_m]) before using
; the alias. Without PIC the alias is simply the table's label.
%ifdef PIC |

119 |
%define fourtap_filter_hw r11 |

120 |
%define sixtap_filter_hw r11 |

121 |
%define fourtap_filter_hb r11 |

122 |
%define sixtap_filter_hb r11 |

123 |
%define fourtap_filter_v r11 |

124 |
%define sixtap_filter_v r11 |

125 |
%define bilinear_filter_vw r11 |

126 |
%define bilinear_filter_vb r11 |

127 |
%else |

128 |
%define fourtap_filter_hw fourtap_filter_hw_m |

129 |
%define sixtap_filter_hw sixtap_filter_hw_m |

130 |
%define fourtap_filter_hb fourtap_filter_hb_m |

131 |
%define sixtap_filter_hb sixtap_filter_hb_m |

132 |
%define fourtap_filter_v fourtap_filter_v_m |

133 |
%define sixtap_filter_v sixtap_filter_v_m |

134 |
%define bilinear_filter_vw bilinear_filter_vw_m |

135 |
%define bilinear_filter_vb bilinear_filter_vb_m |

136 |
%endif |

137 | |

138 |
; pshufb masks that turn a run of consecutive pixels into adjacent byte pairs
; for pmaddubsw.
; filter_h2_shuf pairs pixels (n, n+1); filter_h4_shuf pairs (n+2, n+3).
filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |

139 |
filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 |

140 | |

141 |
; 6-tap masks: shuf1 pairs pixels (n, n+5), shuf2 pairs (n+1, n+2) and shuf3
; pairs (n+3, n+4).
filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 |

142 |
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 |

143 |
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 |

144 | |

145 |
; IDCT multiplier constants, loaded into m6/m7 by vp8_idct_add.
pw_20091: times 4 dw 20091 |

146 |
pw_17734: times 4 dw 17734 |

147 | |

148 |
; Byte-pair constants (x, 63); not referenced in the part of the file visible
; here.
pb_27_63: times 8 db 27, 63 |

149 |
pb_18_63: times 8 db 18, 63 |

150 |
pb_9_63: times 8 db 9, 63 |

151 | |

152 |
cextern pb_1 |

153 |
cextern pw_3 |

154 |
cextern pb_3 |

155 |
cextern pw_4 |

156 |
cextern pb_4 |

157 |
cextern pw_9 |

158 |
cextern pw_18 |

159 |
cextern pw_27 |

160 |
cextern pw_63 |

161 |
cextern pw_64 |

162 |
cextern pb_80 |

163 |
cextern pb_F8 |

164 |
cextern pb_FE |

165 | |

166 |
SECTION .text |

167 | |

168 |
;----------------------------------------------------------------------------- |

169 |
; subpel MC functions: |

170 |
; |

171 |
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride, |

172 |
; uint8_t *src, int srcstride, |

173 |
; int height, int mx, int my); |

174 |
;----------------------------------------------------------------------------- |

175 | |

176 |
%macro FILTER_SSSE3 3 |

177 |
; Horizontal 6-tap subpel filter (SSSE3), %1 pixels wide, one row per loop.
; Register roles follow the subpel MC prototype documented in this file
; (r0=dst, r1=dst stride, r2=src, r3=src stride, r4=height, r5=mx), assuming
; x86inc's cglobal maps argument N to rN.
; m3/m4 = shuffle masks, m5..m7 = the three coefficient rows of the filter.
cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2 |

178 |
; r5 = mx*3; scaled by 8 below this gives mx*24, i.e. 48 bytes per step of 2
; in mx (presumably mx is 2/4/6 for the 6-tap filters - confirm with caller)
lea r5d, [r5*3] |

179 |
mova m3, [filter_h6_shuf2] |

180 |
mova m4, [filter_h6_shuf3] |

181 |
%ifdef PIC |

182 |
lea r11, [sixtap_filter_hb_m] |

183 |
%endif |

184 |
mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes |

185 |
mova m6, [sixtap_filter_hb+r5*8-32] |

186 |
mova m7, [sixtap_filter_hb+r5*8-16] |

187 | |

188 |
.nextrow |

189 |
; load the source row starting two pixels left of the output position
movu m0, [r2-2] |

190 |
mova m1, m0 |

191 |
mova m2, m0 |

192 |
%ifidn %1, 4 |

193 |
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the |

194 |
; shuffle with a memory operand |

195 |
punpcklbw m0, [r2+3] |

196 |
%else |

197 |
pshufb m0, [filter_h6_shuf1] |

198 |
%endif |

199 |
pshufb m1, m3 |

200 |
pshufb m2, m4 |

201 |
; multiply each pixel pair by its coefficient pair and add within the pair
pmaddubsw m0, m5 |

202 |
pmaddubsw m1, m6 |

203 |
pmaddubsw m2, m7 |

204 |
paddsw m0, m1 |

205 |
paddsw m0, m2 |

206 |
; round: (sum + 64) >> 7, then clamp to unsigned bytes
paddsw m0, [pw_64] |

207 |
psraw m0, 7 |

208 |
packuswb m0, m0 |

209 |
movh [r0], m0 ; store |

210 | |

211 |
; go to next line |

212 |
add r0, r1 |

213 |
add r2, r3 |

214 |
dec r4d ; next row |

215 |
jg .nextrow |

216 |
REP_RET |

217 | |

218 |
; Horizontal 4-tap subpel filter (SSSE3), %1 pixels wide, one row per loop.
; Register roles follow the subpel MC prototype documented in this file
; (r0=dst, r1=dst stride, r2=src, r3=src stride, r4=height, r5=mx), assuming
; x86inc's cglobal maps argument N to rN.
; m2 = rounding constant, m3/m4 = shuffle masks, m5/m6 = coefficient rows.
cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3 |

219 |
; r5 = mx*16: byte offset of the second coefficient row for this mx
shl r5d, 4 |

220 |
mova m2, [pw_64] |

221 |
mova m3, [filter_h2_shuf] |

222 |
mova m4, [filter_h4_shuf] |

223 |
%ifdef PIC |

224 |
lea r11, [fourtap_filter_hb_m] |

225 |
%endif |

226 |
mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes |

227 |
mova m6, [fourtap_filter_hb+r5] |

228 | |

229 |
.nextrow |

230 |
; load the source row starting one pixel left of the output position
movu m0, [r2-1] |

231 |
mova m1, m0 |

232 |
pshufb m0, m3 |

233 |
pshufb m1, m4 |

234 |
pmaddubsw m0, m5 |

235 |
pmaddubsw m1, m6 |

236 |
; add the rounding constant, then the second tap pair; (sum + 64) >> 7
paddsw m0, m2 |

237 |
paddsw m0, m1 |

238 |
psraw m0, 7 |

239 |
packuswb m0, m0 |

240 |
movh [r0], m0 ; store |

241 | |

242 |
; go to next line |

243 |
add r0, r1 |

244 |
add r2, r3 |

245 |
dec r4d ; next row |

246 |
jg .nextrow |

247 |
REP_RET |

248 | |

249 |
; Vertical 4-tap subpel filter (SSSE3), %1 pixels wide, one row per loop.
; Register roles follow the subpel MC prototype documented in this file
; (r0=dst, r1=dst stride, r2=src, r3=src stride, r4=height, r6=my), assuming
; x86inc's cglobal maps argument N to rN.
; m0..m2 carry the three previous source rows between iterations;
; m5/m6 = coefficient rows, m7 = rounding constant.
cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2 |

250 |
; r6 = my*16: byte offset of the second coefficient row for this my
shl r6d, 4 |

251 |
%ifdef PIC |

252 |
lea r11, [fourtap_filter_hb_m] |

253 |
%endif |

254 |
mova m5, [fourtap_filter_hb+r6-16] |

255 |
mova m6, [fourtap_filter_hb+r6] |

256 |
mova m7, [pw_64] |

257 | |

258 |
; read 3 lines |

259 |
sub r2, r3 |

260 |
movh m0, [r2] |

261 |
movh m1, [r2+ r3] |

262 |
movh m2, [r2+2*r3] |

263 |
add r2, r3 |

264 | |

265 |
.nextrow |

266 |
movh m3, [r2+2*r3] ; read new row |

267 |
; interleave rows pairwise (row0,row1) and (row2,row3) for pmaddubsw while
; shifting the row window down by one (m0 <- m1 <- m2 <- m3)
mova m4, m0 |

268 |
mova m0, m1 |

269 |
punpcklbw m4, m1 |

270 |
mova m1, m2 |

271 |
punpcklbw m2, m3 |

272 |
pmaddubsw m4, m5 |

273 |
pmaddubsw m2, m6 |

274 |
paddsw m4, m2 |

275 |
mova m2, m3 |

276 |
; round: (sum + 64) >> 7, then clamp to unsigned bytes
paddsw m4, m7 |

277 |
psraw m4, 7 |

278 |
packuswb m4, m4 |

279 |
movh [r0], m4 |

280 | |

281 |
; go to next line |

282 |
add r0, r1 |

283 |
add r2, r3 |

284 |
dec r4d ; next row |

285 |
jg .nextrow |

286 |
REP_RET |

287 | |

288 |
; Vertical 6-tap subpel filter (SSSE3), %1 pixels wide, one row per loop.
; Register roles follow the subpel MC prototype documented in this file
; (r0=dst, r1=dst stride, r2=src, r3=src stride, r4=height, r6=my), assuming
; x86inc's cglobal maps argument N to rN.
; m0..m4 carry the five previous source rows between iterations; the
; coefficient rows are read from memory through r6 on every iteration.
cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2 |

289 |
lea r6d, [r6*3] |

290 |
%ifdef PIC |

291 |
lea r11, [sixtap_filter_hb_m] |

292 |
%endif |

293 |
; r6 = table + my*24; the filter's three rows are then at r6-48/-32/-16
lea r6, [sixtap_filter_hb+r6*8] |

294 | |

295 |
; read 5 lines |

296 |
sub r2, r3 |

297 |
sub r2, r3 |

298 |
movh m0, [r2] |

299 |
movh m1, [r2+r3] |

300 |
movh m2, [r2+r3*2] |

301 |
lea r2, [r2+r3*2] |

302 |
add r2, r3 |

303 |
movh m3, [r2] |

304 |
movh m4, [r2+r3] |

305 | |

306 |
.nextrow |

307 |
movh m5, [r2+2*r3] ; read new row |

308 |
; pair the rows as (row0,row5), (row1,row2), (row3,row4) to match the
; coefficient layout of sixtap_filter_hb_m
mova m6, m0 |

309 |
punpcklbw m6, m5 |

310 |
mova m0, m1 |

311 |
punpcklbw m1, m2 |

312 |
mova m7, m3 |

313 |
punpcklbw m7, m4 |

314 |
pmaddubsw m6, [r6-48] |

315 |
pmaddubsw m1, [r6-32] |

316 |
pmaddubsw m7, [r6-16] |

317 |
paddsw m6, m1 |

318 |
paddsw m6, m7 |

319 |
; the remaining mova instructions shift the row window down by one,
; interleaved with the round/shift/pack of the result
mova m1, m2 |

320 |
paddsw m6, [pw_64] |

321 |
mova m2, m3 |

322 |
psraw m6, 7 |

323 |
mova m3, m4 |

324 |
packuswb m6, m6 |

325 |
mova m4, m5 |

326 |
movh [r0], m6 |

327 | |

328 |
; go to next line |

329 |
add r0, r1 |

330 |
add r2, r3 |

331 |
dec r4d ; next row |

332 |
jg .nextrow |

333 |
REP_RET |

334 |
%endmacro |

335 | |

336 |
; Instantiate the SSSE3 filters for 4-pixel (MMX-sized registers) and 8-pixel
; (XMM registers) widths. Macro arguments: block width, then the value passed
; as cglobal's fourth argument for the h6/v4/v6 functions, then the one for
; the h4 function (the vector register count in x86inc - confirm there).
INIT_MMX |

337 |
FILTER_SSSE3 4, 0, 0 |

338 |
INIT_XMM |

339 |
FILTER_SSSE3 8, 8, 7 |

340 | |

341 |
; 4x4 block, H-only 4-tap filter |

342 |
; Horizontal 4-tap subpel filter (MMXEXT), 4 pixels wide, one row per loop
; iteration for r4 rows. Register roles follow the subpel MC prototype
; documented in this file (r0=dst, r1=dst stride, r2=src, r3=src stride,
; r4=height, r5=mx), assuming x86inc's cglobal maps argument N to rN.
; mm4/mm5 = coefficient word pairs, mm6 = zero, mm7 = rounding constant.
cglobal put_vp8_epel4_h4_mmxext, 6, 6 |

343 |
shl r5d, 4 |

344 |
%ifdef PIC |

345 |
lea r11, [fourtap_filter_hw_m] |

346 |
%endif |

347 |
movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words |

348 |
movq mm5, [fourtap_filter_hw+r5] |

349 |
movq mm7, [pw_64] |

350 |
pxor mm6, mm6 |

351 | |

352 |
.nextrow |

353 |
movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels |

354 | |

355 |
; first set of 2 pixels |

356 |
movq mm2, mm1 ; byte ABCD.. |

357 |
punpcklbw mm1, mm6 ; byte->word ABCD |

358 |
pshufw mm0, mm2, 9 ; byte CDEF.. |

359 |
punpcklbw mm0, mm6 ; byte->word CDEF |

360 |
pshufw mm3, mm1, 0x94 ; word ABBC |

361 |
pshufw mm1, mm0, 0x94 ; word CDDE |

362 |
pmaddwd mm3, mm4 ; multiply 2px with F0/F1 |

363 |
movq mm0, mm1 ; backup for second set of pixels |

364 |
pmaddwd mm1, mm5 ; multiply 2px with F2/F3 |

365 |
paddd mm3, mm1 ; finish 1st 2px |

366 | |

367 |
; second set of 2 pixels, use backup of above |

368 |
punpckhbw mm2, mm6 ; byte->word EFGH |

369 |
pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1 |

370 |
pshufw mm1, mm2, 0x94 ; word EFFG |

371 |
pmaddwd mm1, mm5 ; multiply 2px with F2/F3 |

372 |
paddd mm0, mm1 ; finish 2nd 2px |

373 | |

374 |
; merge two sets of 2 pixels into one set of 4, round/clip/store |

375 |
packssdw mm3, mm0 ; merge dword->word (4px) |

376 |
paddsw mm3, mm7 ; rounding |

377 |
psraw mm3, 7 |

378 |
packuswb mm3, mm6 ; clip and word->bytes |

379 |
movd [r0], mm3 ; store |

380 | |

381 |
; go to next line |

382 |
add r0, r1 |

383 |
add r2, r3 |

384 |
dec r4d ; next row |

385 |
jg .nextrow |

386 |
REP_RET |

387 | |

388 |
; 4x4 block, H-only 6-tap filter |

389 |
; Horizontal 6-tap subpel filter (MMXEXT), 4 pixels wide, one row per loop
; iteration for r4 rows. Register roles follow the subpel MC prototype
; documented in this file (r0=dst, r1=dst stride, r2=src, r3=src stride,
; r4=height, r5=mx), assuming x86inc's cglobal maps argument N to rN.
; mm4..mm6 = coefficient word pairs, mm7 = rounding constant. mm3 is zero at
; the top of each iteration but is reused as scratch inside the loop and
; re-zeroed before it is needed as zero again.
cglobal put_vp8_epel4_h6_mmxext, 6, 6 |

390 |
lea r5d, [r5*3] |

391 |
%ifdef PIC |

392 |
lea r11, [sixtap_filter_hw_m] |

393 |
%endif |

394 |
movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words |

395 |
movq mm5, [sixtap_filter_hw+r5*8-32] |

396 |
movq mm6, [sixtap_filter_hw+r5*8-16] |

397 |
movq mm7, [pw_64] |

398 |
pxor mm3, mm3 |

399 | |

400 |
.nextrow |

401 |
movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels |

402 | |

403 |
; first set of 2 pixels |

404 |
movq mm2, mm1 ; byte ABCD.. |

405 |
punpcklbw mm1, mm3 ; byte->word ABCD |

406 |
pshufw mm0, mm2, 0x9 ; byte CDEF.. |

407 |
punpckhbw mm2, mm3 ; byte->word EFGH |

408 |
punpcklbw mm0, mm3 ; byte->word CDEF |

409 |
pshufw mm1, mm1, 0x94 ; word ABBC |

410 |
pshufw mm2, mm2, 0x94 ; word EFFG |

411 |
pmaddwd mm1, mm4 ; multiply 2px with F0/F1 |

412 |
pshufw mm3, mm0, 0x94 ; word CDDE |

413 |
movq mm0, mm3 ; backup for second set of pixels |

414 |
pmaddwd mm3, mm5 ; multiply 2px with F2/F3 |

415 |
paddd mm1, mm3 ; add to 1st 2px cache |

416 |
movq mm3, mm2 ; backup for second set of pixels |

417 |
pmaddwd mm2, mm6 ; multiply 2px with F4/F5 |

418 |
paddd mm1, mm2 ; finish 1st 2px |

419 | |

420 |
; second set of 2 pixels, use backup of above |

421 |
movd mm2, [r2+3] ; byte FGHI (prevent overreads) |

422 |
pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1 |

423 |
pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3 |

424 |
paddd mm0, mm3 ; add to 2nd 2px cache |

425 |
pxor mm3, mm3 |

426 |
punpcklbw mm2, mm3 ; byte->word FGHI |

427 |
pshufw mm2, mm2, 0xE9 ; word GHHI |

428 |
pmaddwd mm2, mm6 ; multiply 2px with F4/F5 |

429 |
paddd mm0, mm2 ; finish 2nd 2px |

430 | |

431 |
; merge two sets of 2 pixels into one set of 4, round/clip/store |

432 |
packssdw mm1, mm0 ; merge dword->word (4px) |

433 |
paddsw mm1, mm7 ; rounding |

434 |
psraw mm1, 7 |

435 |
packuswb mm1, mm3 ; clip and word->bytes |

436 |
movd [r0], mm1 ; store |

437 | |

438 |
; go to next line |

439 |
add r0, r1 |

440 |
add r2, r3 |

441 |
dec r4d ; next row |

442 |
jg .nextrow |

443 |
REP_RET |

444 | |

445 |
; Horizontal 4-tap subpel filter (SSE2), 8 pixels wide, one row per loop.
; Register roles follow the subpel MC prototype documented in this file
; (r0=dst, r1=dst stride, r2=src, r3=src stride, r4=height, r5=mx), assuming
; x86inc's cglobal maps argument N to rN.
; Each tap is applied with pmullw on a separately loaded, zero-extended copy
; of the row. m4 = rounding constant, m7 = zero, m5/m6 (and m8/m9 when m8 is
; defined) = broadcast coefficients; otherwise taps 2/3 are read from memory.
INIT_XMM |

446 |
cglobal put_vp8_epel8_h4_sse2, 6, 6, 10 |

447 |
; r5 = mx*32; together with the -32 below this selects a 64-byte filter
shl r5d, 5 |

448 |
%ifdef PIC |

449 |
lea r11, [fourtap_filter_v_m] |

450 |
%endif |

451 |
lea r5, [fourtap_filter_v+r5-32] |

452 |
pxor m7, m7 |

453 |
mova m4, [pw_64] |

454 |
mova m5, [r5+ 0] |

455 |
mova m6, [r5+16] |

456 |
; m8 is presumably only defined when more than 8 vector registers are
; available - confirm in x86inc
%ifdef m8 |

457 |
mova m8, [r5+32] |

458 |
mova m9, [r5+48] |

459 |
%endif |

460 |
.nextrow |

461 |
; four loads of the row at pixel offsets -1..+2, one per tap
movq m0, [r2-1] |

462 |
movq m1, [r2-0] |

463 |
movq m2, [r2+1] |

464 |
movq m3, [r2+2] |

465 |
punpcklbw m0, m7 |

466 |
punpcklbw m1, m7 |

467 |
punpcklbw m2, m7 |

468 |
punpcklbw m3, m7 |

469 |
pmullw m0, m5 |

470 |
pmullw m1, m6 |

471 |
%ifdef m8 |

472 |
pmullw m2, m8 |

473 |
pmullw m3, m9 |

474 |
%else |

475 |
pmullw m2, [r5+32] |

476 |
pmullw m3, [r5+48] |

477 |
%endif |

478 |
paddsw m0, m1 |

479 |
paddsw m2, m3 |

480 |
paddsw m0, m2 |

481 |
; round: (sum + 64) >> 7, then clamp to unsigned bytes
paddsw m0, m4 |

482 |
psraw m0, 7 |

483 |
packuswb m0, m7 |

484 |
movh [r0], m0 ; store |

485 | |

486 |
; go to next line |

487 |
add r0, r1 |

488 |
add r2, r3 |

489 |
dec r4d ; next row |

490 |
jg .nextrow |

491 |
REP_RET |

492 | |

493 |
; Horizontal 6-tap subpel filter (SSE2), 8 pixels wide, one row per loop.
; Register roles follow the subpel MC prototype documented in this file
; (r0=dst, r1=dst stride, r2=src, r3=src stride, r4=height, r5=mx), assuming
; x86inc's cglobal maps argument N to rN.
; Each tap is applied with pmullw on a separately loaded, zero-extended copy
; of the row. m6 = rounding constant, m7 = zero; the six broadcast
; coefficients live in m8..m13 when m8 is defined, else are read via r5.
cglobal put_vp8_epel8_h6_sse2, 6, 6, 14 |

494 |
; r5 = mx*48; together with the -96 below this selects a 96-byte filter
lea r5d, [r5*3] |

495 |
shl r5d, 4 |

496 |
%ifdef PIC |

497 |
lea r11, [sixtap_filter_v_m] |

498 |
%endif |

499 |
lea r5, [sixtap_filter_v+r5-96] |

500 |
pxor m7, m7 |

501 |
mova m6, [pw_64] |

502 |
%ifdef m8 |

503 |
mova m8, [r5+ 0] |

504 |
mova m9, [r5+16] |

505 |
mova m10, [r5+32] |

506 |
mova m11, [r5+48] |

507 |
mova m12, [r5+64] |

508 |
mova m13, [r5+80] |

509 |
%endif |

510 |
.nextrow |

511 |
; six loads of the row at pixel offsets -2..+3, one per tap
movq m0, [r2-2] |

512 |
movq m1, [r2-1] |

513 |
movq m2, [r2-0] |

514 |
movq m3, [r2+1] |

515 |
movq m4, [r2+2] |

516 |
movq m5, [r2+3] |

517 |
punpcklbw m0, m7 |

518 |
punpcklbw m1, m7 |

519 |
punpcklbw m2, m7 |

520 |
punpcklbw m3, m7 |

521 |
punpcklbw m4, m7 |

522 |
punpcklbw m5, m7 |

523 |
%ifdef m8 |

524 |
pmullw m0, m8 |

525 |
pmullw m1, m9 |

526 |
pmullw m2, m10 |

527 |
pmullw m3, m11 |

528 |
pmullw m4, m12 |

529 |
pmullw m5, m13 |

530 |
%else |

531 |
pmullw m0, [r5+ 0] |

532 |
pmullw m1, [r5+16] |

533 |
pmullw m2, [r5+32] |

534 |
pmullw m3, [r5+48] |

535 |
pmullw m4, [r5+64] |

536 |
pmullw m5, [r5+80] |

537 |
%endif |

538 |
; taps 1 and 4 (negative in sixtap_filter_v_m) and taps 0 and 5 are summed
; first, before the large positive taps 2 and 3 are added with saturation
paddsw m1, m4 |

539 |
paddsw m0, m5 |

540 |
paddsw m1, m2 |

541 |
paddsw m0, m3 |

542 |
paddsw m0, m1 |

543 |
; round: (sum + 64) >> 7, then clamp to unsigned bytes
paddsw m0, m6 |

544 |
psraw m0, 7 |

545 |
packuswb m0, m7 |

546 |
movh [r0], m0 ; store |

547 | |

548 |
; go to next line |

549 |
add r0, r1 |

550 |
add r2, r3 |

551 |
dec r4d ; next row |

552 |
jg .nextrow |

553 |
REP_RET |

554 | |

555 |
%macro FILTER_V 3 |

556 |
; %2-pixel-wide block, V-only 4-tap filter, one row per loop iteration.
; Macro arguments: %1 = instruction-set suffix, %2 = block width, %3 = value
; passed as cglobal's fourth argument.
; Register roles follow the subpel MC prototype documented in this file
; (r0=dst, r1=dst stride, r2=src, r3=src stride, r4=height, r6=my), assuming
; x86inc's cglobal maps argument N to rN.
; m0..m2 carry the three previous rows (as words) between iterations;
; m5 = last tap, m6 = rounding constant, m7 = zero.
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 |

558 |
; r6 = my*32; together with the -32 below this selects a 64-byte filter
shl r6d, 5 |

559 |
%ifdef PIC |

560 |
lea r11, [fourtap_filter_v_m] |

561 |
%endif |

562 |
lea r6, [fourtap_filter_v+r6-32] |

563 |
mova m6, [pw_64] |

564 |
pxor m7, m7 |

565 |
mova m5, [r6+48] |

566 | |

567 |
; read 3 lines |

568 |
sub r2, r3 |

569 |
movh m0, [r2] |

570 |
movh m1, [r2+ r3] |

571 |
movh m2, [r2+2*r3] |

572 |
add r2, r3 |

573 |
punpcklbw m0, m7 |

574 |
punpcklbw m1, m7 |

575 |
punpcklbw m2, m7 |

576 | |

577 |
.nextrow |

578 |
; first calculate negative taps (to prevent losing positive overflows) |

579 |
movh m4, [r2+2*r3] ; read new row |

580 |
punpcklbw m4, m7 |

581 |
; keep an unmultiplied copy of the new row for the next iteration
mova m3, m4 |

582 |
pmullw m0, [r6+0] |

583 |
pmullw m4, m5 |

584 |
paddsw m4, m0 |

585 | |

586 |
; then calculate positive taps |

587 |
mova m0, m1 |

588 |
pmullw m1, [r6+16] |

589 |
paddsw m4, m1 |

590 |
mova m1, m2 |

591 |
pmullw m2, [r6+32] |

592 |
paddsw m4, m2 |

593 |
mova m2, m3 |

594 | |

595 |
; round/clip/store |

596 |
paddsw m4, m6 |

597 |
psraw m4, 7 |

598 |
packuswb m4, m7 |

599 |
movh [r0], m4 |

600 | |

601 |
; go to next line |

602 |
add r0, r1 |

603 |
add r2, r3 |

604 |
dec r4d ; next row |

605 |
jg .nextrow |

606 |
REP_RET |

607 | |

608 | |

609 |
; %2-pixel-wide block, V-only 6-tap filter, one row per loop iteration.
; Register roles follow the subpel MC prototype documented in this file
; (r0=dst, r1=dst stride, r2=src, r3=src stride, r4=height, r6=my), assuming
; x86inc's cglobal maps argument N to rN.
; m0..m4 carry the five previous rows (as words) between iterations;
; m7 = zero; coefficients are read from memory through r6.
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3 |

611 |
; r6 = my*48; together with the -96 below this selects a 96-byte filter
shl r6d, 4 |

612 |
lea r6, [r6*3] |

613 |
%ifdef PIC |

614 |
lea r11, [sixtap_filter_v_m] |

615 |
%endif |

616 |
lea r6, [sixtap_filter_v+r6-96] |

617 |
pxor m7, m7 |

618 | |

619 |
; read 5 lines |

620 |
sub r2, r3 |

621 |
sub r2, r3 |

622 |
movh m0, [r2] |

623 |
movh m1, [r2+r3] |

624 |
movh m2, [r2+r3*2] |

625 |
lea r2, [r2+r3*2] |

626 |
add r2, r3 |

627 |
movh m3, [r2] |

628 |
movh m4, [r2+r3] |

629 |
punpcklbw m0, m7 |

630 |
punpcklbw m1, m7 |

631 |
punpcklbw m2, m7 |

632 |
punpcklbw m3, m7 |

633 |
punpcklbw m4, m7 |

634 | |

635 |
.nextrow |

636 |
; first calculate negative taps (to prevent losing positive overflows) |

637 |
mova m5, m1 |

638 |
pmullw m5, [r6+16] |

639 |
mova m6, m4 |

640 |
pmullw m6, [r6+64] |

641 |
paddsw m6, m5 |

642 | |

643 |
; then calculate positive taps |

644 |
movh m5, [r2+2*r3] ; read new row |

645 |
punpcklbw m5, m7 |

646 |
pmullw m0, [r6+0] |

647 |
paddsw m6, m0 |

648 |
; from here on each row register is multiplied in place only after its
; value has been copied into the register of the row above it
mova m0, m1 |

649 |
mova m1, m2 |

650 |
pmullw m2, [r6+32] |

651 |
paddsw m6, m2 |

652 |
mova m2, m3 |

653 |
pmullw m3, [r6+48] |

654 |
paddsw m6, m3 |

655 |
mova m3, m4 |

656 |
mova m4, m5 |

657 |
pmullw m5, [r6+80] |

658 |
paddsw m6, m5 |

659 | |

660 |
; round/clip/store |

661 |
paddsw m6, [pw_64] |

662 |
psraw m6, 7 |

663 |
packuswb m6, m7 |

664 |
movh [r0], m6 |

665 | |

666 |
; go to next line |

667 |
add r0, r1 |

668 |
add r2, r3 |

669 |
dec r4d ; next row |

670 |
jg .nextrow |

671 |
REP_RET |

672 |
%endmacro |

673 | |

674 |
; Instantiate the word-based vertical filters: a 4-pixel-wide MMXEXT version
; and an 8-pixel-wide SSE2 version. Macro arguments: instruction-set suffix,
; block width, value passed as cglobal's fourth argument.
INIT_MMX |

675 |
FILTER_V mmxext, 4, 0 |

676 |
INIT_XMM |

677 |
FILTER_V sse2, 8, 8 |

678 | |

679 |
%macro FILTER_BILINEAR 3 |

680 |
; Vertical bilinear filter, %2 pixels wide, two output rows per iteration.
; Macro arguments: %1 = instruction-set suffix, %2 = block width, %3 = value
; passed as cglobal's fourth argument.
; As used below: r0/r1 = destination pointer and stride, r2/r3 = source
; pointer and stride, r4 = remaining rows, r6 = vertical fraction
; (presumably the same argument order as the epel functions - confirm).
; m4 = weight (8 - frac), m5 = weight frac, m6 = zero.
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 |

681 |
; r5 = (8 - frac)*16 and r6 = frac*16: byte offsets of the two weight rows
mov r5d, 8*16 |

682 |
shl r6d, 4 |

683 |
sub r5d, r6d |

684 |
%ifdef PIC |

685 |
lea r11, [bilinear_filter_vw_m] |

686 |
%endif |

687 |
pxor m6, m6 |

688 |
mova m4, [bilinear_filter_vw+r5-16] |

689 |
mova m5, [bilinear_filter_vw+r6-16] |

690 |
.nextrow |

691 |
movh m0, [r2+r3*0] |

692 |
movh m1, [r2+r3*1] |

693 |
movh m3, [r2+r3*2] |

694 |
punpcklbw m0, m6 |

695 |
punpcklbw m1, m6 |

696 |
punpcklbw m3, m6 |

697 |
; the middle row is the lower input of the first output row and the upper
; input of the second one
mova m2, m1 |

698 |
pmullw m0, m4 |

699 |
pmullw m1, m5 |

700 |
pmullw m2, m4 |

701 |
pmullw m3, m5 |

702 |
paddsw m0, m1 |

703 |
paddsw m2, m3 |

704 |
; >> 2 followed by pavgw against zero ((x + 1) >> 1): a rounded divide by 8
psraw m0, 2 |

705 |
psraw m2, 2 |

706 |
pavgw m0, m6 |

707 |
pavgw m2, m6 |

708 |
%ifidn %1, mmxext |

709 |
packuswb m0, m0 |

710 |
packuswb m2, m2 |

711 |
movh [r0+r1*0], m0 |

712 |
movh [r0+r1*1], m2 |

713 |
%else |

714 |
; pack both rows into one register; low half is row 0, high half is row 1
packuswb m0, m2 |

715 |
movh [r0+r1*0], m0 |

716 |
movhps [r0+r1*1], m0 |

717 |
%endif |

718 | |

719 |
lea r0, [r0+r1*2] |

720 |
lea r2, [r2+r3*2] |

721 |
sub r4d, 2 |

722 |
jg .nextrow |

723 |
REP_RET |

724 | |

725 |
; Horizontal bilinear filter, %2 pixels wide, two output rows per iteration.
; As used below: r0/r1 = destination pointer and stride, r2/r3 = source
; pointer and stride, r4 = remaining rows, r5 = horizontal fraction
; (presumably the same argument order as the epel functions - confirm).
; m4 = weight (8 - frac), m5 = weight frac, m6 = zero.
cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 |

726 |
; r6 = (8 - frac)*16 and r5 = frac*16: byte offsets of the two weight rows
mov r6d, 8*16 |

727 |
shl r5d, 4 |

728 |
sub r6d, r5d |

729 |
%ifdef PIC |

730 |
lea r11, [bilinear_filter_vw_m] |

731 |
%endif |

732 |
pxor m6, m6 |

733 |
mova m4, [bilinear_filter_vw+r6-16] |

734 |
mova m5, [bilinear_filter_vw+r5-16] |

735 |
.nextrow |

736 |
; for each of the two rows, load the pixels and their right-hand neighbours
movh m0, [r2+r3*0+0] |

737 |
movh m1, [r2+r3*0+1] |

738 |
movh m2, [r2+r3*1+0] |

739 |
movh m3, [r2+r3*1+1] |

740 |
punpcklbw m0, m6 |

741 |
punpcklbw m1, m6 |

742 |
punpcklbw m2, m6 |

743 |
punpcklbw m3, m6 |

744 |
pmullw m0, m4 |

745 |
pmullw m1, m5 |

746 |
pmullw m2, m4 |

747 |
pmullw m3, m5 |

748 |
paddsw m0, m1 |

749 |
paddsw m2, m3 |

750 |
; >> 2 followed by pavgw against zero ((x + 1) >> 1): a rounded divide by 8
psraw m0, 2 |

751 |
psraw m2, 2 |

752 |
pavgw m0, m6 |

753 |
pavgw m2, m6 |

754 |
%ifidn %1, mmxext |

755 |
packuswb m0, m0 |

756 |
packuswb m2, m2 |

757 |
movh [r0+r1*0], m0 |

758 |
movh [r0+r1*1], m2 |

759 |
%else |

760 |
; pack both rows into one register; low half is row 0, high half is row 1
packuswb m0, m2 |

761 |
movh [r0+r1*0], m0 |

762 |
movhps [r0+r1*1], m0 |

763 |
%endif |

764 | |

765 |
lea r0, [r0+r1*2] |

766 |
lea r2, [r2+r3*2] |

767 |
sub r4d, 2 |

768 |
jg .nextrow |

769 |
REP_RET |

770 |
%endmacro |

771 | |

772 |
; Instantiate the word-based bilinear filters: a 4-pixel-wide MMXEXT version
; and an 8-pixel-wide SSE2 version. Macro arguments: instruction-set suffix,
; block width, value passed as cglobal's fourth argument.
INIT_MMX |

773 |
FILTER_BILINEAR mmxext, 4, 0 |

774 |
INIT_XMM |

775 |
FILTER_BILINEAR sse2, 8, 7 |

776 | |

777 |
%macro FILTER_BILINEAR_SSSE3 1 |

778 |
; Vertical bilinear filter (SSSE3), %1 pixels wide, two output rows per
; iteration. As used below: r0/r1 = destination pointer and stride,
; r2/r3 = source pointer and stride, r4 = remaining rows, r6 = vertical
; fraction (presumably the same argument order as the epel functions -
; confirm). m3 = (8 - frac, frac) byte pairs, m4 = zero.
cglobal put_vp8_bilinear%1_v_ssse3, 7,7 |

779 |
; r6 = frac*16: byte offset one row past the weight row for this fraction
shl r6d, 4 |

780 |
%ifdef PIC |

781 |
lea r11, [bilinear_filter_vb_m] |

782 |
%endif |

783 |
pxor m4, m4 |

784 |
mova m3, [bilinear_filter_vb+r6-16] |

785 |
.nextrow |

786 |
movh m0, [r2+r3*0] |

787 |
movh m1, [r2+r3*1] |

788 |
movh m2, [r2+r3*2] |

789 |
; interleave vertically adjacent rows so pmaddubsw can weight and add them
punpcklbw m0, m1 |

790 |
punpcklbw m1, m2 |

791 |
pmaddubsw m0, m3 |

792 |
pmaddubsw m1, m3 |

793 |
; >> 2 followed by pavgw against zero ((x + 1) >> 1): a rounded divide by 8
psraw m0, 2 |

794 |
psraw m1, 2 |

795 |
pavgw m0, m4 |

796 |
pavgw m1, m4 |

797 |
%if mmsize==8 |

798 |
packuswb m0, m0 |

799 |
packuswb m1, m1 |

800 |
movh [r0+r1*0], m0 |

801 |
movh [r0+r1*1], m1 |

802 |
%else |

803 |
; pack both rows into one register; low half is row 0, high half is row 1
packuswb m0, m1 |

804 |
movh [r0+r1*0], m0 |

805 |
movhps [r0+r1*1], m0 |

806 |
%endif |

807 | |

808 |
lea r0, [r0+r1*2] |

809 |
lea r2, [r2+r3*2] |

810 |
sub r4d, 2 |

811 |
jg .nextrow |

812 |
REP_RET |

813 | |

814 |
; Horizontal bilinear filter (SSSE3), %1 pixels wide, two output rows per
; iteration. As used below: r0/r1 = destination pointer and stride,
; r2/r3 = source pointer and stride, r4 = remaining rows, r5 = horizontal
; fraction (presumably the same argument order as the epel functions -
; confirm). m2 = adjacent-pixel shuffle mask, m3 = (8 - frac, frac) byte
; pairs, m4 = zero.
cglobal put_vp8_bilinear%1_h_ssse3, 7,7 |

815 |
; r5 = frac*16: byte offset one row past the weight row for this fraction
shl r5d, 4 |

816 |
%ifdef PIC |

817 |
lea r11, [bilinear_filter_vb_m] |

818 |
%endif |

819 |
pxor m4, m4 |

820 |
mova m2, [filter_h2_shuf] |

821 |
mova m3, [bilinear_filter_vb+r5-16] |

822 |
.nextrow |

823 |
movu m0, [r2+r3*0] |

824 |
movu m1, [r2+r3*1] |

825 |
; rearrange each row into (pixel, right neighbour) byte pairs
pshufb m0, m2 |

826 |
pshufb m1, m2 |

827 |
pmaddubsw m0, m3 |

828 |
pmaddubsw m1, m3 |

829 |
; >> 2 followed by pavgw against zero ((x + 1) >> 1): a rounded divide by 8
psraw m0, 2 |

830 |
psraw m1, 2 |

831 |
pavgw m0, m4 |

832 |
pavgw m1, m4 |

833 |
%if mmsize==8 |

834 |
packuswb m0, m0 |

835 |
packuswb m1, m1 |

836 |
movh [r0+r1*0], m0 |

837 |
movh [r0+r1*1], m1 |

838 |
%else |

839 |
; pack both rows into one register; low half is row 0, high half is row 1
packuswb m0, m1 |

840 |
movh [r0+r1*0], m0 |

841 |
movhps [r0+r1*1], m0 |

842 |
%endif |

843 | |

844 |
lea r0, [r0+r1*2] |

845 |
lea r2, [r2+r3*2] |

846 |
sub r4d, 2 |

847 |
jg .nextrow |

848 |
REP_RET |

849 |
%endmacro |

850 | |

851 |
; Instantiate the SSSE3 bilinear filters for 4-pixel (MMX-sized registers)
; and 8-pixel (XMM registers) widths; the macro argument is the block width.
INIT_MMX |

852 |
FILTER_BILINEAR_SSSE3 4 |

853 |
INIT_XMM |

854 |
FILTER_BILINEAR_SSSE3 8 |

855 | |

856 |
; Copy an 8-byte-wide block, two rows per iteration.
; As used below: r0/r1 = destination pointer and stride, r2/r3 = source
; pointer and stride, r4 = number of rows (the loop steps by 2).
cglobal put_vp8_pixels8_mmx, 5,5 |

857 |
.nextrow: |

858 |
movq mm0, [r2+r3*0] |

859 |
movq mm1, [r2+r3*1] |

860 |
lea r2, [r2+r3*2] |

861 |
movq [r0+r1*0], mm0 |

862 |
movq [r0+r1*1], mm1 |

863 |
lea r0, [r0+r1*2] |

864 |
sub r4d, 2 |

865 |
jg .nextrow |

866 |
REP_RET |

867 | |

868 |
; Copy a 16-byte-wide block with 8-byte MMX moves, two rows per iteration.
; As used below: r0/r1 = destination pointer and stride, r2/r3 = source
; pointer and stride, r4 = number of rows (the loop steps by 2).
cglobal put_vp8_pixels16_mmx, 5,5 |

869 |
.nextrow: |

870 |
movq mm0, [r2+r3*0+0] |

871 |
movq mm1, [r2+r3*0+8] |

872 |
movq mm2, [r2+r3*1+0] |

873 |
movq mm3, [r2+r3*1+8] |

874 |
lea r2, [r2+r3*2] |

875 |
movq [r0+r1*0+0], mm0 |

876 |
movq [r0+r1*0+8], mm1 |

877 |
movq [r0+r1*1+0], mm2 |

878 |
movq [r0+r1*1+8], mm3 |

879 |
lea r0, [r0+r1*2] |

880 |
sub r4d, 2 |

881 |
jg .nextrow |

882 |
REP_RET |

883 | |

884 |
; Copy a 16-byte-wide block with SSE moves, two rows per iteration.
; As used below: r0/r1 = destination pointer and stride, r2/r3 = source
; pointer and stride, r4 = number of rows (the loop steps by 2).
; Loads are unaligned (movups) but stores use movaps, so both destination
; rows must be 16-byte aligned.
cglobal put_vp8_pixels16_sse, 5,5,2 |

885 |
.nextrow: |

886 |
movups xmm0, [r2+r3*0] |

887 |
movups xmm1, [r2+r3*1] |

888 |
lea r2, [r2+r3*2] |

889 |
movaps [r0+r1*0], xmm0 |

890 |
movaps [r0+r1*1], xmm1 |

891 |
lea r0, [r0+r1*2] |

892 |
sub r4d, 2 |

893 |
jg .nextrow |

894 |
REP_RET |

895 | |

896 |
;----------------------------------------------------------------------------- |

897 |
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |

898 |
;----------------------------------------------------------------------------- |

899 | |

900 |
; ADD_DC add, sub, offset, mov
; Apply a DC offset to four rows of pixels with unsigned saturation.
;   %1 = register added to every pixel byte (paddusb)
;   %2 = register subtracted from every pixel byte (psubusb)
;   %3 = byte offset applied to every row address
;   %4 = move instruction used for the row loads and stores
; Rows are addressed as [r0], [r0+r2], [r1], [r1+r2]; the callers in this
; file set r1 = r0 + 2*r2 beforehand. Clobbers m2..m5.
%macro ADD_DC 4 |

901 |
%4 m2, [r0+%3] |

902 |
%4 m3, [r0+r2+%3] |

903 |
%4 m4, [r1+%3] |

904 |
%4 m5, [r1+r2+%3] |

905 |
paddusb m2, %1 |

906 |
paddusb m3, %1 |

907 |
paddusb m4, %1 |

908 |
paddusb m5, %1 |

909 |
psubusb m2, %2 |

910 |
psubusb m3, %2 |

911 |
psubusb m4, %2 |

912 |
psubusb m5, %2 |

913 |
%4 [r0+%3], m2 |

914 |
%4 [r0+r2+%3], m3 |

915 |
%4 [r1+%3], m4 |

916 |
%4 [r1+r2+%3], m5 |

917 |
%endmacro |

918 | |

919 |
; DC-only inverse transform added to a 4x4 block (MMX).
; Per the prototype documented in this file: r0 = dst, r1 = block,
; r2 = stride (assuming x86inc's cglobal maps argument N to rN).
; Computes dc = (block[0] + 4) >> 3, clears the stored coefficient and adds
; dc to four rows of four pixels with saturation. m0 ends up holding +dc and
; m1 holding -dc as unsigned-saturated bytes, so exactly one of them is
; non-zero and ADD_DC's add-then-subtract applies the signed offset.
INIT_MMX |

920 |
cglobal vp8_idct_dc_add_mmx, 3, 3 |

921 |
; load data |

922 |
movd m0, [r1] |

923 | |

924 |
; calculate DC |

925 |
paddw m0, [pw_4] |

926 |
pxor m1, m1 |

927 |
psraw m0, 3 |

928 |
; write 32 bits of zero over the start of the block (covers block[0])
movd [r1], m1 |

929 |
psubw m1, m0 |

930 |
packuswb m0, m0 |

931 |
packuswb m1, m1 |

932 |
; replicate the low byte into the low four bytes
punpcklbw m0, m0 |

933 |
punpcklbw m1, m1 |

934 |
punpcklwd m0, m0 |

935 |
punpcklwd m1, m1 |

936 | |

937 |
; add DC |

938 |
lea r1, [r0+r2*2] |

939 |
ADD_DC m0, m1, 0, movh |

940 |
RET |

941 | |

942 |
; DC-only inverse transform added to a 4x4 block (SSE4, uses pextrd).
; Per the prototype documented in this file: r0 = dst, r1 = block,
; r2 = stride (assuming x86inc's cglobal maps argument N to rN).
; Computes dc = (block[0] + 4) >> 3, clears the stored coefficient, gathers
; the four 4-pixel rows into two registers, widens them to words, adds dc
; and writes the rows back after an unsigned-saturating pack.
INIT_XMM |

943 |
cglobal vp8_idct_dc_add_sse4, 3, 3, 6 |

944 |
; load data |

945 |
movd m0, [r1] |

946 |
pxor m1, m1 |

947 | |

948 |
; calculate DC |

949 |
paddw m0, [pw_4] |

950 |
; write 32 bits of zero over the start of the block (covers block[0])
movd [r1], m1 |

951 |
; r1 is reused as a pointer to the third row
lea r1, [r0+r2*2] |

952 |
movd m2, [r0] |

953 |
movd m3, [r0+r2] |

954 |
movd m4, [r1] |

955 |
movd m5, [r1+r2] |

956 |
psraw m0, 3 |

957 |
; broadcast the DC word to all eight word lanes
pshuflw m0, m0, 0 |

958 |
punpcklqdq m0, m0 |

959 |
; m2 = rows 0 and 1, m4 = rows 2 and 3, then zero-extend bytes to words
punpckldq m2, m3 |

960 |
punpckldq m4, m5 |

961 |
punpcklbw m2, m1 |

962 |
punpcklbw m4, m1 |

963 |
paddw m2, m0 |

964 |
paddw m4, m0 |

965 |
packuswb m2, m4 |

966 |
; each dword of m2 now holds one output row
movd [r0], m2 |

967 |
pextrd [r0+r2], m2, 1 |

968 |
pextrd [r1], m2, 2 |

969 |
pextrd [r1+r2], m2, 3 |

970 |
RET |

971 | |

972 |
;----------------------------------------------------------------------------- |

973 |
; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride); |

974 |
;----------------------------------------------------------------------------- |

975 | |

976 |
; DC-only inverse transform for four horizontally adjacent 4x4 blocks (MMX).
; Per the prototype documented in this file: r0 = dst, r1 = block,
; r2 = stride (assuming x86inc's cglobal maps argument N to rN).
; The four DC coefficients A..D are 32 bytes apart in the coefficient array.
; Each is turned into (dc + 4) >> 3, cleared in memory, and applied to a
; 16-pixel-wide, 4-row area: blocks A,B through the first ADD_DC (offset 0)
; and blocks C,D through the second (offset 8).
INIT_MMX |

977 |
cglobal vp8_idct_dc_add4y_mmx, 3, 3 |

978 |
; load data |

979 |
movd m0, [r1+32*0] ; A |

980 |
movd m1, [r1+32*2] ; C |

981 |
punpcklwd m0, [r1+32*1] ; A B |

982 |
punpcklwd m1, [r1+32*3] ; C D |

983 |
punpckldq m0, m1 ; A B C D |

984 |
pxor m6, m6 |

985 | |

986 |
; calculate DC |

987 |
paddw m0, [pw_4] |

988 |
movd [r1+32*0], m6 |

989 |
movd [r1+32*1], m6 |

990 |
movd [r1+32*2], m6 |

991 |
movd [r1+32*3], m6 |

992 |
psraw m0, 3 |

993 |
; m6 = -dc; after the unsigned-saturating pack only one of +dc/-dc is
; non-zero per block
psubw m6, m0 |

994 |
packuswb m0, m0 |

995 |
packuswb m6, m6 |

996 |
punpcklbw m0, m0 ; AABBCCDD |

997 |
punpcklbw m6, m6 ; AABBCCDD |

998 |
movq m1, m0 |

999 |
movq m7, m6 |

1000 |
punpcklbw m0, m0 ; AAAABBBB |

1001 |
punpckhbw m1, m1 ; CCCCDDDD |

1002 |
punpcklbw m6, m6 ; AAAABBBB |

1003 |
punpckhbw m7, m7 ; CCCCDDDD |

1004 | |

1005 |
; add DC |

1006 |
lea r1, [r0+r2*2] |

1007 |
ADD_DC m0, m6, 0, mova |

1008 |
ADD_DC m1, m7, 8, mova |

1009 |
RET |

1010 | |

1011 |
; DC-only inverse transform for four horizontally adjacent 4x4 blocks (SSE2).
; Per the prototype documented in this file: r0 = dst, r1 = block,
; r2 = stride (assuming x86inc's cglobal maps argument N to rN).
; The four DC coefficients A..D are 32 bytes apart in the coefficient array.
; Each is turned into (dc + 4) >> 3, cleared in memory, replicated to four
; bytes and applied to a 16-pixel-wide, 4-row area with a single ADD_DC
; using full-register mova accesses.
INIT_XMM |

1012 |
cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6 |

1013 |
; load data |

1014 |
movd m0, [r1+32*0] ; A |

1015 |
movd m1, [r1+32*2] ; C |

1016 |
punpcklwd m0, [r1+32*1] ; A B |

1017 |
punpcklwd m1, [r1+32*3] ; C D |

1018 |
punpckldq m0, m1 ; A B C D |

1019 |
pxor m1, m1 |

1020 | |

1021 |
; calculate DC |

1022 |
paddw m0, [pw_4] |

1023 |
movd [r1+32*0], m1 |

1024 |
movd [r1+32*1], m1 |

1025 |
movd [r1+32*2], m1 |

1026 |
movd [r1+32*3], m1 |

1027 |
psraw m0, 3 |

1028 |
; m1 = -dc; after the unsigned-saturating pack only one of +dc/-dc is
; non-zero per block
psubw m1, m0 |

1029 |
packuswb m0, m0 |

1030 |
packuswb m1, m1 |

1031 |
; two byte-interleaves replicate each block's DC byte four times
punpcklbw m0, m0 |

1032 |
punpcklbw m1, m1 |

1033 |
punpcklbw m0, m0 |

1034 |
punpcklbw m1, m1 |

1035 | |

1036 |
; add DC |

1037 |
lea r1, [r0+r2*2] |

1038 |
ADD_DC m0, m1, 0, mova |

1039 |
RET |

1040 | |

1041 |
;----------------------------------------------------------------------------- |

1042 |
; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride); |

1043 |
;----------------------------------------------------------------------------- |

1044 | |

1045 |
; DC-only inverse transform for four 4x4 blocks arranged 2x2 (MMX).
; Per the prototype documented in this file: r0 = dst, r1 = block,
; r2 = stride (assuming x86inc's cglobal maps argument N to rN).
; The four DC coefficients A..D are 32 bytes apart in the coefficient array.
; Each is turned into (dc + 4) >> 3 and cleared in memory. Blocks A,B are
; applied to the first four rows (8 pixels wide); the row pointers are then
; advanced by four rows and blocks C,D are applied below them.
INIT_MMX |

1046 |
cglobal vp8_idct_dc_add4uv_mmx, 3, 3 |

1047 |
; load data |

1048 |
movd m0, [r1+32*0] ; A |

1049 |
movd m1, [r1+32*2] ; C |

1050 |
punpcklwd m0, [r1+32*1] ; A B |

1051 |
punpcklwd m1, [r1+32*3] ; C D |

1052 |
punpckldq m0, m1 ; A B C D |

1053 |
pxor m6, m6 |

1054 | |

1055 |
; calculate DC |

1056 |
paddw m0, [pw_4] |

1057 |
movd [r1+32*0], m6 |

1058 |
movd [r1+32*1], m6 |

1059 |
movd [r1+32*2], m6 |

1060 |
movd [r1+32*3], m6 |

1061 |
psraw m0, 3 |

1062 |
; m6 = -dc; after the unsigned-saturating pack only one of +dc/-dc is
; non-zero per block
psubw m6, m0 |

1063 |
packuswb m0, m0 |

1064 |
packuswb m6, m6 |

1065 |
punpcklbw m0, m0 ; AABBCCDD |

1066 |
punpcklbw m6, m6 ; AABBCCDD |

1067 |
movq m1, m0 |

1068 |
movq m7, m6 |

1069 |
punpcklbw m0, m0 ; AAAABBBB |

1070 |
punpckhbw m1, m1 ; CCCCDDDD |

1071 |
punpcklbw m6, m6 ; AAAABBBB |

1072 |
punpckhbw m7, m7 ; CCCCDDDD |

1073 | |

1074 |
; add DC |

1075 |
lea r1, [r0+r2*2] |

1076 |
ADD_DC m0, m6, 0, mova |

1077 |
; move both row pointers down by four rows for blocks C and D
lea r0, [r0+r2*4] |

1078 |
lea r1, [r1+r2*4] |

1079 |
ADD_DC m1, m7, 0, mova |

1080 |
RET |

1081 | |

1082 |
;----------------------------------------------------------------------------- |

1083 |
; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |

1084 |
;----------------------------------------------------------------------------- |

1085 | |

1086 |
; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2) |

1087 |
; this macro assumes that m6/m7 have words for 20091/17734 loaded |

1088 |
; VP8_MULTIPLY_SUMSUB a, b, tmp1, tmp2
;   %1, %2 = inputs, overwritten with the two results
;   %3, %4 = scratch registers (clobbered)
; The 20091 multiply is done as pmulhw by m6 plus the input itself; the
; 35468 multiply is done by doubling the input and then pmulhw by m7
; (17734 = 35468 / 2).
%macro VP8_MULTIPLY_SUMSUB 4 |

1089 |
mova %3, %1 |

1090 |
mova %4, %2 |

1091 |
pmulhw %3, m6 ;20091(1) |

1092 |
pmulhw %4, m6 ;20091(2) |

1093 |
paddw %3, %1 |

1094 |
paddw %4, %2 |

1095 |
paddw %1, %1 |

1096 |
paddw %2, %2 |

1097 |
pmulhw %1, m7 ;35468(1) |

1098 |
pmulhw %2, m7 ;35468(2) |

1099 |
psubw %1, %4 |

1100 |
paddw %2, %3 |

1101 |
%endmacro |

1102 | |

1103 |
; calculate x0=%1+%3; x1=%1-%3 |

1104 |
; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4) |

1105 |
; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3) |

1106 |
; %5/%6 are temporary registers |

1107 |
; we assume m6/m7 have constant words 20091/17734 loaded in them |

1108 |
; VP8_IDCT_TRANSFORM4x4_1D r0, r1, r2, r3, tmp0, tmp1
; All arguments are register numbers (used as m%N). %1..%4 hold the four
; input vectors and receive the outputs; %5/%6 are scratch. SUMSUB_BA is
; provided outside this file (presumably x86util.asm - confirm its operand
; semantics there). The trailing SWAPs rename registers so the outputs end
; up in %1..%4 in order.
%macro VP8_IDCT_TRANSFORM4x4_1D 6 |

1109 |
SUMSUB_BA m%3, m%1, m%5 ;t0, t1 |

1110 |
VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3 |

1111 |
SUMSUB_BA m%4, m%3, m%5 ;tmp0, tmp3 |

1112 |
SUMSUB_BA m%2, m%1, m%5 ;tmp1, tmp2 |

1113 |
SWAP %4, %1 |

1114 |
SWAP %4, %3 |

1115 |
%endmacro |

1116 | |

1117 |
; VP8_IDCT_ADD opt
; Emits vp8_idct_add_<opt>: full 4x4 inverse transform of `block`, added to
; the pixels at `dst`. Per the prototype documented in this file: r0 = dst,
; r1 = block, r2 = stride (assuming x86inc's cglobal maps argument N to rN).
; The coefficient block is cleared right after it has been loaded. The only
; difference between the variants is how it is cleared: the `sse` variant
; uses two 16-byte movaps stores, the other four 8-byte movq stores.
INIT_MMX |

1118 |
%macro VP8_IDCT_ADD 1 |

1119 |
cglobal vp8_idct_add_%1, 3, 3 |

1120 |
; load block data |

1121 |
movq m0, [r1+ 0] |

1122 |
movq m1, [r1+ 8] |

1123 |
movq m2, [r1+16] |

1124 |
movq m3, [r1+24] |

1125 |
; multiplier constants expected in m6/m7 by VP8_MULTIPLY_SUMSUB
movq m6, [pw_20091] |

1126 |
movq m7, [pw_17734] |

1127 |
%ifidn %1, sse |

1128 |
xorps xmm0, xmm0 |

1129 |
movaps [r1+ 0], xmm0 |

1130 |
movaps [r1+16], xmm0 |

1131 |
%else |

1132 |
pxor m4, m4 |

1133 |
movq [r1+ 0], m4 |

1134 |
movq [r1+ 8], m4 |

1135 |
movq [r1+16], m4 |

1136 |
movq [r1+24], m4 |

1137 |
%endif |

1138 | |

1139 |
; actual IDCT |

1140 |
VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 |

1141 |
TRANSPOSE4x4W 0, 1, 2, 3, 4 |

1142 |
; rounding term for the final >> 3, added to the first vector before the
; second pass
paddw m0, [pw_4] |

1143 |
VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 |

1144 |
TRANSPOSE4x4W 0, 1, 2, 3, 4 |

1145 | |

1146 |
; store |

1147 |
pxor m4, m4 |

1148 |
lea r1, [r0+2*r2] |

1149 |
STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2 |

1150 |
STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2 |

1151 | |

1152 |
RET |

1153 |
%endmacro |

1154 | |

1155 |
VP8_IDCT_ADD mmx |

1156 |
VP8_IDCT_ADD sse |

1157 | |

1158 |
;----------------------------------------------------------------------------- |

1159 |
; void vp8_luma_dc_wht_<opt>(DCTELEM block[4][4][16], DCTELEM dc[16]) |

1160 |
;----------------------------------------------------------------------------- |

1161 | |

1162 |
; SCATTER_WHT a, b, base: store the four words of m%1 and the four words of
; m%2 to eight separate locations addressed from r0.
; word k of m%1 goes to byte offset 2*16*(4*k+%3),
; word k of m%2 goes to byte offset 2*16*(4*k+1+%3);
; i.e. consecutive indexes are 32 bytes apart (presumably the first
; coefficient of each 16-word block of block[4][4][16] - confirm against the
; caller's layout).
; clobbers r1d and r2d; m%1 and m%2 are shifted right by 32 bits.
%macro SCATTER_WHT 3 |

1163 |
; words 0 (low halves of r1d/r2d)
movd r1d, m%1 |

1164 |
movd r2d, m%2 |

1165 |
mov [r0+2*16*(0+%3)], r1w |

1166 |
mov [r0+2*16*(1+%3)], r2w |

1167 |
; words 1 (bring the upper halves down)
shr r1d, 16 |

1168 |
shr r2d, 16 |

1169 |
; move the high dwords of the vector registers down for words 2 and 3
psrlq m%1, 32 |

1170 |
psrlq m%2, 32 |

1171 |
mov [r0+2*16*(4+%3)], r1w |

1172 |
mov [r0+2*16*(5+%3)], r2w |

1173 |
; words 2
movd r1d, m%1 |

1174 |
movd r2d, m%2 |

1175 |
mov [r0+2*16*(8+%3)], r1w |

1176 |
mov [r0+2*16*(9+%3)], r2w |

1177 |
; words 3
shr r1d, 16 |

1178 |
shr r2d, 16 |

1179 |
mov [r0+2*16*(12+%3)], r1w |

1180 |
mov [r0+2*16*(13+%3)], r2w |

1181 |
%endmacro |

1182 | |

1183 |
; HADAMARD4_1D a, b, c, d: one 1-D pass over registers m%1-m%4, built from two
; SUMSUB_BADC steps; used twice (with a transpose in between) by VP8_DC_WHT.
; arguments are register indexes. the trailing SWAP presumably renames the
; registers so the outputs are back in %1..%4 order (SUMSUB_BADC and SWAP are
; defined elsewhere - confirm).
%macro HADAMARD4_1D 4 |

1184 |
SUMSUB_BADC m%2, m%1, m%4, m%3 |

1185 |
SUMSUB_BADC m%4, m%2, m%3, m%1 |

1186 |
SWAP %1, %4, %3 |

1187 |
%endmacro |

1188 | |

1189 |
; VP8_DC_WHT <opt>: emits vp8_luma_dc_wht_<opt>(block, dc).
; r0 = block (destination, written through SCATTER_WHT),
; r1 = dc (16 input words, read then zeroed); r2 is used as scratch by
; SCATTER_WHT, which is why three registers are requested for two arguments.
; <opt> only selects how dc[] is cleared: "sse" uses two 16-byte xmm stores,
; anything else uses four 8-byte MMX stores.
%macro VP8_DC_WHT 1 |

1190 |
cglobal vp8_luma_dc_wht_%1, 2,3 |

1191 |
; load the 4x4 word input
movq m0, [r1] |

1192 |
movq m1, [r1+8] |

1193 |
movq m2, [r1+16] |

1194 |
movq m3, [r1+24] |

1195 |
; zero dc[] in memory now that it has been loaded
%ifidn %1, sse |

1196 |
xorps xmm0, xmm0 |

1197 |
; movaps is an aligned store: dc must be 16-byte aligned for this variant
movaps [r1+ 0], xmm0 |

1198 |
movaps [r1+16], xmm0 |

1199 |
%else |

1200 |
pxor m4, m4 |

1201 |
movq [r1+ 0], m4 |

1202 |
movq [r1+ 8], m4 |

1203 |
movq [r1+16], m4 |

1204 |
movq [r1+24], m4 |

1205 |
%endif |

1206 |
; first pass, transpose, second pass
HADAMARD4_1D 0, 1, 2, 3 |

1207 |
TRANSPOSE4x4W 0, 1, 2, 3, 4 |

1208 |
; pw_3 is added to m0 only, before the second pass and the >>3 below
; (presumably the rounding term, which the second pass spreads to all outputs)
paddw m0, [pw_3] |

1209 |
HADAMARD4_1D 0, 1, 2, 3 |

1210 |
; arithmetic shift of all 16 results by 3
psraw m0, 3 |

1211 |
psraw m1, 3 |

1212 |
psraw m2, 3 |

1213 |
psraw m3, 3 |

1214 |
; scatter m0/m1 starting at index 0 and m2/m3 starting at index 2
SCATTER_WHT 0, 1, 0 |

1215 |
SCATTER_WHT 2, 3, 2 |

1216 |
RET |

1217 |
%endmacro |

1218 | |

1219 |
INIT_MMX |

1220 |
VP8_DC_WHT mmx |

1221 |
VP8_DC_WHT sse |

1222 | |

1223 |
;----------------------------------------------------------------------------- |

1224 |
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim); |

1225 |
;----------------------------------------------------------------------------- |

1226 | |

1227 |
; macro called with 7 mm register indexes as argument, and 4 regular registers |

1228 |
; |

1229 |
; first 4 mm registers will carry the transposed pixel data |

1230 |
; the other three are scratchspace (one would be sufficient, but this allows |

1231 |
; for more spreading/pipelining and thus faster execution on OOE CPUs) |

1232 |
; |

1233 |
; first two regular registers are buf+4*stride and buf+5*stride |

1234 |
; third is -stride, fourth is +stride |

1235 |
; READ_8x4_INTERLEAVED: load 4 bytes from each of 8 rows and byte-interleave
; them pairwise.
; %1-%4 : mm register indexes receiving A/B, C/D, E/F and G/H
; %5-%7 : mm register indexes used as scratch (clobbered)
; %8/%9 : row pointers, %9 being one row below %8
; %10   : -stride, %11: +stride
%macro READ_8x4_INTERLEAVED 11 |

1236 |
; interleave 8 (A-H) rows of 4 pixels each |

1237 |
movd m%1, [%8+%10*4] ; A0-3 |

1238 |
movd m%5, [%9+%10*4] ; B0-3 |

1239 |
movd m%2, [%8+%10*2] ; C0-3 |

1240 |
movd m%6, [%8+%10] ; D0-3 |

1241 |
movd m%3, [%8] ; E0-3 |

1242 |
movd m%7, [%9] ; F0-3 |

1243 |
movd m%4, [%9+%11] ; G0-3 |

1244 |
punpcklbw m%1, m%5 ; A/B interleaved |

1245 |
; m%5 is free again after the A/B interleave, so it is reused for row H
movd m%5, [%9+%11*2] ; H0-3 |

1246 |
punpcklbw m%2, m%6 ; C/D interleaved |

1247 |
punpcklbw m%3, m%7 ; E/F interleaved |

1248 |
punpcklbw m%4, m%5 ; G/H interleaved |

1249 |
%endmacro |

1250 | |

1251 |
; macro called with 7 mm register indexes as argument, and 5 regular registers |

1252 |
; first 11 mean the same as READ_8x4_INTERLEAVED above |

1253 |
; fifth regular register is scratchspace to reach the bottom 8 rows, it |

1254 |
; will be set to second regular register + 8*stride at the end |

1255 |
; READ_16x4_INTERLEAVED: load 4 bytes from each of 16 rows (A-P) and
; byte-interleave them into four xmm registers.
;   %1-%4 : register indexes receiving A/B/I/J, C/D/K/L, E/F/M/N and G/H/O/P
;   %5-%7 : register indexes used as scratch (clobbered)
;   %8/%9 : row pointers for the top 8 rows, %9 being one row below %8
;   %10   : -stride, %11: +stride
;   %12   : scratch pointer used to reach the bottom 8 rows; on exit it
;           holds %9 + 8*stride
%macro READ_16x4_INTERLEAVED 12
    ; %12 = %8 + 8*stride. Built from the macro's own parameters rather than
    ; from hard-coded r0/r2, so it no longer depends on which registers the
    ; caller passes as %8 and %11.
    lea             %12, [%8+8*%11]

    ; read (and interleave) those addressable by %8 and %12: A/C/D/E/I/K/L/M
    movd            m%1, [%8+%10*4]     ; A0-3
    movd            m%3, [%12+%10*4]    ; I0-3
    movd            m%2, [%8+%10*2]     ; C0-3
    movd            m%4, [%12+%10*2]    ; K0-3
    movd            m%6, [%8+%10]       ; D0-3
    movd            m%5, [%12+%10]      ; L0-3
    movd            m%7, [%12]          ; M0-3
    add             %12, %11            ; %12 now pairs with %9 (= %9 + 8*stride)
    punpcklbw       m%1, m%3            ; A/I
    movd            m%3, [%8]           ; E0-3
    punpcklbw       m%2, m%4            ; C/K
    punpcklbw       m%6, m%5            ; D/L
    punpcklbw       m%3, m%7            ; E/M
    punpcklbw       m%2, m%6            ; C/D/K/L interleaved

    ; read (and interleave) those addressable by %9 and %12: B/F/G/H/J/N/O/P
    movd            m%5, [%9+%10*4]     ; B0-3
    movd            m%4, [%12+%10*4]    ; J0-3
    movd            m%7, [%9]           ; F0-3
    movd            m%6, [%12]          ; N0-3
    punpcklbw       m%5, m%4            ; B/J
    punpcklbw       m%7, m%6            ; F/N
    punpcklbw       m%1, m%5            ; A/B/I/J interleaved
    punpcklbw       m%3, m%7            ; E/F/M/N interleaved
    movd            m%4, [%9+%11]       ; G0-3
    movd            m%6, [%12+%11]      ; O0-3
    movd            m%5, [%9+%11*2]     ; H0-3
    movd            m%7, [%12+%11*2]    ; P0-3
    punpcklbw       m%4, m%6            ; G/O
    punpcklbw       m%5, m%7            ; H/P
    punpcklbw       m%4, m%5            ; G/H/O/P interleaved
%endmacro

1292 | |

1293 |
; write 4 mm registers of 2 dwords each |

1294 |
; first four arguments are mm register indexes containing source data |

1295 |
; last four are registers containing buf+4*stride, buf+5*stride, |

1296 |
; -stride and +stride |

1297 |
; WRITE_4x2D: store the two dwords of each of four mm registers to eight rows.
; %1-%4 : mm register indexes holding the data
; %5/%6 : row pointers, %6 being one row below %5
; %7    : -stride, %8: +stride
; counting rows from %5-4*stride as row 0, the low dwords of m%1..m%4 go to
; rows 0, 2, 4, 6 and the high dwords to rows 1, 3, 5, 7.
; the source registers are modified (punpckhdq).
%macro WRITE_4x2D 8 |

1298 |
; write out (2 dwords per register) |

1299 |
movd [%5+%7*4], m%1 |

1300 |
movd [%5+%7*2], m%2 |

1301 |
movd [%5], m%3 |

1302 |
movd [%6+%8], m%4 |

1303 |
; bring each register's high dword down into the low dword
punpckhdq m%1, m%1 |

1304 |
punpckhdq m%2, m%2 |

1305 |
punpckhdq m%3, m%3 |

1306 |
punpckhdq m%4, m%4 |

1307 |
movd [%6+%7*4], m%1 |

1308 |
movd [%5+%7], m%2 |

1309 |
movd [%6], m%3 |

1310 |
movd [%6+%8*2], m%4 |

1311 |
%endmacro |

1312 | |

1313 |
; write 4 xmm registers of 4 dwords each |

1314 |
; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular |

1315 |
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride |

1316 |
; we add 1*stride to the third regular register in the process |

1317 |
; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the |

1318 |
; same memory region), or 8 if they cover two separate buffers (third one points to |

1319 |
; a different memory region than the first two), allowing for more optimal code for |

1320 |
; the 16-width case |

1321 |
%macro WRITE_4x4D 10 |

1322 |
; write out (4 dwords per register), start with dwords zero |

1323 |
movd [%5+%8*4], m%1 |

1324 |
movd [%5], m%2 |

1325 |
movd [%7+%8*4], m%3 |

1326 |
movd [%7], m%4 |

1327 | |

1328 |
; store dwords 1 |

1329 |
psrldq m%1, 4 |

1330 |
psrldq m%2, 4 |

1331 |
psrldq m%3, 4 |

1332 |
psrldq m%4, 4 |

1333 |
movd [%6+%8*4], m%1 |

1334 |
movd [%6], m%2 |

1335 |
%if %10 == 16 |

1336 |
movd [%6+%9*4], m%3 |

1337 |
%endif |

1338 |
movd [%7+%9], m%4 |

1339 | |

1340 |
; write dwords 2 |

1341 |
psrldq m%1, 4 |

1342 |
psrldq m%2, 4 |

1343 |
%if %10 == 8 |

1344 |
movd [%5+%8*2], m%1 |

1345 |
movd %5d, m%3 |

1346 |
%endif |

1347 |
psrldq m%3, 4 |

1348 |
psrldq m%4, 4 |

1349 |
%if %10 == 16 |

1350 |
movd [%5+%8*2], m%1 |

1351 |
%endif |

1352 |
movd [%6+%9], m%2 |

1353 |
movd [%7+%8*2], m%3 |

1354 |
movd [%7+%9*2], m%4 |

1355 |
add %7, %9 |

1356 | |

1357 |
; store dwords 3 |

1358 |
psrldq m%1, 4 |

1359 |
psrldq m%2, 4 |

1360 |
psrldq m%3, 4 |

1361 |
psrldq m%4, 4 |

1362 |
%if %10 == 8 |

1363 |
mov [%7+%8*4], %5d |

1364 |
movd [%6+%8*2], m%1 |

1365 |
%else |

1366 |
movd [%5+%8], m%1 |

1367 |
%endif |

1368 |
movd [%6+%9*2], m%2 |

1369 |
movd [%7+%8*2], m%3 |

1370 |
movd [%7+%9*2], m%4 |

1371 |
%endmacro |

1372 | |

1373 |
; write 4 or 8 words in the mmx/xmm registers as 8 lines |

1374 |
; 1 and 2 are the registers to write, this can be the same (for SSE2) |

1375 |
; for pre-SSE4: |

1376 |
; 3 is a general-purpose register that we will clobber |

1377 |
; for SSE4: |

1378 |
; 3 is a pointer to the destination's 5th line |

1379 |
; 4 is a pointer to the destination's 4th line |

1380 |
; 5/6 is -stride and +stride |

1381 |
; WRITE_2x4W: store the 4 words of %1 followed by the 4 words of %2 as one
; word per row over 8 consecutive rows.
; %1/%2 : registers to write (modified by punpckhdq)
; %3    : general-purpose scratch register (clobbered)
; %4    : row pointer; rows written are %4-4*stride .. %4+3*stride
; %5    : -stride, %6: +stride
; %4 is adjusted while writing but the adds cancel out (+%6, +%5, +%6, +%5),
; so it holds its entry value again on exit.
%macro WRITE_2x4W 6 |

1382 |
; words 0 and 1 of %1 -> rows -4 and -3
movd %3d, %1 |

1383 |
punpckhdq %1, %1 |

1384 |
mov [%4+%5*4], %3w |

1385 |
shr %3, 16 |

1386 |
add %4, %6 |

1387 |
mov [%4+%5*4], %3w |

1388 | |

1389 |
; words 2 and 3 of %1 -> rows -2 and -1
movd %3d, %1 |

1390 |
add %4, %5 |

1391 |
mov [%4+%5*2], %3w |

1392 |
shr %3, 16 |

1393 |
mov [%4+%5 ], %3w |

1394 | |

1395 |
; words 0 and 1 of %2 -> rows 0 and +1
movd %3d, %2 |

1396 |
punpckhdq %2, %2 |

1397 |
mov [%4 ], %3w |

1398 |
shr %3, 16 |

1399 |
mov [%4+%6 ], %3w |

1400 | |

1401 |
; words 2 and 3 of %2 -> rows +2 and +3
movd %3d, %2 |

1402 |
add %4, %6 |

1403 |
mov [%4+%6 ], %3w |

1404 |
shr %3, 16 |

1405 |
mov [%4+%6*2], %3w |

1406 |
; undo the last +stride so %4 is back at its entry value
add %4, %5 |

1407 |
%endmacro |

1408 | |

1409 |
; WRITE_8W_SSE2: store the 8 words of an xmm register as one word per row over
; 8 consecutive rows, going through a general-purpose register.
; %1 : xmm register to write (shifted out with psrldq, so its contents are lost)
; %2 : general-purpose scratch register (clobbered)
; %3 : row pointer; rows written are %3-4*stride .. %3+3*stride relative to
;      its entry value
; %4 : -stride, %5: +stride
; note: %3 is NOT restored - the adds (+%5, +%4, +%5) leave it one row
; (+stride) past its entry value.
%macro WRITE_8W_SSE2 5 |

1410 |
; words 0 and 1 -> rows -4 and -3
movd %2d, %1 |

1411 |
psrldq %1, 4 |

1412 |
mov [%3+%4*4], %2w |

1413 |
shr %2, 16 |

1414 |
add %3, %5 |

1415 |
mov [%3+%4*4], %2w |

1416 | |

1417 |
; words 2 and 3 -> rows -2 and -1
movd %2d, %1 |

1418 |
psrldq %1, 4 |

1419 |
add %3, %4 |

1420 |
mov [%3+%4*2], %2w |

1421 |
shr %2, 16 |

1422 |
mov [%3+%4 ], %2w |

1423 | |

1424 |
; words 4 and 5 -> rows 0 and +1
movd %2d, %1 |

1425 |
psrldq %1, 4 |

1426 |
mov [%3 ], %2w |

1427 |
shr %2, 16 |

1428 |
mov [%3+%5 ], %2w |

1429 | |

1430 |
; words 6 and 7 -> rows +2 and +3
movd %2d, %1 |

1431 |
add %3, %5 |

1432 |
mov [%3+%5 ], %2w |

1433 |
shr %2, 16 |

1434 |
mov [%3+%5*2], %2w |

1435 |
%endmacro |

1436 | |

1437 |
; WRITE_8W_SSE4: store the 8 words of an xmm register as one word per row,
; using pextrw with a memory destination (no scratch register needed).
; %1 : xmm register to write (left unmodified)
; %2 : pointer used for words 1, 5, 6 and 7
; %3 : pointer used for words 0, 2, 3 and 4
; %4 : -stride, %5: +stride
; the addressing covers 8 consecutive rows only if %2 = %3 + stride, which is
; how SIMPLE_LOOPFILTER sets the two pointers up. neither pointer is changed.
%macro WRITE_8W_SSE4 5 |

1438 |
pextrw [%3+%4*4], %1, 0 |

1439 |
pextrw [%2+%4*4], %1, 1 |

1440 |
pextrw [%3+%4*2], %1, 2 |

1441 |
pextrw [%3+%4 ], %1, 3 |

1442 |
pextrw [%3 ], %1, 4 |

1443 |
pextrw [%2 ], %1, 5 |

1444 |
pextrw [%2+%5 ], %1, 6 |

1445 |
pextrw [%2+%5*2], %1, 7 |

1446 |
%endmacro |

1447 | |

1448 |
; SPLATB_REG_MMX dst, src[, unused]: broadcast the low byte of general-purpose
; register %2 into all 8 bytes of mm register %1.
; the optional third argument is accepted but not used, so every SPLATB_REG
; variant can be invoked with the same argument list.
%macro SPLATB_REG_MMX 2-3 |

1449 |
movd %1, %2d |

1450 |
; each unpack doubles the run of the byte: 1 -> 2 -> 4 -> 8 copies
punpcklbw %1, %1 |

1451 |
punpcklwd %1, %1 |

1452 |
punpckldq %1, %1 |

1453 |
%endmacro |

1454 | |

1455 |
; SPLATB_REG_MMXEXT dst, src[, unused]: broadcast the low byte of
; general-purpose register %2 into all 8 bytes of mm register %1.
; the optional third argument is accepted but not used.
%macro SPLATB_REG_MMXEXT 2-3 |

1456 |
movd %1, %2d |

1457 |
; duplicate the byte into the low word, then copy word 0 to all four words
punpcklbw %1, %1 |

1458 |
pshufw %1, %1, 0x0 |

1459 |
%endmacro |

1460 | |

1461 |
; SPLATB_REG_SSE2 dst, src[, unused]: broadcast the low byte of
; general-purpose register %2 into all 16 bytes of xmm register %1.
; the optional third argument is accepted but not used.
%macro SPLATB_REG_SSE2 2-3 |

1462 |
movd %1, %2d |

1463 |
; duplicate the byte into the low word
punpcklbw %1, %1 |

1464 |
; copy word 0 to the four low words
pshuflw %1, %1, 0x0 |

1465 |
; copy the low qword to the high qword
punpcklqdq %1, %1 |

1466 |
%endmacro |

1467 | |

1468 |
; SPLATB_REG_SSSE3 dst, src, mask: broadcast the low byte of general-purpose
; register %2 into xmm register %1 with a single pshufb.
; %3 is the pshufb shuffle mask and must be all zero so that every
; destination byte selects source byte 0; the callers in this file clear
; the mask register with pxor before invoking SPLATB_REG.
%macro SPLATB_REG_SSSE3 3 |

1469 |
movd %1, %2d |

1470 |
pshufb %1, %3 |

1471 |
%endmacro |

1472 | |

1473 |
%macro SIMPLE_LOOPFILTER 4 |

1474 |
cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4 |

1475 |
%if mmsize == 8 ; mmx/mmxext |

1476 |
mov r3, 2 |

1477 |
%endif |

1478 |
%ifnidn %1, sse2 |

1479 |
%if mmsize == 16 |

1480 |
pxor m0, m0 |

1481 |
%endif |

1482 |
%endif |

1483 |
SPLATB_REG m7, r2, m0 ; splat "flim" into register |

1484 | |

1485 |
; set up indexes to address 4 rows |

1486 |
mov r2, r1 |

1487 |
neg r1 |

1488 |
%ifidn %2, h |

1489 |
lea r0, [r0+4*r2-2] |

1490 |
%endif |

1491 | |

1492 |
%if mmsize == 8 ; mmx / mmxext |

1493 |
.next8px |

1494 |
%endif |

1495 |
%ifidn %2, v |

1496 |
; read 4 half/full rows of pixels |

1497 |
mova m0, [r0+r1*2] ; p1 |

1498 |
mova m1, [r0+r1] ; p0 |

1499 |
mova m2, [r0] ; q0 |

1500 |
mova m3, [r0+r2] ; q1 |

1501 |
%else ; h |

1502 |
lea r4, [r0+r2] |

1503 | |

1504 |
%if mmsize == 8 ; mmx/mmxext |

1505 |
READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2 |

1506 |
%else ; sse2 |

1507 |
READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3 |

1508 |
%endif |

1509 |
TRANSPOSE4x4W 0, 1, 2, 3, 4 |

1510 |
%endif |

1511 | |

1512 |
; simple_limit |

1513 |
mova m5, m2 ; m5=backup of q0 |

1514 |
mova m6, m1 ; m6=backup of p0 |

1515 |
psubusb m1, m2 ; p0-q0 |

1516 |
psubusb m2, m6 ; q0-p0 |

1517 |
por m1, m2 ; FFABS(p0-q0) |

1518 |
paddusb m1, m1 ; m1=FFABS(p0-q0)*2 |

1519 | |

1520 |
mova m4, m3 |

1521 |
mova m2, m0 |

1522 |
psubusb m3, m0 ; q1-p1 |

1523 |
psubusb m0, m4 ; p1-q1 |

1524 |
por m3, m0 ; FFABS(p1-q1) |

1525 |
mova m0, [pb_80] |

1526 |
pxor m2, m0 |

1527 |
pxor m4, m0 |

1528 |
psubsb m2, m4 ; m2=p1-q1 (signed) backup for below |

1529 |
pand m3, [pb_FE] |

1530 |
psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed |

1531 |
paddusb m3, m1 |

1532 |
psubusb m3, m7 |

1533 |
pxor m1, m1 |

1534 |
pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0) |

1535 | |

1536 |
; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask) |

1537 |
mova m4, m5 |

1538 |
pxor m5, m0 |

1539 |
pxor m0, m6 |

1540 |
psubsb m5, m0 ; q0-p0 (signed) |

1541 |
paddsb m2, m5 |

1542 |
paddsb m2, m5 |

1543 |
paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0) |

1544 |
pand m2, m3 ; apply filter mask (m3) |

1545 | |

1546 |
mova m3, [pb_F8] |

1547 |
mova m1, m2 |

1548 |
paddsb m2, [pb_4] ; f1<<3=a+4 |

1549 |
paddsb m1, [pb_3] ; f2<<3=a+3 |

1550 |
pand m2, m3 |

1551 |
pand m1, m3 ; cache f2<<3 |

1552 | |

1553 |
pxor m0, m0 |

1554 |
pxor m3, m3 |

1555 |
pcmpgtb m0, m2 ; which values are <0? |

1556 |
psubb m3, m2 ; -f1<<3 |

1557 |
psrlq m2, 3 ; +f1 |

1558 |
psrlq m3, 3 ; -f1 |

1559 |
pand m3, m0 |

1560 |
pandn m0, m2 |

1561 |
psubusb m4, m0 |

1562 |
paddusb m4, m3 ; q0-f1 |

1563 | |

1564 |
pxor m0, m0 |

1565 |
pxor m3, m3 |

1566 |
pcmpgtb m0, m1 ; which values are <0? |

1567 |
psubb m3, m1 ; -f2<<3 |

1568 |
psrlq m1, 3 ; +f2 |

1569 |
psrlq m3, 3 ; -f2 |

1570 |
pand m3, m0 |

1571 |
pandn m0, m1 |

1572 |
paddusb m6, m0 |

1573 |
psubusb m6, m3 ; p0+f2 |

1574 | |

1575 |
; store |

1576 |
%ifidn %2, v |

1577 |
mova [r0], m4 |

1578 |
mova [r0+r1], m6 |

1579 |
%else ; h |

1580 |
inc r0 |

1581 |
SBUTTERFLY bw, 6, 4, 0 |

1582 | |

1583 |
%if mmsize == 16 ; sse2 |

1584 |
%ifidn %1, sse4 |

1585 |
inc r4 |

1586 |
%endif |

1587 |
WRITE_8W m6, r4, r0, r1, r2 |

1588 |
lea r4, [r3+r1+1] |

1589 |
%ifidn %1, sse4 |

1590 |
inc r3 |

1591 |
%endif |

1592 |
WRITE_8W m4, r3, r4, r1, r2 |

1593 |
%else ; mmx/mmxext |

1594 |
WRITE_2x4W m6, m4, r4, r0, r1, r2 |

1595 |
%endif |

1596 |
%endif |

1597 | |

1598 |
%if mmsize == 8 ; mmx/mmxext |

1599 |
; next 8 pixels |

1600 |
%ifidn %2, v |

1601 |
add r0, 8 ; advance 8 cols = pixels |

1602 |
%else ; h |

1603 |
lea r0, [r0+r2*8-1] ; advance 8 rows = lines |

1604 |
%endif |

1605 |
dec r3 |

1606 |
jg .next8px |

1607 |
REP_RET |

1608 |
%else ; sse2 |

1609 |
RET |

1610 |
%endif |

1611 |
%endmacro |

1612 | |

1613 |
INIT_MMX |

1614 |
%define SPLATB_REG SPLATB_REG_MMX |

1615 |
SIMPLE_LOOPFILTER mmx, v, 4, 0 |

1616 |
SIMPLE_LOOPFILTER mmx, h, 5, 0 |

1617 |
%define SPLATB_REG SPLATB_REG_MMXEXT |

1618 |
SIMPLE_LOOPFILTER mmxext, v, 4, 0 |

1619 |
SIMPLE_LOOPFILTER mmxext, h, 5, 0 |

1620 |
INIT_XMM |

1621 |
%define SPLATB_REG SPLATB_REG_SSE2 |

1622 |
%define WRITE_8W WRITE_8W_SSE2 |

1623 |
SIMPLE_LOOPFILTER sse2, v, 3, 8 |

1624 |
SIMPLE_LOOPFILTER sse2, h, 5, 8 |

1625 |
%define SPLATB_REG SPLATB_REG_SSSE3 |

1626 |
SIMPLE_LOOPFILTER ssse3, v, 3, 8 |

1627 |
SIMPLE_LOOPFILTER ssse3, h, 5, 8 |

1628 |
%define WRITE_8W WRITE_8W_SSE4 |

1629 |
SIMPLE_LOOPFILTER sse4, h, 5, 8 |

1630 | |

1631 |
;----------------------------------------------------------------------------- |

1632 |
; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride, |

1633 |
; int flimE, int flimI, int hev_thr); |

1634 |
;----------------------------------------------------------------------------- |

1635 | |

1636 |
%macro INNER_LOOPFILTER 5 |

1637 |
%if %4 == 8 ; chroma |

1638 |
cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5 |

1639 |
%define dst8_reg r1 |

1640 |
%define mstride_reg r2 |

1641 |
%define E_reg r3 |

1642 |
%define I_reg r4 |

1643 |
%define hev_thr_reg r5 |

1644 |
%else ; luma |

1645 |
cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 |

1646 |
%define mstride_reg r1 |

1647 |
%define E_reg r2 |

1648 |
%define I_reg r3 |

1649 |
%define hev_thr_reg r4 |

1650 |
%ifdef m8 ; x86-64, sse2 |

1651 |
%define dst8_reg r4 |

1652 |
%elif mmsize == 16 ; x86-32, sse2 |

1653 |
%define dst8_reg r5 |

1654 |
%else ; x86-32, mmx/mmxext |

1655 |
%define cnt_reg r5 |

1656 |
%endif |

1657 |
%endif |

1658 |
%define dst_reg r0 |

1659 |
%define stride_reg E_reg |

1660 |
%define dst2_reg I_reg |

1661 |
%ifndef m8 |

1662 |
%define stack_reg hev_thr_reg |

1663 |
%endif |

1664 | |

1665 |
%ifnidn %1, sse2 |

1666 |
%if mmsize == 16 |

1667 |
pxor m7, m7 |

1668 |
%endif |

1669 |
%endif |

1670 | |

1671 |
%ifndef m8 ; mmx/mmxext or sse2 on x86-32 |

1672 |
; splat function arguments |

1673 |
SPLATB_REG m0, E_reg, m7 ; E |

1674 |
SPLATB_REG m1, I_reg, m7 ; I |

1675 |
SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh |

1676 | |

1677 |
; align stack |

1678 |
mov stack_reg, rsp ; backup stack pointer |

1679 |
and rsp, ~(mmsize-1) ; align stack |

1680 |
%ifidn %2, v |

1681 |
sub rsp, mmsize * 4 ; stack layout: [0]=E, [1]=I, [2]=hev_thr |

1682 |
; [3]=hev() result |

1683 |
%else ; h |

1684 |
sub rsp, mmsize * 5 ; extra storage space for transposes |

1685 |
%endif |

1686 | |

1687 |
%define flim_E [rsp] |

1688 |
%define flim_I [rsp+mmsize] |

1689 |
%define hev_thr [rsp+mmsize*2] |

1690 |
%define mask_res [rsp+mmsize*3] |

1691 |
%define p0backup [rsp+mmsize*3] |

1692 |
%define q0backup [rsp+mmsize*4] |

1693 | |

1694 |
mova flim_E, m0 |

1695 |
mova flim_I, m1 |

1696 |
mova hev_thr, m2 |

1697 | |

1698 |
%else ; sse2 on x86-64 |

1699 | |

1700 |
%define flim_E m9 |

1701 |
%define flim_I m10 |

1702 |
%define hev_thr m11 |

1703 |
%define mask_res m12 |

1704 |
%define p0backup m12 |

1705 |
%define q0backup m8 |

1706 | |

1707 |
; splat function arguments |

1708 |
SPLATB_REG flim_E, E_reg, m7 ; E |

1709 |
SPLATB_REG flim_I, I_reg, m7 ; I |

1710 |
SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh |

1711 |
%endif |

1712 | |

1713 |
%if mmsize == 8 && %4 == 16 ; mmx/mmxext |

1714 |
mov cnt_reg, 2 |

1715 |
%endif |

1716 |
mov stride_reg, mstride_reg |

1717 |
neg mstride_reg |

1718 |
%ifidn %2, h |

1719 |
lea dst_reg, [dst_reg + stride_reg*4-4] |

1720 |
%if %4 == 8 |

1721 |
lea dst8_reg, [dst8_reg+ stride_reg*4-4] |

1722 |
%endif |

1723 |
%endif |

1724 | |

1725 |
%if mmsize == 8 |

1726 |
.next8px |

1727 |
%endif |

1728 |
; read |

1729 |
lea dst2_reg, [dst_reg + stride_reg] |

1730 |
%ifidn %2, v |

1731 |
%if %4 == 8 && mmsize == 16 |

1732 |
%define movrow movh |

1733 |
%else |

1734 |
%define movrow mova |

1735 |
%endif |

1736 |
movrow m0, [dst_reg +mstride_reg*4] ; p3 |

1737 |
movrow m1, [dst2_reg+mstride_reg*4] ; p2 |

1738 |
movrow m2, [dst_reg +mstride_reg*2] ; p1 |

1739 |
movrow m5, [dst2_reg] ; q1 |

1740 |
movrow m6, [dst2_reg+ stride_reg] ; q2 |

1741 |
movrow m7, [dst2_reg+ stride_reg*2] ; q3 |

1742 |
%if mmsize == 16 && %4 == 8 |

1743 |
movhps m0, [dst8_reg+mstride_reg*4] |

1744 |
movhps m2, [dst8_reg+mstride_reg*2] |

1745 |
add dst8_reg, stride_reg |

1746 |
movhps m1, [dst8_reg+mstride_reg*4] |

1747 |
movhps m5, [dst8_reg] |

1748 |
movhps m6, [dst8_reg+ stride_reg] |

1749 |
movhps m7, [dst8_reg+ stride_reg*2] |

1750 |
add dst8_reg, mstride_reg |

1751 |
%endif |

1752 |
%elif mmsize == 8 ; mmx/mmxext (h) |

1753 |
; read 8 rows of 8px each |

1754 |
movu m0, [dst_reg +mstride_reg*4] |

1755 |
movu m1, [dst2_reg+mstride_reg*4] |

1756 |
movu m2, [dst_reg +mstride_reg*2] |

1757 |
movu m3, [dst_reg +mstride_reg] |

1758 |
movu m4, [dst_reg] |

1759 |
movu m5, [dst2_reg] |

1760 |
movu m6, [dst2_reg+ stride_reg] |

1761 | |

1762 |
; 8x8 transpose |

1763 |
TRANSPOSE4x4B 0, 1, 2, 3, 7 |

1764 |
mova q0backup, m1 |

1765 |
movu m7, [dst2_reg+ stride_reg*2] |

1766 |
TRANSPOSE4x4B 4, 5, 6, 7, 1 |

1767 |
SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |

1768 |
SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |

1769 |
SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |

1770 |
mova m1, q0backup |

1771 |
mova q0backup, m2 ; store q0 |

1772 |
SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |

1773 |
mova p0backup, m5 ; store p0 |

1774 |
SWAP 1, 4 |

1775 |
SWAP 2, 4 |

1776 |
SWAP 6, 3 |

1777 |
SWAP 5, 3 |

1778 |
%else ; sse2 (h) |

1779 |
%if %4 == 16 |

1780 |
lea dst8_reg, [dst_reg + stride_reg*8] |

1781 |
%endif |

1782 | |

1783 |
; read 16 rows of 8px each, interleave |

1784 |
movh m0, [dst_reg +mstride_reg*4] |

1785 |
movh m1, [dst8_reg+mstride_reg*4] |

1786 |
movh m2, [dst_reg +mstride_reg*2] |

1787 |
movh m5, [dst8_reg+mstride_reg*2] |

1788 |
movh m3, [dst_reg +mstride_reg] |

1789 |
movh m6, [dst8_reg+mstride_reg] |

1790 |
movh m4, [dst_reg] |

1791 |
movh m7, [dst8_reg] |

1792 |
punpcklbw m0, m1 ; A/I |

1793 |
punpcklbw m2, m5 ; C/K |

1794 |
punpcklbw m3, m6 ; D/L |

1795 |
punpcklbw m4, m7 ; E/M |

1796 | |

1797 |
add dst8_reg, stride_reg |

1798 |
movh m1, [dst2_reg+mstride_reg*4] |

1799 |
movh m6, [dst8_reg+mstride_reg*4] |

1800 |
movh m5, [dst2_reg] |

1801 |
movh m7, [dst8_reg] |

1802 |
punpcklbw m1, m6 ; B/J |

1803 |
punpcklbw m5, m7 ; F/N |

1804 |
movh m6, [dst2_reg+ stride_reg] |

1805 |
movh m7, [dst8_reg+ stride_reg] |

1806 |
punpcklbw m6, m7 ; G/O |

1807 | |

1808 |
; 8x16 transpose |

1809 |
TRANSPOSE4x4B 0, 1, 2, 3, 7 |

1810 |
%ifdef m8 |

1811 |
SWAP 1, 8 |

1812 |
%else |

1813 |
mova q0backup, m1 |

1814 |
%endif |

1815 |
movh m7, [dst2_reg+ stride_reg*2] |

1816 |
movh m1, [dst8_reg+ stride_reg*2] |

1817 |
punpcklbw m7, m1 ; H/P |

1818 |
TRANSPOSE4x4B 4, 5, 6, 7, 1 |

1819 |
SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |

1820 |
SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |

1821 |
SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |

1822 |
%ifdef m8 |

1823 |
SWAP 1, 8 |

1824 |
SWAP 2, 8 |

1825 |
%else |

1826 |
mova m1, q0backup |

1827 |
mova q0backup, m2 ; store q0 |

1828 |
%endif |

1829 |
SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |

1830 |
%ifdef m12 |

1831 |
SWAP 5, 12 |

1832 |
%else |

1833 |
mova p0backup, m5 ; store p0 |

1834 |
%endif |

1835 |
SWAP 1, 4 |

1836 |
SWAP 2, 4 |

1837 |
SWAP 6, 3 |

1838 |
SWAP 5, 3 |

1839 |
%endif |

1840 | |

1841 |
; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 |

1842 |
mova m4, m1 |

1843 |
SWAP 4, 1 |

1844 |
psubusb m4, m0 ; p2-p3 |

1845 |
psubusb m0, m1 ; p3-p2 |

1846 |
por m0, m4 ; abs(p3-p2) |

1847 | |

1848 |
mova m4, m2 |

1849 |
SWAP 4, 2 |

1850 |
psubusb m4, m1 ; p1-p2 |

1851 |
psubusb m1, m2 ; p2-p1 |

1852 |
por m1, m4 ; abs(p2-p1) |

1853 | |

1854 |
mova m4, m6 |

1855 |
SWAP 4, 6 |

1856 |
psubusb m4, m7 ; q2-q3 |

1857 |
psubusb m7, m6 ; q3-q2 |

1858 |
por m7, m4 ; abs(q3-q2) |

1859 | |

1860 |
mova m4, m5 |

1861 |
SWAP 4, 5 |

1862 |
psubusb m4, m6 ; q1-q2 |

1863 |
psubusb m6, m5 ; q2-q1 |

1864 |
por m6, m4 ; abs(q2-q1) |

1865 | |

1866 |
%ifidn %1, mmx |

1867 |
mova m4, flim_I |

1868 |
pxor m3, m3 |

1869 |
psubusb m0, m4 |

1870 |
psubusb m1, m4 |

1871 |
psubusb m7, m4 |

1872 |
psubusb m6, m4 |

1873 |
pcmpeqb m0, m3 ; abs(p3-p2) <= I |

1874 |
pcmpeqb m1, m3 ; abs(p2-p1) <= I |

1875 |
pcmpeqb m7, m3 ; abs(q3-q2) <= I |

1876 |
pcmpeqb m6, m3 ; abs(q2-q1) <= I |

1877 |
pand m0, m1 |

1878 |
pand m7, m6 |

1879 |
pand m0, m7 |

1880 |
%else ; mmxext/sse2 |

1881 |
pmaxub m0, m1 |

1882 |
pmaxub m6, m7 |

1883 |
pmaxub m0, m6 |

1884 |
%endif |

1885 | |

1886 |
; normal_limit and high_edge_variance for p1-p0, q1-q0 |

1887 |
SWAP 7, 3 ; now m7 is zero |

1888 |
%ifidn %2, v |

1889 |
movrow m3, [dst_reg +mstride_reg] ; p0 |

1890 |
%if mmsize == 16 && %4 == 8 |

1891 |
movhps m3, [dst8_reg+mstride_reg] |

1892 |
%endif |

1893 |
%elifdef m12 |

1894 |
SWAP 3, 12 |

1895 |
%else |

1896 |
mova m3, p0backup |

1897 |
%endif |

1898 | |

1899 |
mova m1, m2 |

1900 |
SWAP 1, 2 |

1901 |
mova m6, m3 |

1902 |
SWAP 3, 6 |

1903 |
psubusb m1, m3 ; p1-p0 |

1904 |
psubusb m6, m2 ; p0-p1 |

1905 |
por m1, m6 ; abs(p1-p0) |

1906 |
%ifidn %1, mmx |

1907 |
mova m6, m1 |

1908 |
psubusb m1, m4 |

1909 |
psubusb m6, hev_thr |

1910 |
pcmpeqb m1, m7 ; abs(p1-p0) <= I |

1911 |
pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh |

1912 |
pand m0, m1 |

1913 |
mova mask_res, m6 |

1914 |
%else ; mmxext/sse2 |

1915 |
pmaxub m0, m1 ; max_I |

1916 |
SWAP 1, 4 ; max_hev_thresh |

1917 |
%endif |

1918 | |

1919 |
SWAP 6, 4 ; now m6 is I |

1920 |
%ifidn %2, v |

1921 |
movrow m4, [dst_reg] ; q0 |

1922 |
%if mmsize == 16 && %4 == 8 |

1923 |
movhps m4, [dst8_reg] |

1924 |
%endif |

1925 |
%elifdef m8 |

1926 |
SWAP 4, 8 |

1927 |
%else |

1928 |
mova m4, q0backup |

1929 |
%endif |

1930 |
mova m1, m4 |

1931 |
SWAP 1, 4 |

1932 |
mova m7, m5 |

1933 |
SWAP 7, 5 |

1934 |
psubusb m1, m5 ; q0-q1 |

1935 |
psubusb m7, m4 ; q1-q0 |

1936 |
por m1, m7 ; abs(q1-q0) |

1937 |
%ifidn %1, mmx |

1938 |
mova m7, m1 |

1939 |
psubusb m1, m6 |

1940 |
psubusb m7, hev_thr |

1941 |
pxor m6, m6 |

1942 |
pcmpeqb m1, m6 ; abs(q1-q0) <= I |

1943 |
pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh |

1944 |
mova m6, mask_res |

1945 |
pand m0, m1 ; abs([pq][321]-[pq][210]) <= I |

1946 |
pand m6, m7 |

1947 |
%else ; mmxext/sse2 |

1948 |
pxor m7, m7 |

1949 |
pmaxub m0, m1 |

1950 |
pmaxub m6, m1 |

1951 |
psubusb m0, flim_I |

1952 |
psubusb m6, hev_thr |

1953 |
pcmpeqb m0, m7 ; max(abs(..)) <= I |

1954 |
pcmpeqb m6, m7 ; !(max(abs..) > thresh) |

1955 |
%endif |

1956 |
%ifdef m12 |

1957 |
SWAP 6, 12 |

1958 |
%else |

1959 |
mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) |

1960 |
%endif |

1961 | |

1962 |
; simple_limit |

1963 |
mova m1, m3 |

1964 |
SWAP 1, 3 |

1965 |
mova m6, m4 ; keep copies of p0/q0 around for later use |

1966 |
SWAP 6, 4 |

1967 |
psubusb m1, m4 ; p0-q0 |

1968 |
psubusb m6, m3 ; q0-p0 |

1969 |
por m1, m6 ; abs(q0-p0) |

1970 |
paddusb m1, m1 ; m1=2*abs(q0-p0) |

1971 | |

1972 |
mova m7, m2 |

1973 |
SWAP 7, 2 |

1974 |
mova m6, m5 |

1975 |
SWAP 6, 5 |

1976 |
psubusb m7, m5 ; p1-q1 |

1977 |
psubusb m6, m2 ; q1-p1 |

1978 |
por m7, m6 ; abs(q1-p1) |

1979 |
pxor m6, m6 |

1980 |
pand m7, [pb_FE] |

1981 |
psrlq m7, 1 ; abs(q1-p1)/2 |

1982 |
paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 |

1983 |
psubusb m7, flim_E |

1984 |
pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E |

1985 |
pand m0, m7 ; normal_limit result |

1986 | |

1987 |
; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask |

1988 |
%ifdef m8 ; x86-64 && sse2 |

1989 |
mova m8, [pb_80] |

1990 |
%define pb_80_var m8 |

1991 |
%else ; x86-32 or mmx/mmxext |

1992 |
%define pb_80_var [pb_80] |

1993 |
%endif |

1994 |
mova m1, m4 |

1995 |
mova m7, m3 |

1996 |
pxor m1, pb_80_var |

1997 |
pxor m7, pb_80_var |

1998 |
psubsb m1, m7 ; (signed) q0-p0 |

1999 |
mova m6, m2 |

2000 |
mova m7, m5 |

2001 |
pxor m6, pb_80_var |

2002 |
pxor m7, pb_80_var |

2003 |
psubsb m6, m7 ; (signed) p1-q1 |

2004 |
mova m7, mask_res |

2005 |
pandn m7, m6 |

2006 |
paddsb m7, m1 |

2007 |
paddsb m7, m1 |

2008 |
paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1) |

2009 | |

2010 |
pand m7, m0 |

2011 |
mova m1, [pb_F8] |

2012 |
mova m6, m7 |

2013 |
paddsb m7, [pb_3] |

2014 |
paddsb m6, [pb_4] |

2015 |
pand m7, m1 |

2016 |
pand m6, m1 |

2017 | |

2018 |
pxor m1, m1 |

2019 |
pxor m0, m0 |

2020 |
pcmpgtb m1, m7 |

2021 |
psubb m0, m7 |

2022 |
psrlq m7, 3 ; +f2 |

2023 |
psrlq m0, 3 ; -f2 |

2024 |
pand m0, m1 |

2025 |
pandn m1, m7 |

2026 |
psubusb m3, m0 |

2027 |
paddusb m3, m1 ; p0+f2 |

2028 | |

2029 |
pxor m1, m1 |

2030 |
pxor m0, m0 |

2031 |
pcmpgtb m0, m6 |

2032 |
psubb m1, m6 |

2033 |
psrlq m6, 3 ; +f1 |

2034 |
psrlq m1, 3 ; -f1 |

2035 |
pand m1, m0 |

2036 |
pandn m0, m6 |

2037 |
psubusb m4, m0 |

2038 |
paddusb m4, m1 ; q0-f1 |

2039 | |

2040 |
%ifdef m12 |

2041 |
SWAP 6, 12 |

2042 |
%else |

2043 |
mova m6, mask_res |

2044 |
%endif |

2045 |
%ifidn %1, mmx |

2046 |
mova m7, [pb_1] |

2047 |
%else ; mmxext/sse2 |

2048 |
pxor m7, m7 |

2049 |
%endif |

2050 |
pand m0, m6 |

2051 |
pand m1, m6 |

2052 |
%ifidn %1, mmx |

2053 |
paddusb m0, m7 |

2054 |
pand m1, [pb_FE] |

2055 |
pandn m7, m0 |

2056 |
psrlq m1, 1 |

2057 |
psrlq m7, 1 |

2058 |
SWAP 0, 7 |

2059 |
%else ; mmxext/sse2 |

2060 |
psubusb m1, [pb_1] |

2061 |
pavgb m0, m7 ; a |

2062 |
pavgb m1, m7 ; -a |

2063 |
%endif |

2064 |
psubusb m5, m0 |

2065 |
psubusb m2, m1 |

2066 |
paddusb m5, m1 ; q1-a |

2067 |
paddusb m2, m0 ; p1+a |

2068 | |

2069 |
; store |

2070 |
%ifidn %2, v |

2071 |
movrow [dst_reg +mstride_reg*2], m2 |

2072 |
movrow [dst_reg +mstride_reg ], m3 |

2073 |
movrow [dst_reg], m4 |

2074 |
movrow [dst_reg + stride_reg ], m5 |

2075 |
%if mmsize == 16 && %4 == 8 |

2076 |
movhps [dst8_reg+mstride_reg*2], m2 |

2077 |
movhps [dst8_reg+mstride_reg ], m3 |

2078 |
movhps [dst8_reg], m4 |

2079 |
movhps [dst8_reg+ stride_reg ], m5 |

2080 |
%endif |

2081 |
%else ; h |

2082 |
add dst_reg, 2 |

2083 |
add dst2_reg, 2 |

2084 | |

2085 |
; 4x8/16 transpose |

2086 |
TRANSPOSE4x4B 2, 3, 4, 5, 6 |

2087 | |

2088 |
%if mmsize == 8 ; mmx/mmxext (h) |

2089 |
WRITE_4x2D 2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg |

2090 |
%else ; sse2 (h) |

2091 |
lea dst8_reg, [dst8_reg+mstride_reg+2] |

2092 |
WRITE_4x4D 2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4 |

2093 |
%endif |

2094 |
%endif |

2095 | |

2096 |
%if mmsize == 8 |

2097 |
%if %4 == 8 ; chroma |

2098 |
%ifidn %2, h |

2099 |
sub dst_reg, 2 |

2100 |
%endif |

2101 |
cmp dst_reg, dst8_reg |

2102 |
mov dst_reg, dst8_reg |

2103 |
jnz .next8px |

2104 |
%else |

2105 |
%ifidn %2, h |

2106 |
lea dst_reg, [dst_reg + stride_reg*8-2] |

2107 |
%else ; v |

2108 |
add dst_reg, 8 |

2109 |
%endif |

2110 |
dec cnt_reg |

2111 |
jg .next8px |

2112 |
%endif |

2113 |
%endif |

2114 | |

2115 |
%ifndef m8 ; sse2 on x86-32 or mmx/mmxext |

2116 |
mov rsp, stack_reg ; restore stack pointer |

2117 |
%endif |

2118 |
RET |

2119 |
%endmacro |

2120 | |

2121 |
INIT_MMX |

2122 |
%define SPLATB_REG SPLATB_REG_MMX |

2123 |
INNER_LOOPFILTER mmx, v, 6, 16, 0 |

2124 |
INNER_LOOPFILTER mmx, h, 6, 16, 0 |

2125 |
INNER_LOOPFILTER mmx, v, 6, 8, 0 |

2126 |
INNER_LOOPFILTER mmx, h, 6, 8, 0 |

2127 | |

2128 |
%define SPLATB_REG SPLATB_REG_MMXEXT |

2129 |
INNER_LOOPFILTER mmxext, v, 6, 16, 0 |

2130 |
INNER_LOOPFILTER mmxext, h, 6, 16, 0 |

2131 |
INNER_LOOPFILTER mmxext, v, 6, 8, 0 |

2132 |
INNER_LOOPFILTER mmxext, h, 6, 8, 0 |

2133 | |

2134 |
INIT_XMM |

2135 |
%define SPLATB_REG SPLATB_REG_SSE2 |

2136 |
INNER_LOOPFILTER sse2, v, 5, 16, 13 |

2137 |
%ifdef m8 |

2138 |
INNER_LOOPFILTER sse2, h, 5, 16, 13 |

2139 |
%else |

2140 |
INNER_LOOPFILTER sse2, h, 6, 16, 13 |

2141 |
%endif |

2142 |
INNER_LOOPFILTER sse2, v, 6, 8, 13 |

2143 |
INNER_LOOPFILTER sse2, h, 6, 8, 13 |

2144 | |

2145 |
%define SPLATB_REG SPLATB_REG_SSSE3 |

2146 |
INNER_LOOPFILTER ssse3, v, 5, 16, 13 |

2147 |
%ifdef m8 |

2148 |
INNER_LOOPFILTER ssse3, h, 5, 16, 13 |

2149 |
%else |

2150 |
INNER_LOOPFILTER ssse3, h, 6, 16, 13 |

2151 |
%endif |

2152 |
INNER_LOOPFILTER ssse3, v, 6, 8, 13 |

2153 |
INNER_LOOPFILTER ssse3, h, 6, 8, 13 |

2154 | |

2155 |
;----------------------------------------------------------------------------- |

2156 |
; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride, |

2157 |
; int flimE, int flimI, int hev_thr); |

2158 |
;----------------------------------------------------------------------------- |

2159 | |

2160 |
%macro MBEDGE_LOOPFILTER 5 |
; Emits one macroblock-edge loop filter function.
;   %1  ISA suffix appended to the function name (mmx, mmxext, sse2, ssse3,
;       sse4). "mmx" selects the psubusb/pcmpeqb compare path (no pmaxub);
;       any 16-byte variant other than sse2 selects the pmaddubsw path
;       (ssse3_or_higher); "sse4" adds pointer adjustments around WRITE_8W.
;   %2  h or v: part of the function name; v loads/stores whole rows, h
;       loads rows and transposes them before filtering and before storing.
;   %3  forwarded unchanged as the third cglobal argument
;       (presumably the GPR count -- confirm against x86inc.asm).
;   %4  8 emits the chroma function (loop_filter8uv, second destination
;       pointer in dst8_reg = r1); 16 emits the luma function
;       (loop_filter16y).
;   %5  forwarded unchanged as the last cglobal argument
;       (presumably the XMM register count -- confirm against x86inc.asm).
; SPLATB_REG must be %defined before expansion; WRITE_8W as well for the
; 16-byte h variants.
; stride_reg/dst2_reg reuse E_reg/I_reg, and (when m8 is not defined)
; stack_reg reuses hev_thr_reg, so those arguments are splatted into vector
; registers before their GPRs are overwritten.

2161 |
%if %4 == 8 ; chroma |

2162 |
cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5 |

2163 |
%define dst8_reg r1 |

2164 |
%define mstride_reg r2 |

2165 |
%define E_reg r3 |

2166 |
%define I_reg r4 |

2167 |
%define hev_thr_reg r5 |

2168 |
%else ; luma |

2169 |
cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 |

2170 |
%define mstride_reg r1 |

2171 |
%define E_reg r2 |

2172 |
%define I_reg r3 |

2173 |
%define hev_thr_reg r4 |

2174 |
%ifdef m8 ; x86-64, sse2 |

2175 |
%define dst8_reg r4 |

2176 |
%elif mmsize == 16 ; x86-32, sse2 |

2177 |
%define dst8_reg r5 |

2178 |
%else ; x86-32, mmx/mmxext |

2179 |
%define cnt_reg r5 |

2180 |
%endif |

2181 |
%endif |

2182 |
%define dst_reg r0 |

2183 |
%define stride_reg E_reg |

2184 |
%define dst2_reg I_reg |

2185 |
%ifndef m8 |

2186 |
%define stack_reg hev_thr_reg |

2187 |
%endif |

2188 | |

2189 |
%define ssse3_or_higher 0 |

2190 |
%ifnidn %1, sse2 |

2191 |
%if mmsize == 16 |

2192 |
%define ssse3_or_higher 1 |

2193 |
%endif |

2194 |
%endif |

2195 | |

2196 |
%if ssse3_or_higher |

2197 |
pxor m7, m7 |

2198 |
%endif |

2199 | |

2200 |
%ifndef m8 ; mmx/mmxext or sse2 on x86-32 |

2201 |
; splat function arguments |

2202 |
SPLATB_REG m0, E_reg, m7 ; E |

2203 |
SPLATB_REG m1, I_reg, m7 ; I |

2204 |
SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh |

2205 | |

2206 |
; align stack |

2207 |
mov stack_reg, rsp ; backup stack pointer |

2208 |
and rsp, ~(mmsize-1) ; align stack |

2209 |
%if mmsize == 16 |

2210 |
sub rsp, mmsize * 7 |

2211 |
%else |

2212 |
sub rsp, mmsize * 8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr |

2213 |
; [3]=hev() result |

2214 |
; [4]=filter tmp result |

2215 |
; [5]/[6] = p2/q2 backup |

2216 |
; [7]=lim_res sign result |

2217 |
%endif |

2218 | |

2219 |
%define flim_E [rsp] |

2220 |
%define flim_I [rsp+mmsize] |

2221 |
%define hev_thr [rsp+mmsize*2] |

2222 |
%define mask_res [rsp+mmsize*3] |

2223 |
%define lim_res [rsp+mmsize*4] |

2224 |
%define p0backup [rsp+mmsize*3] |

2225 |
%define q0backup [rsp+mmsize*4] |

2226 |
%define p2backup [rsp+mmsize*5] |

2227 |
%define q2backup [rsp+mmsize*6] |

2228 |
%if mmsize == 16 |

2229 |
%define lim_sign [rsp] |

2230 |
%else |

2231 |
%define lim_sign [rsp+mmsize*7] |

2232 |
%endif |

2233 | |

2234 |
mova flim_E, m0 |

2235 |
mova flim_I, m1 |

2236 |
mova hev_thr, m2 |

2237 | |

2238 |
%else ; sse2 on x86-64 |

2239 | |

2240 |
; register aliases: mask_res/p0backup share m12, lim_res/q0backup share m8
; and lim_sign shares m9 with flim_E
%define flim_E m9 |

2241 |
%define flim_I m10 |

2242 |
%define hev_thr m11 |

2243 |
%define mask_res m12 |

2244 |
%define lim_res m8 |

2245 |
%define p0backup m12 |

2246 |
%define q0backup m8 |

2247 |
%define p2backup m13 |

2248 |
%define q2backup m14 |

2249 |
%define lim_sign m9 |

2250 | |

2251 |
; splat function arguments |

2252 |
SPLATB_REG flim_E, E_reg, m7 ; E |

2253 |
SPLATB_REG flim_I, I_reg, m7 ; I |

2254 |
SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh |

2255 |
%endif |

2256 | |

2257 |
%if mmsize == 8 && %4 == 16 ; mmx/mmxext |

2258 |
mov cnt_reg, 2 |

2259 |
%endif |

2260 |
mov stride_reg, mstride_reg |

2261 |
neg mstride_reg |

2262 |
%ifidn %2, h |

2263 |
lea dst_reg, [dst_reg + stride_reg*4-4] |

2264 |
%if %4 == 8 |

2265 |
lea dst8_reg, [dst8_reg+ stride_reg*4-4] |

2266 |
%endif |

2267 |
%endif |

2268 | |

2269 |
%if mmsize == 8 |

2270 |
.next8px |

2271 |
%endif |

2272 |
; read |

2273 |
lea dst2_reg, [dst_reg + stride_reg] |

2274 |
%ifidn %2, v |

2275 |
%if %4 == 8 && mmsize == 16 |

2276 |
%define movrow movh |

2277 |
%else |

2278 |
%define movrow mova |

2279 |
%endif |

2280 |
movrow m0, [dst_reg +mstride_reg*4] ; p3 |

2281 |
movrow m1, [dst2_reg+mstride_reg*4] ; p2 |

2282 |
movrow m2, [dst_reg +mstride_reg*2] ; p1 |

2283 |
movrow m5, [dst2_reg] ; q1 |

2284 |
movrow m6, [dst2_reg+ stride_reg] ; q2 |

2285 |
movrow m7, [dst2_reg+ stride_reg*2] ; q3 |

2286 |
%if mmsize == 16 && %4 == 8 |

2287 |
movhps m0, [dst8_reg+mstride_reg*4] |

2288 |
movhps m2, [dst8_reg+mstride_reg*2] |

2289 |
add dst8_reg, stride_reg |

2290 |
movhps m1, [dst8_reg+mstride_reg*4] |

2291 |
movhps m5, [dst8_reg] |

2292 |
movhps m6, [dst8_reg+ stride_reg] |

2293 |
movhps m7, [dst8_reg+ stride_reg*2] |

2294 |
add dst8_reg, mstride_reg |

2295 |
%endif |

2296 |
%elif mmsize == 8 ; mmx/mmxext (h) |

2297 |
; read 8 rows of 8px each |

2298 |
movu m0, [dst_reg +mstride_reg*4] |

2299 |
movu m1, [dst2_reg+mstride_reg*4] |

2300 |
movu m2, [dst_reg +mstride_reg*2] |

2301 |
movu m3, [dst_reg +mstride_reg] |

2302 |
movu m4, [dst_reg] |

2303 |
movu m5, [dst2_reg] |

2304 |
movu m6, [dst2_reg+ stride_reg] |

2305 | |

2306 |
; 8x8 transpose |

2307 |
TRANSPOSE4x4B 0, 1, 2, 3, 7 |

2308 |
mova q0backup, m1 |

2309 |
movu m7, [dst2_reg+ stride_reg*2] |

2310 |
TRANSPOSE4x4B 4, 5, 6, 7, 1 |

2311 |
SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |

2312 |
SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |

2313 |
SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |

2314 |
mova m1, q0backup |

2315 |
mova q0backup, m2 ; store q0 |

2316 |
SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |

2317 |
mova p0backup, m5 ; store p0 |

2318 |
SWAP 1, 4 |

2319 |
SWAP 2, 4 |

2320 |
SWAP 6, 3 |

2321 |
SWAP 5, 3 |

2322 |
%else ; sse2 (h) |

2323 |
%if %4 == 16 |

2324 |
lea dst8_reg, [dst_reg + stride_reg*8] |

2325 |
%endif |

2326 | |

2327 |
; read 16 rows of 8px each, interleave |

2328 |
movh m0, [dst_reg +mstride_reg*4] |

2329 |
movh m1, [dst8_reg+mstride_reg*4] |

2330 |
movh m2, [dst_reg +mstride_reg*2] |

2331 |
movh m5, [dst8_reg+mstride_reg*2] |

2332 |
movh m3, [dst_reg +mstride_reg] |

2333 |
movh m6, [dst8_reg+mstride_reg] |

2334 |
movh m4, [dst_reg] |

2335 |
movh m7, [dst8_reg] |

2336 |
punpcklbw m0, m1 ; A/I |

2337 |
punpcklbw m2, m5 ; C/K |

2338 |
punpcklbw m3, m6 ; D/L |

2339 |
punpcklbw m4, m7 ; E/M |

2340 | |

2341 |
add dst8_reg, stride_reg |

2342 |
movh m1, [dst2_reg+mstride_reg*4] |

2343 |
movh m6, [dst8_reg+mstride_reg*4] |

2344 |
movh m5, [dst2_reg] |

2345 |
movh m7, [dst8_reg] |

2346 |
punpcklbw m1, m6 ; B/J |

2347 |
punpcklbw m5, m7 ; F/N |

2348 |
movh m6, [dst2_reg+ stride_reg] |

2349 |
movh m7, [dst8_reg+ stride_reg] |

2350 |
punpcklbw m6, m7 ; G/O |

2351 | |

2352 |
; 8x16 transpose |

2353 |
TRANSPOSE4x4B 0, 1, 2, 3, 7 |

2354 |
%ifdef m8 |

2355 |
SWAP 1, 8 |

2356 |
%else |

2357 |
mova q0backup, m1 |

2358 |
%endif |

2359 |
movh m7, [dst2_reg+ stride_reg*2] |

2360 |
movh m1, [dst8_reg+ stride_reg*2] |

2361 |
punpcklbw m7, m1 ; H/P |

2362 |
TRANSPOSE4x4B 4, 5, 6, 7, 1 |

2363 |
SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |

2364 |
SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |

2365 |
SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |

2366 |
%ifdef m8 |

2367 |
SWAP 1, 8 |

2368 |
SWAP 2, 8 |

2369 |
%else |

2370 |
mova m1, q0backup |

2371 |
mova q0backup, m2 ; store q0 |

2372 |
%endif |

2373 |
SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |

2374 |
%ifdef m12 |

2375 |
SWAP 5, 12 |

2376 |
%else |

2377 |
mova p0backup, m5 ; store p0 |

2378 |
%endif |

2379 |
SWAP 1, 4 |

2380 |
SWAP 2, 4 |

2381 |
SWAP 6, 3 |

2382 |
SWAP 5, 3 |

2383 |
%endif |

2384 | |

2385 |
; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 |

2386 |
mova m4, m1 |

2387 |
SWAP 4, 1 |

2388 |
psubusb m4, m0 ; p2-p3 |

2389 |
psubusb m0, m1 ; p3-p2 |

2390 |
por m0, m4 ; abs(p3-p2) |

2391 | |

2392 |
mova m4, m2 |

2393 |
SWAP 4, 2 |

2394 |
psubusb m4, m1 ; p1-p2 |

2395 |
mova p2backup, m1 |

2396 |
psubusb m1, m2 ; p2-p1 |

2397 |
por m1, m4 ; abs(p2-p1) |

2398 | |

2399 |
mova m4, m6 |

2400 |
SWAP 4, 6 |

2401 |
psubusb m4, m7 ; q2-q3 |

2402 |
psubusb m7, m6 ; q3-q2 |

2403 |
por m7, m4 ; abs(q3-q2) |

2404 | |

2405 |
mova m4, m5 |

2406 |
SWAP 4, 5 |

2407 |
psubusb m4, m6 ; q1-q2 |

2408 |
mova q2backup, m6 |

2409 |
psubusb m6, m5 ; q2-q1 |

2410 |
por m6, m4 ; abs(q2-q1) |

2411 | |

2412 |
%ifidn %1, mmx |

2413 |
mova m4, flim_I |

2414 |
pxor m3, m3 |

2415 |
psubusb m0, m4 |

2416 |
psubusb m1, m4 |

2417 |
psubusb m7, m4 |

2418 |
psubusb m6, m4 |

2419 |
pcmpeqb m0, m3 ; abs(p3-p2) <= I |

2420 |
pcmpeqb m1, m3 ; abs(p2-p1) <= I |

2421 |
pcmpeqb m7, m3 ; abs(q3-q2) <= I |

2422 |
pcmpeqb m6, m3 ; abs(q2-q1) <= I |

2423 |
pand m0, m1 |

2424 |
pand m7, m6 |

2425 |
pand m0, m7 |

2426 |
%else ; mmxext/sse2 |

2427 |
pmaxub m0, m1 |

2428 |
pmaxub m6, m7 |

2429 |
pmaxub m0, m6 |

2430 |
%endif |

2431 | |

2432 |
; normal_limit and high_edge_variance for p1-p0, q1-q0 |

2433 |
SWAP 7, 3 ; now m7 is zero (mmx path only, where m3 was cleared above) |

2434 |
%ifidn %2, v |

2435 |
movrow m3, [dst_reg +mstride_reg] ; p0 |

2436 |
%if mmsize == 16 && %4 == 8 |

2437 |
movhps m3, [dst8_reg+mstride_reg] |

2438 |
%endif |

2439 |
%elifdef m12 |

2440 |
SWAP 3, 12 |

2441 |
%else |

2442 |
mova m3, p0backup |

2443 |
%endif |

2444 | |

2445 |
mova m1, m2 |

2446 |
SWAP 1, 2 |

2447 |
mova m6, m3 |

2448 |
SWAP 3, 6 |

2449 |
psubusb m1, m3 ; p1-p0 |

2450 |
psubusb m6, m2 ; p0-p1 |

2451 |
por m1, m6 ; abs(p1-p0) |

2452 |
%ifidn %1, mmx |

2453 |
mova m6, m1 |

2454 |
psubusb m1, m4 |

2455 |
psubusb m6, hev_thr |

2456 |
pcmpeqb m1, m7 ; abs(p1-p0) <= I |

2457 |
pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh |

2458 |
pand m0, m1 |

2459 |
mova mask_res, m6 |

2460 |
%else ; mmxext/sse2 |

2461 |
pmaxub m0, m1 ; max_I |

2462 |
SWAP 1, 4 ; max_hev_thresh |

2463 |
%endif |

2464 | |

2465 |
SWAP 6, 4 ; now m6 is I |

2466 |
%ifidn %2, v |

2467 |
movrow m4, [dst_reg] ; q0 |

2468 |
%if mmsize == 16 && %4 == 8 |

2469 |
movhps m4, [dst8_reg] |

2470 |
%endif |

2471 |
%elifdef m8 |

2472 |
SWAP 4, 8 |

2473 |
%else |

2474 |
mova m4, q0backup |

2475 |
%endif |

2476 |
mova m1, m4 |

2477 |
SWAP 1, 4 |

2478 |
mova m7, m5 |

2479 |
SWAP 7, 5 |

2480 |
psubusb m1, m5 ; q0-q1 |

2481 |
psubusb m7, m4 ; q1-q0 |

2482 |
por m1, m7 ; abs(q1-q0) |

2483 |
%ifidn %1, mmx |

2484 |
mova m7, m1 |

2485 |
psubusb m1, m6 |

2486 |
psubusb m7, hev_thr |

2487 |
pxor m6, m6 |

2488 |
pcmpeqb m1, m6 ; abs(q1-q0) <= I |

2489 |
pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh |

2490 |
mova m6, mask_res |

2491 |
pand m0, m1 ; abs([pq][321]-[pq][210]) <= I |

2492 |
pand m6, m7 |

2493 |
%else ; mmxext/sse2 |

2494 |
pxor m7, m7 |

2495 |
pmaxub m0, m1 |

2496 |
pmaxub m6, m1 |

2497 |
psubusb m0, flim_I |

2498 |
psubusb m6, hev_thr |

2499 |
pcmpeqb m0, m7 ; max(abs(..)) <= I |

2500 |
pcmpeqb m6, m7 ; !(max(abs..) > thresh) |

2501 |
%endif |

2502 |
%ifdef m12 |

2503 |
SWAP 6, 12 |

2504 |
%else |

2505 |
mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) |

2506 |
%endif |

2507 | |

2508 |
; simple_limit |

2509 |
mova m1, m3 |

2510 |
SWAP 1, 3 |

2511 |
mova m6, m4 ; keep copies of p0/q0 around for later use |

2512 |
SWAP 6, 4 |

2513 |
psubusb m1, m4 ; p0-q0 |

2514 |
psubusb m6, m3 ; q0-p0 |

2515 |
por m1, m6 ; abs(q0-p0) |

2516 |
paddusb m1, m1 ; m1=2*abs(q0-p0) |

2517 | |

2518 |
mova m7, m2 |

2519 |
SWAP 7, 2 |

2520 |
mova m6, m5 |

2521 |
SWAP 6, 5 |

2522 |
psubusb m7, m5 ; p1-q1 |

2523 |
psubusb m6, m2 ; q1-p1 |

2524 |
por m7, m6 ; abs(q1-p1) |

2525 |
pxor m6, m6 |

2526 |
pand m7, [pb_FE] |

2527 |
psrlq m7, 1 ; abs(q1-p1)/2 |

2528 |
paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 |

2529 |
psubusb m7, flim_E |

2530 |
pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E |

2531 |
pand m0, m7 ; normal_limit result |

2532 | |

2533 |
; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask |

2534 |
%ifdef m8 ; x86-64 && sse2 |

2535 |
mova m8, [pb_80] |

2536 |
%define pb_80_var m8 |

2537 |
%else ; x86-32 or mmx/mmxext |

2538 |
%define pb_80_var [pb_80] |

2539 |
%endif |

2540 |
mova m1, m4 |

2541 |
mova m7, m3 |

2542 |
pxor m1, pb_80_var |

2543 |
pxor m7, pb_80_var |

2544 |
psubsb m1, m7 ; (signed) q0-p0 |

2545 |
mova m6, m2 |

2546 |
mova m7, m5 |

2547 |
pxor m6, pb_80_var |

2548 |
pxor m7, pb_80_var |

2549 |
psubsb m6, m7 ; (signed) p1-q1 |

2550 |
mova m7, mask_res |

2551 |
paddsb m6, m1 |

2552 |
paddsb m6, m1 |

2553 |
paddsb m6, m1 |

2554 |
pand m6, m0 |

2555 |
%ifdef m8 |

2556 |
mova lim_res, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_mbedge |

2557 |
pand lim_res, m7 |

2558 |
%else |

2559 |
mova m0, m6 |

2560 |
pand m0, m7 |

2561 |
mova lim_res, m0 |

2562 |
%endif |

2563 |
pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common |

2564 | |

2565 |
mova m1, [pb_F8] |

2566 |
mova m6, m7 |

2567 |
paddsb m7, [pb_3] |

2568 |
paddsb m6, [pb_4] |

2569 |
pand m7, m1 |

2570 |
pand m6, m1 |

2571 | |

2572 |
pxor m1, m1 |

2573 |
pxor m0, m0 |

2574 |
pcmpgtb m1, m7 |

2575 |
psubb m0, m7 |

2576 |
psrlq m7, 3 ; +f2 |

2577 |
psrlq m0, 3 ; -f2 |

2578 |
pand m0, m1 |

2579 |
pandn m1, m7 |

2580 |
psubusb m3, m0 |

2581 |
paddusb m3, m1 ; p0+f2 |

2582 | |

2583 |
pxor m1, m1 |

2584 |
pxor m0, m0 |

2585 |
pcmpgtb m0, m6 |

2586 |
psubb m1, m6 |

2587 |
psrlq m6, 3 ; +f1 |

2588 |
psrlq m1, 3 ; -f1 |

2589 |
pand m1, m0 |

2590 |
pandn m0, m6 |

2591 |
psubusb m4, m0 |

2592 |
paddusb m4, m1 ; q0-f1 |

2593 | |

2594 |
; filter_mbedge (m2-m5 = p1-q1; lim_res carries w) |

2595 |
%if ssse3_or_higher |

2596 |
mova m7, [pb_1] |

2597 |
%else |

2598 |
mova m7, [pw_63] |

2599 |
%endif |

2600 |
%ifdef m8 |

2601 |
SWAP 1, 8 |

2602 |
%else |

2603 |
mova m1, lim_res |

2604 |
%endif |

2605 |
pxor m0, m0 |

2606 |
mova m6, m1 |

2607 |
pcmpgtb m0, m1 ; which are negative |

2608 |
%if ssse3_or_higher |

2609 |
punpcklbw m6, m7 ; interleave with "1" for rounding |

2610 |
punpckhbw m1, m7 |

2611 |
%else |

2612 |
punpcklbw m6, m0 ; signed byte->word |

2613 |
punpckhbw m1, m0 |

2614 |
%endif |

2615 |
mova lim_sign, m0 |

2616 |
%if ssse3_or_higher |

2617 |
mova m7, [pb_27_63] |

2618 |
%ifndef m8 |

2619 |
mova lim_res, m1 |

2620 |
%endif |

2621 |
%ifdef m10 |

2622 |
SWAP 0, 10 ; don't lose lim_sign copy |

2623 |
%endif |

2624 |
mova m0, m7 |

2625 |
pmaddubsw m7, m6 |

2626 |
SWAP 6, 7 |

2627 |
pmaddubsw m0, m1 |

2628 |
SWAP 1, 0 |

2629 |
%ifdef m10 |

2630 |
SWAP 0, 10 |

2631 |
%else |

2632 |
mova m0, lim_sign |

2633 |
%endif |

2634 |
%else |

2635 |
mova mask_res, m6 ; backup for later in filter |

2636 |
mova lim_res, m1 |

2637 |
pmullw m6, [pw_27] |

2638 |
pmullw m1, [pw_27] |

2639 |
paddw m6, m7 |

2640 |
paddw m1, m7 |

2641 |
%endif |

2642 |
psraw m6, 7 |

2643 |
psraw m1, 7 |

2644 |
packsswb m6, m1 ; a0 |

2645 |
pxor m1, m1 |

2646 |
psubb m1, m6 |

2647 |
pand m1, m0 ; -a0 |

2648 |
pandn m0, m6 ; +a0 |

2649 |
%if ssse3_or_higher |

2650 |
mova m6, [pb_18_63] ; pipelining |

2651 |
%endif |

2652 |
psubusb m3, m1 |

2653 |
paddusb m4, m1 |

2654 |
paddusb m3, m0 ; p0+a0 |

2655 |
psubusb m4, m0 ; q0-a0 |

2656 | |

2657 |
%if ssse3_or_higher |

2658 |
SWAP 6, 7 |

2659 |
%ifdef m10 |

2660 |
SWAP 1, 10 |

2661 |
%else |

2662 |
mova m1, lim_res |

2663 |
%endif |

2664 |
mova m0, m7 |

2665 |
pmaddubsw m7, m6 |

2666 |
SWAP 6, 7 |

2667 |
pmaddubsw m0, m1 |

2668 |
SWAP 1, 0 |

2669 |
%ifdef m10 |

2670 |
SWAP 0, 10 |

2671 |
%endif |

2672 |
mova m0, lim_sign |

2673 |
%else |

2674 |
mova m6, mask_res |

2675 |
mova m1, lim_res |

2676 |
pmullw m6, [pw_18] |

2677 |
pmullw m1, [pw_18] |

2678 |
paddw m6, m7 |

2679 |
paddw m1, m7 |

2680 |
%endif |

2681 |
mova m0, lim_sign |

2682 |
psraw m6, 7 |

2683 |
psraw m1, 7 |

2684 |
packsswb m6, m1 ; a1 |

2685 |
pxor m1, m1 |

2686 |
psubb m1, m6 |

2687 |
pand m1, m0 ; -a1 |

2688 |
pandn m0, m6 ; +a1 |

2689 |
%if ssse3_or_higher |

2690 |
mova m6, [pb_9_63] |

2691 |
%endif |

2692 |
psubusb m2, m1 |

2693 |
paddusb m5, m1 |

2694 |
paddusb m2, m0 ; p1+a1 |

2695 |
psubusb m5, m0 ; q1-a1 |

2696 | |

2697 |
%if ssse3_or_higher |

2698 |
SWAP 6, 7 |

2699 |
%ifdef m10 |

2700 |
SWAP 1, 10 |

2701 |
%else |

2702 |
mova m1, lim_res |

2703 |
%endif |

2704 |
mova m0, m7 |

2705 |
pmaddubsw m7, m6 |

2706 |
SWAP 6, 7 |

2707 |
pmaddubsw m0, m1 |

2708 |
SWAP 1, 0 |

2709 |
%else |

2710 |
%ifdef m8 |

2711 |
SWAP 6, 12 |

2712 |
SWAP 1, 8 |

2713 |
%else |

2714 |
mova m6, mask_res |

2715 |
mova m1, lim_res |

2716 |
%endif |

2717 |
pmullw m6, [pw_9] |

2718 |
pmullw m1, [pw_9] |

2719 |
paddw m6, m7 |

2720 |
paddw m1, m7 |

2721 |
%endif |

2722 |
%ifdef m9 |

2723 |
SWAP 7, 9 |

2724 |
%else |

2725 |
mova m7, lim_sign |

2726 |
%endif |

2727 |
psraw m6, 7 |

2728 |
psraw m1, 7 |

2729 |
packsswb m6, m1 ; a2 (9-weighted term, applied to p2/q2 below) |

2730 |
pxor m0, m0 |

2731 |
psubb m0, m6 |

2732 |
pand m0, m7 ; -a2 |

2733 |
pandn m7, m6 ; +a2 |

2734 |
%ifdef m8 |

2735 |
SWAP 1, 13 |

2736 |
SWAP 6, 14 |

2737 |
%else |

2738 |
mova m1, p2backup |

2739 |
mova m6, q2backup |

2740 |
%endif |

2741 |
psubusb m1, m0 |

2742 |
paddusb m6, m0 |

2743 |
paddusb m1, m7 ; p2+a2 |

2744 |
psubusb m6, m7 ; q2-a2 |

2745 | |

2746 |
; store |

2747 |
%ifidn %2, v |

2748 |
movrow [dst2_reg+mstride_reg*4], m1 |

2749 |
movrow [dst_reg +mstride_reg*2], m2 |

2750 |
movrow [dst_reg +mstride_reg ], m3 |

2751 |
movrow [dst_reg], m4 |

2752 |
movrow [dst2_reg], m5 |

2753 |
movrow [dst2_reg+ stride_reg ], m6 |

2754 |
%if mmsize == 16 && %4 == 8 |

2755 |
add dst8_reg, mstride_reg |

2756 |
movhps [dst8_reg+mstride_reg*2], m1 |

2757 |
movhps [dst8_reg+mstride_reg ], m2 |

2758 |
movhps [dst8_reg], m3 |

2759 |
add dst8_reg, stride_reg |

2760 |
movhps [dst8_reg], m4 |

2761 |
movhps [dst8_reg+ stride_reg ], m5 |

2762 |
movhps [dst8_reg+ stride_reg*2], m6 |

2763 |
%endif |

2764 |
%else ; h |

2765 |
inc dst_reg |

2766 |
inc dst2_reg |

2767 | |

2768 |
; 4x8/16 transpose |

2769 |
TRANSPOSE4x4B 1, 2, 3, 4, 0 |

2770 |
SBUTTERFLY bw, 5, 6, 0 |

2771 | |

2772 |
%if mmsize == 8 ; mmx/mmxext (h) |

2773 |
WRITE_4x2D 1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg |

2774 |
add dst_reg, 4 |

2775 |
WRITE_2x4W m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg |

2776 |
%else ; sse2 (h) |

2777 |
lea dst8_reg, [dst8_reg+mstride_reg+1] |

2778 |
WRITE_4x4D 1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4 |

2779 |
lea dst_reg, [dst2_reg+mstride_reg+4] |

2780 |
lea dst8_reg, [dst8_reg+mstride_reg+4] |

2781 |
%ifidn %1, sse4 |

2782 |
add dst2_reg, 4 |

2783 |
%endif |

2784 |
WRITE_8W m5, dst2_reg, dst_reg, mstride_reg, stride_reg |

2785 |
%ifidn %1, sse4 |

2786 |
lea dst2_reg, [dst8_reg+ stride_reg] |

2787 |
%endif |

2788 |
WRITE_8W m6, dst2_reg, dst8_reg, mstride_reg, stride_reg |

2789 |
%endif |

2790 |
%endif |

2791 | |

2792 |
; 8-byte (mmx/mmxext) builds filter 8 pixels per pass and loop back to
; .next8px: chroma repeats once with dst_reg = dst8_reg (the cmp result is
; consumed by jnz; the mov in between leaves the flags alone), luma runs
; cnt_reg (initialised to 2) passes.
%if mmsize == 8 |

2793 |
%if %4 == 8 ; chroma |

2794 |
%ifidn %2, h |

2795 |
sub dst_reg, 5 |

2796 |
%endif |

2797 |
cmp dst_reg, dst8_reg |

2798 |
mov dst_reg, dst8_reg |

2799 |
jnz .next8px |

2800 |
%else |

2801 |
%ifidn %2, h |

2802 |
lea dst_reg, [dst_reg + stride_reg*8-5] |

2803 |
%else ; v |

2804 |
add dst_reg, 8 |

2805 |
%endif |

2806 |
dec cnt_reg |

2807 |
jg .next8px |

2808 |
%endif |

2809 |
%endif |

2810 | |

2811 |
%ifndef m8 ; sse2 on x86-32 or mmx/mmxext |

2812 |
mov rsp, stack_reg ; restore stack pointer |

2813 |
%endif |

2814 |
RET |

2815 |
%endmacro |

2816 | |

2817 |
INIT_MMX |
; Expand MBEDGE_LOOPFILTER (opt, dir, cglobal arg, size, cglobal arg) once per
; ISA / direction / size combination. SPLATB_REG -- and WRITE_8W for the
; 16-byte builds -- are re-%defined ahead of each group so that the macro body
; picks up the helper matching that ISA.

2818 |
%define SPLATB_REG SPLATB_REG_MMX |

2819 |
MBEDGE_LOOPFILTER mmx, v, 6, 16, 0 |

2820 |
MBEDGE_LOOPFILTER mmx, h, 6, 16, 0 |

2821 |
MBEDGE_LOOPFILTER mmx, v, 6, 8, 0 |

2822 |
MBEDGE_LOOPFILTER mmx, h, 6, 8, 0 |

2823 | |

2824 |
%define SPLATB_REG SPLATB_REG_MMXEXT |

2825 |
MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0 |

2826 |
MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0 |

2827 |
MBEDGE_LOOPFILTER mmxext, v, 6, 8, 0 |

2828 |
MBEDGE_LOOPFILTER mmxext, h, 6, 8, 0 |

2829 | |

2830 |
INIT_XMM |

2831 |
%define SPLATB_REG SPLATB_REG_SSE2 |

2832 |
%define WRITE_8W WRITE_8W_SSE2 |

2833 |
MBEDGE_LOOPFILTER sse2, v, 5, 16, 15 |

2834 |
; h/16 variant: the macro places dst8_reg in r4 when m8 is defined and in r5
; otherwise, hence 5 vs 6 as the third argument (presumably the GPR count
; handed to cglobal -- confirm against x86inc.asm).
%ifdef m8 |

2835 |
MBEDGE_LOOPFILTER sse2, h, 5, 16, 15 |

2836 |
%else |

2837 |
MBEDGE_LOOPFILTER sse2, h, 6, 16, 15 |

2838 |
%endif |

2839 |
MBEDGE_LOOPFILTER sse2, v, 6, 8, 15 |

2840 |
MBEDGE_LOOPFILTER sse2, h, 6, 8, 15 |

2841 | |

2842 |
%define SPLATB_REG SPLATB_REG_SSSE3 |

2843 |
MBEDGE_LOOPFILTER ssse3, v, 5, 16, 15 |

2844 |
; same 5-vs-6 selection as for the sse2 h/16 variant
%ifdef m8 |

2845 |
MBEDGE_LOOPFILTER ssse3, h, 5, 16, 15 |

2846 |
%else |

2847 |
MBEDGE_LOOPFILTER ssse3, h, 6, 16, 15 |

2848 |
%endif |

2849 |
MBEDGE_LOOPFILTER ssse3, v, 6, 8, 15 |

2850 |
MBEDGE_LOOPFILTER ssse3, h, 6, 8, 15 |

2851 | |

2852 |
; sse4: only the h variants are instantiated, with WRITE_8W switched to its
; SSE4 implementation; SPLATB_REG stays at the SSSE3 helper defined above.
%define WRITE_8W WRITE_8W_SSE4 |

2853 |
%ifdef m8 |

2854 |
MBEDGE_LOOPFILTER sse4, h, 5, 16, 15 |

2855 |
%else |

2856 |
MBEDGE_LOOPFILTER sse4, h, 6, 16, 15 |

2857 |
%endif |

2858 |
MBEDGE_LOOPFILTER sse4, h, 6, 8, 15