## ffmpeg / libavcodec / x86 / vp8dsp.asm @ 3ae079a3

History | View | Annotate | Download (76.5 KB)

1 |
;****************************************************************************** |
---|---|

2 |
;* VP8 MMXEXT optimizations |

3 |
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> |

4 |
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> |

5 |
;* |

6 |
;* This file is part of FFmpeg. |

7 |
;* |

8 |
;* FFmpeg is free software; you can redistribute it and/or |

9 |
;* modify it under the terms of the GNU Lesser General Public |

10 |
;* License as published by the Free Software Foundation; either |

11 |
;* version 2.1 of the License, or (at your option) any later version. |

12 |
;* |

13 |
;* FFmpeg is distributed in the hope that it will be useful, |

14 |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |

15 |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |

16 |
;* Lesser General Public License for more details. |

17 |
;* |

18 |
;* You should have received a copy of the GNU Lesser General Public |

19 |
;* License along with FFmpeg; if not, write to the Free Software |

20 |
;* License along with FFmpeg; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |

21 |
;****************************************************************************** |

22 | |

23 |
%include "x86inc.asm" |

24 |
%include "x86util.asm" |

25 | |

26 |
SECTION_RODATA |

27 | |

28 |
fourtap_filter_hw_m: times 4 dw -6, 123 |

29 |
times 4 dw 12, -1 |

30 |
times 4 dw -9, 93 |

31 |
times 4 dw 50, -6 |

32 |
times 4 dw -6, 50 |

33 |
times 4 dw 93, -9 |

34 |
times 4 dw -1, 12 |

35 |
times 4 dw 123, -6 |

36 | |

37 |
sixtap_filter_hw_m: times 4 dw 2, -11 |

38 |
times 4 dw 108, 36 |

39 |
times 4 dw -8, 1 |

40 |
times 4 dw 3, -16 |

41 |
times 4 dw 77, 77 |

42 |
times 4 dw -16, 3 |

43 |
times 4 dw 1, -8 |

44 |
times 4 dw 36, 108 |

45 |
times 4 dw -11, 2 |

46 | |

47 |
fourtap_filter_hb_m: times 8 db -6, 123 |

48 |
times 8 db 12, -1 |

49 |
times 8 db -9, 93 |

50 |
times 8 db 50, -6 |

51 |
times 8 db -6, 50 |

52 |
times 8 db 93, -9 |

53 |
times 8 db -1, 12 |

54 |
times 8 db 123, -6 |

55 | |

56 |
sixtap_filter_hb_m: times 8 db 2, 1 |

57 |
times 8 db -11, 108 |

58 |
times 8 db 36, -8 |

59 |
times 8 db 3, 3 |

60 |
times 8 db -16, 77 |

61 |
times 8 db 77, -16 |

62 |
times 8 db 1, 2 |

63 |
times 8 db -8, 36 |

64 |
times 8 db 108, -11 |

65 | |

66 |
fourtap_filter_v_m: times 8 dw -6 |

67 |
times 8 dw 123 |

68 |
times 8 dw 12 |

69 |
times 8 dw -1 |

70 |
times 8 dw -9 |

71 |
times 8 dw 93 |

72 |
times 8 dw 50 |

73 |
times 8 dw -6 |

74 |
times 8 dw -6 |

75 |
times 8 dw 50 |

76 |
times 8 dw 93 |

77 |
times 8 dw -9 |

78 |
times 8 dw -1 |

79 |
times 8 dw 12 |

80 |
times 8 dw 123 |

81 |
times 8 dw -6 |

82 | |

83 |
sixtap_filter_v_m: times 8 dw 2 |

84 |
times 8 dw -11 |

85 |
times 8 dw 108 |

86 |
times 8 dw 36 |

87 |
times 8 dw -8 |

88 |
times 8 dw 1 |

89 |
times 8 dw 3 |

90 |
times 8 dw -16 |

91 |
times 8 dw 77 |

92 |
times 8 dw 77 |

93 |
times 8 dw -16 |

94 |
times 8 dw 3 |

95 |
times 8 dw 1 |

96 |
times 8 dw -8 |

97 |
times 8 dw 36 |

98 |
times 8 dw 108 |

99 |
times 8 dw -11 |

100 |
times 8 dw 2 |

101 | |

102 |
bilinear_filter_vw_m: times 8 dw 1 |

103 |
times 8 dw 2 |

104 |
times 8 dw 3 |

105 |
times 8 dw 4 |

106 |
times 8 dw 5 |

107 |
times 8 dw 6 |

108 |
times 8 dw 7 |

109 | |

110 |
bilinear_filter_vb_m: times 8 db 7, 1 |

111 |
times 8 db 6, 2 |

112 |
times 8 db 5, 3 |

113 |
times 8 db 4, 4 |

114 |
times 8 db 3, 5 |

115 |
times 8 db 2, 6 |

116 |
times 8 db 1, 7 |

117 | |

118 |
%ifdef PIC |

119 |
%define fourtap_filter_hw r11 |

120 |
%define sixtap_filter_hw r11 |

121 |
%define fourtap_filter_hb r11 |

122 |
%define sixtap_filter_hb r11 |

123 |
%define fourtap_filter_v r11 |

124 |
%define sixtap_filter_v r11 |

125 |
%define bilinear_filter_vw r11 |

126 |
%define bilinear_filter_vb r11 |

127 |
%else |

128 |
%define fourtap_filter_hw fourtap_filter_hw_m |

129 |
%define sixtap_filter_hw sixtap_filter_hw_m |

130 |
%define fourtap_filter_hb fourtap_filter_hb_m |

131 |
%define sixtap_filter_hb sixtap_filter_hb_m |

132 |
%define fourtap_filter_v fourtap_filter_v_m |

133 |
%define sixtap_filter_v sixtap_filter_v_m |

134 |
%define bilinear_filter_vw bilinear_filter_vw_m |

135 |
%define bilinear_filter_vb bilinear_filter_vb_m |

136 |
%endif |

137 | |

138 |
filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |

139 |
filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 |

140 | |

141 |
filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 |

142 |
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 |

143 |
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 |

144 | |

145 |
pw_20091: times 4 dw 20091 |

146 |
pw_17734: times 4 dw 17734 |

147 | |

148 |
cextern pb_1 |

149 |
cextern pw_3 |

150 |
cextern pb_3 |

151 |
cextern pw_4 |

152 |
cextern pb_4 |

153 |
cextern pw_9 |

154 |
cextern pw_18 |

155 |
cextern pw_27 |

156 |
cextern pw_63 |

157 |
cextern pw_64 |

158 |
cextern pb_80 |

159 |
cextern pb_F8 |

160 |
cextern pb_FE |

161 | |

162 |
SECTION .text |

163 | |

164 |
;----------------------------------------------------------------------------- |

165 |
; subpel MC functions: |

166 |
; |

167 |
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride, |

168 |
; uint8_t *src, int srcstride, |

169 |
; int height, int mx, int my); |

170 |
;----------------------------------------------------------------------------- |

171 | |

172 |
;-----------------------------------------------------------------------------
; SSSE3 subpel MC: instantiates h6, h4, v4 and v6 filters for one block width.
; %1 = block width (4 or 8); %2/%3 = xmm register counts passed to cglobal.
; Args (see C prototype above): r0=dst, r1=deststride, r2=src, r3=srcstride,
; r4=height, r5=mx, r6=my.  The byte-interleaved coefficient tables
; (sixtap_filter_hb / fourtap_filter_hb) are consumed with pmaddubsw.
;-----------------------------------------------------------------------------
%macro FILTER_SSSE3 3
cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
    lea        r5d, [r5*3]             ; 3 coefficient-pair rows per mx entry
    mova        m3, [filter_h6_shuf2]
    mova        m4, [filter_h6_shuf3]
%ifdef PIC
    lea        r11, [sixtap_filter_hb_m]
%endif
    mova        m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
    mova        m6, [sixtap_filter_hb+r5*8-32]
    mova        m7, [sixtap_filter_hb+r5*8-16]

.nextrow
    movu        m0, [r2-2]             ; 6-tap window starts 2 px left of center
    mova        m1, m0
    mova        m2, m0
%ifidn %1, 4
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
    punpcklbw   m0, [r2+3]
%else
    pshufb      m0, [filter_h6_shuf1]
%endif
    pshufb      m1, m3
    pshufb      m2, m4
    pmaddubsw   m0, m5
    pmaddubsw   m1, m6
    pmaddubsw   m2, m7
    paddsw      m0, m1
    paddsw      m0, m2
    paddsw      m0, [pw_64]            ; round before >>7
    psraw       m0, 7
    packuswb    m0, m0                 ; clip to u8
    movh      [r0], m0                 ; store

    ; go to next line
    add         r0, r1
    add         r2, r3
    dec         r4                     ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
    shl        r5d, 4                  ; 16 bytes of interleaved coeffs per mx
    mova        m2, [pw_64]
    mova        m3, [filter_h2_shuf]
    mova        m4, [filter_h4_shuf]
%ifdef PIC
    lea        r11, [fourtap_filter_hb_m]
%endif
    mova        m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
    mova        m6, [fourtap_filter_hb+r5]

.nextrow
    movu        m0, [r2-1]             ; 4-tap window starts 1 px left of center
    mova        m1, m0
    pshufb      m0, m3
    pshufb      m1, m4
    pmaddubsw   m0, m5
    pmaddubsw   m1, m6
    paddsw      m0, m2                 ; rounding constant (pw_64)
    paddsw      m0, m1
    psraw       m0, 7
    packuswb    m0, m0
    movh      [r0], m0                 ; store

    ; go to next line
    add         r0, r1
    add         r2, r3
    dec         r4                     ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
    shl        r6d, 4                  ; my selects the 4-tap byte coeff pair
%ifdef PIC
    lea        r11, [fourtap_filter_hb_m]
%endif
    mova        m5, [fourtap_filter_hb+r6-16]
    mova        m6, [fourtap_filter_hb+r6]
    mova        m7, [pw_64]

    ; read 3 lines
    sub         r2, r3
    movh        m0, [r2]
    movh        m1, [r2+ r3]
    movh        m2, [r2+2*r3]
    add         r2, r3

.nextrow
    movh        m3, [r2+2*r3]          ; read new row
    mova        m4, m0
    mova        m0, m1                 ; rotate the 3-row history window
    punpcklbw   m4, m1
    mova        m1, m2
    punpcklbw   m2, m3
    pmaddubsw   m4, m5
    pmaddubsw   m2, m6
    paddsw      m4, m2
    mova        m2, m3
    paddsw      m4, m7                 ; round + shift + clip
    psraw       m4, 7
    packuswb    m4, m4
    movh      [r0], m4

    ; go to next line
    add         r0, r1
    add         r2, r3
    dec         r4                     ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
    lea        r6d, [r6*3]
%ifdef PIC
    lea        r11, [sixtap_filter_hb_m]
%endif
    lea         r6, [sixtap_filter_hb+r6*8] ; r6 now points at this my's coeffs

    ; read 5 lines
    sub         r2, r3
    sub         r2, r3
    movh        m0, [r2]
    movh        m1, [r2+r3]
    movh        m2, [r2+r3*2]
    lea         r2, [r2+r3*2]
    add         r2, r3
    movh        m3, [r2]
    movh        m4, [r2+r3]

.nextrow
    movh        m5, [r2+2*r3]          ; read new row
    mova        m6, m0
    punpcklbw   m6, m5                 ; pair outermost rows for coeff row 0
    mova        m0, m1
    punpcklbw   m1, m2
    mova        m7, m3
    punpcklbw   m7, m4
    pmaddubsw   m6, [r6-48]
    pmaddubsw   m1, [r6-32]
    pmaddubsw   m7, [r6-16]
    paddsw      m6, m1
    paddsw      m6, m7
    mova        m1, m2                 ; rotate 5-row history while summing
    paddsw      m6, [pw_64]
    mova        m2, m3
    psraw       m6, 7
    mova        m3, m4
    packuswb    m6, m6
    mova        m4, m5
    movh      [r0], m6

    ; go to next line
    add         r0, r1
    add         r2, r3
    dec         r4                     ; next row
    jg .nextrow
    REP_RET
%endmacro

331 | |

332 |
; instantiate: mmx registers for the 4-wide versions, xmm for the 8-wide ones
INIT_MMX
FILTER_SSSE3 4, 0, 0
INIT_XMM
FILTER_SSSE3 8, 8, 7

336 | |

337 |
; 4x4 block, H-only 4-tap filter
; r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx
; Works on 2 output pixels at a time with word coeffs and pmaddwd.
cglobal put_vp8_epel4_h4_mmxext, 6, 6
    shl       r5d, 4
%ifdef PIC
    lea       r11, [fourtap_filter_hw_m]
%endif
    movq      mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+r5]
    movq      mm7, [pw_64]
    pxor      mm6, mm6                       ; zero reg for byte->word unpack

.nextrow
    movq      mm1, [r2-1]                    ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                       ; byte ABCD..
    punpcklbw mm1, mm6                       ; byte->word ABCD
    pshufw    mm0, mm2, 9                    ; byte CDEF..
    punpcklbw mm0, mm6                       ; byte->word CDEF
    pshufw    mm3, mm1, 0x94                 ; word ABBC
    pshufw    mm1, mm0, 0x94                 ; word CDDE
    pmaddwd   mm3, mm4                       ; multiply 2px with F0/F1
    movq      mm0, mm1                       ; backup for second set of pixels
    pmaddwd   mm1, mm5                       ; multiply 2px with F2/F3
    paddd     mm3, mm1                       ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6                       ; byte->word EFGH
    pmaddwd   mm0, mm4                       ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94                 ; word EFFG
    pmaddwd   mm1, mm5                       ; multiply 2px with F2/F3
    paddd     mm0, mm1                       ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0                       ; merge dword->word (4px)
    paddsw    mm3, mm7                       ; rounding
    psraw     mm3, 7
    packuswb  mm3, mm6                       ; clip and word->bytes
    movd     [r0], mm3                       ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4                            ; next row
    jg .nextrow
    REP_RET

383 | |

384 |
; 4x4 block, H-only 6-tap filter
; r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx
; Same 2-pixels-at-a-time scheme as the 4-tap version, with a third
; coefficient pair (F4/F5) and an extra 4-byte load for the right edge.
cglobal put_vp8_epel4_h6_mmxext, 6, 6
    lea       r5d, [r5*3]
%ifdef PIC
    lea       r11, [sixtap_filter_hw_m]
%endif
    movq      mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+r5*8-32]
    movq      mm6, [sixtap_filter_hw+r5*8-16]
    movq      mm7, [pw_64]
    pxor      mm3, mm3                        ; zero reg for byte->word unpack

.nextrow
    movq      mm1, [r2-2]                     ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                        ; byte ABCD..
    punpcklbw mm1, mm3                        ; byte->word ABCD
    pshufw    mm0, mm2, 0x9                   ; byte CDEF..
    punpckhbw mm2, mm3                        ; byte->word EFGH
    punpcklbw mm0, mm3                        ; byte->word CDEF
    pshufw    mm1, mm1, 0x94                  ; word ABBC
    pshufw    mm2, mm2, 0x94                  ; word EFFG
    pmaddwd   mm1, mm4                        ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94                  ; word CDDE
    movq      mm0, mm3                        ; backup for second set of pixels
    pmaddwd   mm3, mm5                        ; multiply 2px with F2/F3
    paddd     mm1, mm3                        ; add to 1st 2px cache
    movq      mm3, mm2                        ; backup for second set of pixels
    pmaddwd   mm2, mm6                        ; multiply 2px with F4/F5
    paddd     mm1, mm2                        ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [r2+3]                     ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4                        ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5                        ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3                        ; add to 2nd 2px cache
    pxor      mm3, mm3                        ; re-zero (mm3 was used as scratch)
    punpcklbw mm2, mm3                        ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9                  ; word GHHI
    pmaddwd   mm2, mm6                        ; multiply 2px with F4/F5
    paddd     mm0, mm2                        ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0                        ; merge dword->word (4px)
    paddsw    mm1, mm7                        ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3                        ; clip and word->bytes
    movd     [r0], mm1                        ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4                             ; next row
    jg .nextrow
    REP_RET

440 | |

441 |
; 8xN block, H-only 4-tap filter |

442 |
INIT_XMM
; 8-wide block, H-only 4-tap filter (SSE2, word coefficients via pmaddwd)
; r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx
cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
    shl       r5d, 4
%ifdef PIC
    lea       r11, [fourtap_filter_hw_m]
%endif
    mova       m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    mova       m6, [fourtap_filter_hw+r5]
    pxor       m7, m7                        ; zero reg for byte->word unpack

.nextrow
    ; left half: output pixels 0-3
    movh       m0, [r2-1]
    punpcklbw  m0, m7                        ; ABCDEFGH
    mova       m1, m0
    mova       m2, m0
    mova       m3, m0
    psrldq     m1, 2                         ; BCDEFGH
    psrldq     m2, 4                         ; CDEFGH
    psrldq     m3, 6                         ; DEFGH
    punpcklwd  m0, m1                        ; ABBCCDDE
    punpcklwd  m2, m3                        ; CDDEEFFG
    pmaddwd    m0, m5
    pmaddwd    m2, m6
    paddd      m0, m2

    ; right half: output pixels 4-7 (window re-loaded 4 px further right)
    movh       m1, [r2+3]
    punpcklbw  m1, m7                        ; ABCDEFGH
    mova       m2, m1
    mova       m3, m1
    mova       m4, m1
    psrldq     m2, 2                         ; BCDEFGH
    psrldq     m3, 4                         ; CDEFGH
    psrldq     m4, 6                         ; DEFGH
    punpcklwd  m1, m2                        ; ABBCCDDE
    punpcklwd  m3, m4                        ; CDDEEFFG
    pmaddwd    m1, m5
    pmaddwd    m3, m6
    paddd      m1, m3

    packssdw   m0, m1                        ; dwords -> 8 words
    paddsw     m0, [pw_64]                   ; round, >>7, clip
    psraw      m0, 7
    packuswb   m0, m7
    movh     [r0], m0                        ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4                            ; next row
    jg .nextrow
    REP_RET

493 | |

494 |
; 8-wide block, H-only 6-tap filter (SSE2)
; r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx
; r5 is repurposed as the coefficient pointer after indexing the table.
cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
    lea       r5d, [r5*3]
%ifdef PIC
    lea       r11, [sixtap_filter_hw_m]
%endif
    lea        r5, [sixtap_filter_hw+r5*8]   ; r5 -> this mx's 3 coeff rows
    pxor       m7, m7                        ; zero reg for byte->word unpack

.nextrow
    movu       m0, [r2-2]
    mova       m6, m0                        ; keep raw bytes for right half
    mova       m4, m0
    ; left half: output pixels 0-3
    punpcklbw  m0, m7                        ; ABCDEFGHI
    mova       m1, m0
    mova       m2, m0
    mova       m3, m0
    psrldq     m1, 2                         ; BCDEFGH
    psrldq     m2, 4                         ; CDEFGH
    psrldq     m3, 6                         ; DEFGH
    psrldq     m4, 4
    punpcklbw  m4, m7                        ; EFGH
    mova       m5, m4
    psrldq     m5, 2                         ; FGH
    punpcklwd  m0, m1                        ; ABBCCDDE
    punpcklwd  m2, m3                        ; CDDEEFFG
    punpcklwd  m4, m5                        ; EFFGGHHI
    pmaddwd    m0, [r5-48]
    pmaddwd    m2, [r5-32]
    pmaddwd    m4, [r5-16]
    paddd      m0, m2
    paddd      m0, m4

    ; right half: same pattern on the bytes shifted 4 px right
    psrldq     m6, 4
    mova       m4, m6
    punpcklbw  m6, m7                        ; ABCDEFGHI
    mova       m1, m6
    mova       m2, m6
    mova       m3, m6
    psrldq     m1, 2                         ; BCDEFGH
    psrldq     m2, 4                         ; CDEFGH
    psrldq     m3, 6                         ; DEFGH
    psrldq     m4, 4
    punpcklbw  m4, m7                        ; EFGH
    mova       m5, m4
    psrldq     m5, 2                         ; FGH
    punpcklwd  m6, m1                        ; ABBCCDDE
    punpcklwd  m2, m3                        ; CDDEEFFG
    punpcklwd  m4, m5                        ; EFFGGHHI
    pmaddwd    m6, [r5-48]
    pmaddwd    m2, [r5-32]
    pmaddwd    m4, [r5-16]
    paddd      m6, m2
    paddd      m6, m4

    packssdw   m0, m6                        ; dwords -> 8 words
    paddsw     m0, [pw_64]                   ; round, >>7, clip
    psraw      m0, 7
    packuswb   m0, m7
    movh     [r0], m0                        ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4                            ; next row
    jg .nextrow
    REP_RET

560 | |

561 |
;-----------------------------------------------------------------------------
; V-only 4-tap and 6-tap filters, word-coefficient (pmullw) versions.
; %1 = cpu suffix, %2 = block width (4 or 8), %3 = xmm register count.
; r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r6=my.
;-----------------------------------------------------------------------------
%macro FILTER_V 3
; %2-wide block, V-only 4-tap filter
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
    shl       r6d, 5                   ; 2 x 16-byte coeff rows per my entry
%ifdef PIC
    lea       r11, [fourtap_filter_v_m]
%endif
    lea        r6, [fourtap_filter_v+r6-32] ; r6 -> this my's 4 coeff rows
    mova       m6, [pw_64]
    pxor       m7, m7                  ; zero reg for byte->word unpack
    mova       m5, [r6+48]             ; hoist last tap (used every row)

    ; read 3 lines
    sub        r2, r3
    movh       m0, [r2]
    movh       m1, [r2+ r3]
    movh       m2, [r2+2*r3]
    add        r2, r3
    punpcklbw  m0, m7
    punpcklbw  m1, m7
    punpcklbw  m2, m7

.nextrow
    ; first calculate negative taps (to prevent losing positive overflows)
    movh       m4, [r2+2*r3]           ; read new row
    punpcklbw  m4, m7
    mova       m3, m4
    pmullw     m0, [r6+0]
    pmullw     m4, m5
    paddsw     m4, m0

    ; then calculate positive taps
    mova       m0, m1                  ; rotate 3-row history window
    pmullw     m1, [r6+16]
    paddsw     m4, m1
    mova       m1, m2
    pmullw     m2, [r6+32]
    paddsw     m4, m2
    mova       m2, m3

    ; round/clip/store
    paddsw     m4, m6
    psraw      m4, 7
    packuswb   m4, m7
    movh     [r0], m4

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4                      ; next row
    jg .nextrow
    REP_RET


; %2-wide block, V-only 6-tap filter
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
    shl       r6d, 4
    lea        r6, [r6*3]              ; 6 x 16-byte coeff rows per my entry
%ifdef PIC
    lea       r11, [sixtap_filter_v_m]
%endif
    lea        r6, [sixtap_filter_v+r6-96]
    pxor       m7, m7                  ; zero reg for byte->word unpack

    ; read 5 lines
    sub        r2, r3
    sub        r2, r3
    movh       m0, [r2]
    movh       m1, [r2+r3]
    movh       m2, [r2+r3*2]
    lea        r2, [r2+r3*2]
    add        r2, r3
    movh       m3, [r2]
    movh       m4, [r2+r3]
    punpcklbw  m0, m7
    punpcklbw  m1, m7
    punpcklbw  m2, m7
    punpcklbw  m3, m7
    punpcklbw  m4, m7

.nextrow
    ; first calculate negative taps (to prevent losing positive overflows)
    mova       m5, m1
    pmullw     m5, [r6+16]
    mova       m6, m4
    pmullw     m6, [r6+64]
    paddsw     m6, m5

    ; then calculate positive taps
    movh       m5, [r2+2*r3]           ; read new row
    punpcklbw  m5, m7
    pmullw     m0, [r6+0]
    paddsw     m6, m0
    mova       m0, m1                  ; rotate 5-row history window
    mova       m1, m2
    pmullw     m2, [r6+32]
    paddsw     m6, m2
    mova       m2, m3
    pmullw     m3, [r6+48]
    paddsw     m6, m3
    mova       m3, m4
    mova       m4, m5
    pmullw     m5, [r6+80]
    paddsw     m6, m5

    ; round/clip/store
    paddsw     m6, [pw_64]
    psraw      m6, 7
    packuswb   m6, m7
    movh     [r0], m6

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4                      ; next row
    jg .nextrow
    REP_RET
%endmacro

679 | |

680 |
; instantiate: mmx for 4-wide, sse2/xmm for 8-wide
INIT_MMX
FILTER_V mmxext, 4, 0
INIT_XMM
FILTER_V sse2, 8, 8

684 | |

685 |
;-----------------------------------------------------------------------------
; Bilinear MC, word-coefficient versions (pmullw).
; %1 = cpu suffix, %2 = block width, %3 = xmm register count.
; r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx, r6=my.
; Processes 2 output rows per iteration; height is assumed even.
;-----------------------------------------------------------------------------
%macro FILTER_BILINEAR 3
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
    mov       r5d, 8*16
    shl       r6d, 4
    sub       r5d, r6d                 ; weights: (8-my) and my, scaled
%ifdef PIC
    lea       r11, [bilinear_filter_vw_m]
%endif
    pxor       m6, m6                  ; zero reg for unpack / pavgw rounding
    mova       m4, [bilinear_filter_vw+r5-16]
    mova       m5, [bilinear_filter_vw+r6-16]
.nextrow
    movh       m0, [r2+r3*0]
    movh       m1, [r2+r3*1]           ; shared between the two output rows
    movh       m3, [r2+r3*2]
    punpcklbw  m0, m6
    punpcklbw  m1, m6
    punpcklbw  m3, m6
    mova       m2, m1
    pmullw     m0, m4
    pmullw     m1, m5
    pmullw     m2, m4
    pmullw     m3, m5
    paddsw     m0, m1                  ; row0 = (8-my)*a + my*b
    paddsw     m2, m3                  ; row1 = (8-my)*b + my*c
    psraw      m0, 2                   ; >>2 then pavgw zero = rounded >>3
    psraw      m2, 2
    pavgw      m0, m6
    pavgw      m2, m6
%ifidn %1, mmxext
    packuswb   m0, m0
    packuswb   m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb   m0, m2
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea        r0, [r0+r1*2]
    lea        r2, [r2+r3*2]
    sub        r4, 2                   ; 2 rows per iteration
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
    mov       r6d, 8*16
    shl       r5d, 4
    sub       r6d, r5d                 ; weights: (8-mx) and mx, scaled
%ifdef PIC
    lea       r11, [bilinear_filter_vw_m]
%endif
    pxor       m6, m6
    mova       m4, [bilinear_filter_vw+r6-16]
    mova       m5, [bilinear_filter_vw+r5-16]
.nextrow
    movh       m0, [r2+r3*0+0]
    movh       m1, [r2+r3*0+1]         ; same row shifted 1 px right
    movh       m2, [r2+r3*1+0]
    movh       m3, [r2+r3*1+1]
    punpcklbw  m0, m6
    punpcklbw  m1, m6
    punpcklbw  m2, m6
    punpcklbw  m3, m6
    pmullw     m0, m4
    pmullw     m1, m5
    pmullw     m2, m4
    pmullw     m3, m5
    paddsw     m0, m1
    paddsw     m2, m3
    psraw      m0, 2                   ; >>2 then pavgw zero = rounded >>3
    psraw      m2, 2
    pavgw      m0, m6
    pavgw      m2, m6
%ifidn %1, mmxext
    packuswb   m0, m0
    packuswb   m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb   m0, m2
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea        r0, [r0+r1*2]
    lea        r2, [r2+r3*2]
    sub        r4, 2                   ; 2 rows per iteration
    jg .nextrow
    REP_RET
%endmacro

777 | |

778 |
; instantiate: mmx for 4-wide, sse2/xmm for 8-wide
INIT_MMX
FILTER_BILINEAR mmxext, 4, 0
INIT_XMM
FILTER_BILINEAR sse2, 8, 7

782 | |

783 |
;-----------------------------------------------------------------------------
; Bilinear MC, SSSE3 byte-coefficient versions (pmaddubsw on interleaved
; (8-frac, frac) byte pairs from bilinear_filter_vb).
; %1 = block width.  r0=dst, r1=deststride, r2=src, r3=srcstride,
; r4=height (even, 2 rows per iteration), r5=mx, r6=my.
;-----------------------------------------------------------------------------
%macro FILTER_BILINEAR_SSSE3 1
cglobal put_vp8_bilinear%1_v_ssse3, 7,7
    shl       r6d, 4
%ifdef PIC
    lea       r11, [bilinear_filter_vb_m]
%endif
    pxor       m4, m4                  ; zero for pavgw rounding
    mova       m3, [bilinear_filter_vb+r6-16]
.nextrow
    movh       m0, [r2+r3*0]
    movh       m1, [r2+r3*1]           ; middle row feeds both outputs
    movh       m2, [r2+r3*2]
    punpcklbw  m0, m1                  ; interleave rows for pmaddubsw
    punpcklbw  m1, m2
    pmaddubsw  m0, m3
    pmaddubsw  m1, m3
    psraw      m0, 2                   ; >>2 then pavgw zero = rounded >>3
    psraw      m1, 2
    pavgw      m0, m4
    pavgw      m1, m4
%if mmsize==8
    packuswb   m0, m0
    packuswb   m1, m1
    movh [r0+r1*0], m0
    movh [r0+r1*1], m1
%else
    packuswb   m0, m1
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea        r0, [r0+r1*2]
    lea        r2, [r2+r3*2]
    sub        r4, 2                   ; 2 rows per iteration
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h_ssse3, 7,7
    shl       r5d, 4
%ifdef PIC
    lea       r11, [bilinear_filter_vb_m]
%endif
    pxor       m4, m4
    mova       m2, [filter_h2_shuf]    ; pairs px i with px i+1
    mova       m3, [bilinear_filter_vb+r5-16]
.nextrow
    movu       m0, [r2+r3*0]
    movu       m1, [r2+r3*1]
    pshufb     m0, m2
    pshufb     m1, m2
    pmaddubsw  m0, m3
    pmaddubsw  m1, m3
    psraw      m0, 2                   ; >>2 then pavgw zero = rounded >>3
    psraw      m1, 2
    pavgw      m0, m4
    pavgw      m1, m4
%if mmsize==8
    packuswb   m0, m0
    packuswb   m1, m1
    movh [r0+r1*0], m0
    movh [r0+r1*1], m1
%else
    packuswb   m0, m1
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea        r0, [r0+r1*2]
    lea        r2, [r2+r3*2]
    sub        r4, 2                   ; 2 rows per iteration
    jg .nextrow
    REP_RET
%endmacro

856 | |

857 |
; instantiate: mmx for 4-wide, xmm for 8-wide
INIT_MMX
FILTER_BILINEAR_SSSE3 4
INIT_XMM
FILTER_BILINEAR_SSSE3 8

861 | |

862 |
; straight 8-wide copy, 2 rows per iteration
; r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height (even)
cglobal put_vp8_pixels8_mmx, 5,5
.nextrow:
    movq  mm0, [r2+r3*0]
    movq  mm1, [r2+r3*1]
    lea    r2, [r2+r3*2]
    movq  [r0+r1*0], mm0
    movq  [r0+r1*1], mm1
    lea    r0, [r0+r1*2]
    sub   r4d, 2
    jg .nextrow
    REP_RET

873 | |

874 |
; straight 16-wide copy via two 8-byte mmx moves per row, 2 rows per iteration
; r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height (even)
cglobal put_vp8_pixels16_mmx, 5,5
.nextrow:
    movq  mm0, [r2+r3*0+0]
    movq  mm1, [r2+r3*0+8]
    movq  mm2, [r2+r3*1+0]
    movq  mm3, [r2+r3*1+8]
    lea    r2, [r2+r3*2]
    movq  [r0+r1*0+0], mm0
    movq  [r0+r1*0+8], mm1
    movq  [r0+r1*1+0], mm2
    movq  [r0+r1*1+8], mm3
    lea    r0, [r0+r1*2]
    sub   r4d, 2
    jg .nextrow
    REP_RET

889 | |

890 |
; straight 16-wide copy, 2 rows per iteration
; unaligned loads (src may be unaligned) / aligned stores (dst assumed aligned)
; r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height (even)
cglobal put_vp8_pixels16_sse, 5,5,2
.nextrow:
    movups xmm0, [r2+r3*0]
    movups xmm1, [r2+r3*1]
    lea      r2, [r2+r3*2]
    movaps [r0+r1*0], xmm0
    movaps [r0+r1*1], xmm1
    lea      r0, [r0+r1*2]
    sub     r4d, 2
    jg .nextrow
    REP_RET

901 | |

902 |
;----------------------------------------------------------------------------- |

903 |
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |

904 |
;----------------------------------------------------------------------------- |

905 | |

906 |
; Add a (possibly negative) DC value to 4 rows of pixels.
; %1 = reg with +DC bytes, %2 = reg with -DC bytes (unsigned saturating
; add-then-sub implements signed DC add with clipping), %3 = byte offset,
; %4 = load/store op (movh or mova).  Uses r0 / r1 = r0+2*stride, r2 = stride.
%macro ADD_DC 4
    %4        m2, [r0+%3]
    %4        m3, [r0+r2+%3]
    %4        m4, [r1+%3]
    %4        m5, [r1+r2+%3]
    paddusb   m2, %1                   ; += max(DC, 0), saturating
    paddusb   m3, %1
    paddusb   m4, %1
    paddusb   m5, %1
    psubusb   m2, %2                   ; -= max(-DC, 0), saturating
    psubusb   m3, %2
    psubusb   m4, %2
    psubusb   m5, %2
    %4 [r0+%3],    m2
    %4 [r0+r2+%3], m3
    %4 [r1+%3],    m4
    %4 [r1+r2+%3], m5
%endmacro

924 | |

925 |
INIT_MMX
; DC-only 4x4 IDCT + add: r0=dst, r1=block, r2=stride
cglobal vp8_idct_dc_add_mmx, 3, 3
    ; load data
    movd       m0, [r1]

    ; calculate DC: (dc + 4) >> 3, then broadcast +DC / -DC as bytes
    paddw      m0, [pw_4]
    pxor       m1, m1
    psraw      m0, 3
    movd     [r1], m1                  ; clear the DC coeff in the block
    psubw      m1, m0                  ; m1 = -DC
    packuswb   m0, m0                  ; clamp to u8 (negative DC -> 0 here,
    packuswb   m1, m1                  ;  handled by the -DC counterpart)
    punpcklbw  m0, m0                  ; broadcast byte across 4 px
    punpcklbw  m1, m1
    punpcklwd  m0, m0
    punpcklwd  m1, m1

    ; add DC
    lea        r1, [r0+r2*2]           ; r1 reused as dst+2*stride
    ADD_DC     m0, m1, 0, movh
    RET

947 | |

948 |
INIT_XMM
; DC-only 4x4 IDCT + add, SSE4 (pextrd stores): r0=dst, r1=block, r2=stride
; Works in words, so signed DC needs no +/- byte trick as in the mmx version.
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
    ; load data
    movd        m0, [r1]
    pxor        m1, m1

    ; calculate DC: (dc + 4) >> 3, broadcast as words
    paddw       m0, [pw_4]
    movd      [r1], m1                 ; clear the DC coeff in the block
    lea         r1, [r0+r2*2]          ; r1 reused as dst+2*stride
    movd        m2, [r0]
    movd        m3, [r0+r2]
    movd        m4, [r1]
    movd        m5, [r1+r2]
    psraw       m0, 3
    pshuflw     m0, m0, 0
    punpcklqdq  m0, m0
    punpckldq   m2, m3                 ; gather 4 dst rows into two regs
    punpckldq   m4, m5
    punpcklbw   m2, m1                 ; bytes -> words
    punpcklbw   m4, m1
    paddw       m2, m0                 ; add DC in word domain
    paddw       m4, m0
    packuswb    m2, m4                 ; clip back to u8
    movd      [r0], m2
    pextrd [r0+r2], m2, 1
    pextrd    [r1], m2, 2
    pextrd [r1+r2], m2, 3
    RET

977 | |

978 |
;----------------------------------------------------------------------------- |

979 |
; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride); |

980 |
;----------------------------------------------------------------------------- |

981 | |

982 |
INIT_MMX
; DC-only IDCT + add for 4 horizontally adjacent luma 4x4 blocks
; r0=dst, r1=block[4][16], r2=stride
cglobal vp8_idct_dc_add4y_mmx, 3, 3
    ; load data: one DC per block (blocks are 32 bytes apart)
    movd       m0, [r1+32*0]           ; A
    movd       m1, [r1+32*2]           ; C
    punpcklwd  m0, [r1+32*1]           ; A B
    punpcklwd  m1, [r1+32*3]           ; C D
    punpckldq  m0, m1                  ; A B C D
    pxor       m6, m6

    ; calculate DC: (dc + 4) >> 3 for all four, build +DC / -DC bytes
    paddw      m0, [pw_4]
    movd [r1+32*0], m6                 ; clear all four DC coeffs
    movd [r1+32*1], m6
    movd [r1+32*2], m6
    movd [r1+32*3], m6
    psraw      m0, 3
    psubw      m6, m0                  ; m6 = -DC
    packuswb   m0, m0
    packuswb   m6, m6
    punpcklbw  m0, m0                  ; AABBCCDD
    punpcklbw  m6, m6                  ; AABBCCDD
    movq       m1, m0
    movq       m7, m6
    punpcklbw  m0, m0                  ; AAAABBBB
    punpckhbw  m1, m1                  ; CCCCDDDD
    punpcklbw  m6, m6                  ; AAAABBBB
    punpckhbw  m7, m7                  ; CCCCDDDD

    ; add DC: left 8 px (blocks A,B) then right 8 px (blocks C,D)
    lea        r1, [r0+r2*2]           ; r1 reused as dst+2*stride
    ADD_DC     m0, m6, 0, mova
    ADD_DC     m1, m7, 8, mova
    RET

1016 | |

1017 |
INIT_XMM
; DC-only IDCT + add for 4 horizontally adjacent luma 4x4 blocks, SSE2:
; all 16 px per row fit one xmm reg, so a single ADD_DC pass suffices.
; r0=dst, r1=block[4][16], r2=stride
cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
    ; load data: one DC per block (blocks are 32 bytes apart)
    movd       m0, [r1+32*0]           ; A
    movd       m1, [r1+32*2]           ; C
    punpcklwd  m0, [r1+32*1]           ; A B
    punpcklwd  m1, [r1+32*3]           ; C D
    punpckldq  m0, m1                  ; A B C D
    pxor       m1, m1

    ; calculate DC: (dc + 4) >> 3, build +DC / -DC bytes (4 bytes per block)
    paddw      m0, [pw_4]
    movd [r1+32*0], m1                 ; clear all four DC coeffs
    movd [r1+32*1], m1
    movd [r1+32*2], m1
    movd [r1+32*3], m1
    psraw      m0, 3
    psubw      m1, m0                  ; m1 = -DC
    packuswb   m0, m0
    packuswb   m1, m1
    punpcklbw  m0, m0                  ; AABBCCDD
    punpcklbw  m1, m1
    punpcklbw  m0, m0                  ; AAAABBBBCCCCDDDD
    punpcklbw  m1, m1

    ; add DC
    lea        r1, [r0+r2*2]           ; r1 reused as dst+2*stride
    ADD_DC     m0, m1, 0, mova
    RET

1046 | |

1047 |
;----------------------------------------------------------------------------- |

1048 |
; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride); |

1049 |
;----------------------------------------------------------------------------- |

1050 | |

1051 |
INIT_MMX
; DC-only IDCT + add for the 4 chroma 4x4 blocks arranged 2x2:
; blocks A,B cover the top 8x4, C,D the bottom 8x4 (hence the row advance
; between the two ADD_DC passes, unlike the 4y version's column offset).
; r0=dst, r1=block[4][16], r2=stride
cglobal vp8_idct_dc_add4uv_mmx, 3, 3
    ; load data: one DC per block (blocks are 32 bytes apart)
    movd       m0, [r1+32*0]           ; A
    movd       m1, [r1+32*2]           ; C
    punpcklwd  m0, [r1+32*1]           ; A B
    punpcklwd  m1, [r1+32*3]           ; C D
    punpckldq  m0, m1                  ; A B C D
    pxor       m6, m6

    ; calculate DC: (dc + 4) >> 3, build +DC / -DC bytes
    paddw      m0, [pw_4]
    movd [r1+32*0], m6                 ; clear all four DC coeffs
    movd [r1+32*1], m6
    movd [r1+32*2], m6
    movd [r1+32*3], m6
    psraw      m0, 3
    psubw      m6, m0                  ; m6 = -DC
    packuswb   m0, m0
    packuswb   m6, m6
    punpcklbw  m0, m0                  ; AABBCCDD
    punpcklbw  m6, m6                  ; AABBCCDD
    movq       m1, m0
    movq       m7, m6
    punpcklbw  m0, m0                  ; AAAABBBB
    punpckhbw  m1, m1                  ; CCCCDDDD
    punpcklbw  m6, m6                  ; AAAABBBB
    punpckhbw  m7, m7                  ; CCCCDDDD

    ; add DC: top 8x4 (A,B), then advance 4 rows for the bottom 8x4 (C,D)
    lea        r1, [r0+r2*2]           ; r1 reused as dst+2*stride
    ADD_DC     m0, m6, 0, mova
    lea        r0, [r0+r2*4]
    lea        r1, [r1+r2*4]
    ADD_DC     m1, m7, 0, mova
    RET

1087 | |

1088 |
;----------------------------------------------------------------------------- |

1089 |
; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |

1090 |
;----------------------------------------------------------------------------- |

1091 | |

1092 |
; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2) |

1093 |
; this macro assumes that m6/m7 have words for 20091/17734 loaded |

1094 |
; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
; this macro assumes that m6/m7 have words for 20091/17734 loaded
; (pmulhw x 20091 plus the original value gives x*35468>>16 == x*20091/32768
;  style fixed-point; the paddw %1,%1 doubling before pmulhw by 17734
;  realizes the 35468 multiplier).  %3/%4 are scratch regs.
%macro VP8_MULTIPLY_SUMSUB 4
    mova    %3, %1
    mova    %4, %2
    pmulhw  %3, m6                     ;20091(1)
    pmulhw  %4, m6                     ;20091(2)
    paddw   %3, %1
    paddw   %4, %2
    paddw   %1, %1                     ; double before the 17734 pmulhw
    paddw   %2, %2
    pmulhw  %1, m7                     ;35468(1)
    pmulhw  %2, m7                     ;35468(2)
    psubw   %1, %4
    paddw   %2, %3
%endmacro

1108 | |

1109 |
; calculate x0=%1+%3; x1=%1-%3 |

1110 |
; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4) |

1111 |
; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3) |

1112 |
; %5/%6 are temporary registers |

1113 |
; we assume m6/m7 have constant words 20091/17734 loaded in them |

1114 |
; One 1-D pass of the VP8 4x4 IDCT over registers m%1..m%4.
; calculate x0=%1+%3; x1=%1-%3
; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
; %5/%6 are temporary registers; we assume m6/m7 have constant words
; 20091/17734 loaded in them.  The trailing SWAPs restore output order.
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA            m%3, m%1, m%5      ;t0, t1
    VP8_MULTIPLY_SUMSUB  m%2, m%4, m%5,m%6  ;t2, t3
    SUMSUB_BA            m%4, m%3, m%5      ;tmp0, tmp3
    SUMSUB_BA            m%2, m%1, m%5      ;tmp1, tmp2
    SWAP                 %4, %1
    SWAP                 %4, %3
%endmacro

1122 | |

1123 |
INIT_MMX
; Full 4x4 IDCT + add: r0=dst, r1=block, r2=stride.
; %1 = cpu suffix; the sse variant only differs in how the coefficient
; block is cleared (two 16-byte xmm stores vs four 8-byte mmx stores) —
; the transform itself always runs in mmx registers.
%macro VP8_IDCT_ADD 1
cglobal vp8_idct_add_%1, 3, 3
    ; load block data
    movq         m0, [r1+ 0]
    movq         m1, [r1+ 8]
    movq         m2, [r1+16]
    movq         m3, [r1+24]
    movq         m6, [pw_20091]        ; fixed-point consts for the odd taps
    movq         m7, [pw_17734]
%ifidn %1, sse
    xorps      xmm0, xmm0              ; clear the block with 2 wide stores
    movaps [r1+ 0], xmm0
    movaps [r1+16], xmm0
%else
    pxor         m4, m4                ; clear the block with 4 mmx stores
    movq   [r1+ 0], m4
    movq   [r1+ 8], m4
    movq   [r1+16], m4
    movq   [r1+24], m4
%endif

    ; actual IDCT: column pass, transpose, round (+4), row pass, transpose
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4
    paddw        m0, [pw_4]            ; rounding for >>3 in STORE_DIFFx2
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4

    ; store
    pxor         m4, m4
    lea          r1, [r0+2*r2]         ; r1 reused as dst+2*stride
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2

    RET
%endmacro

1160 | |

1161 |
VP8_IDCT_ADD mmx |

1162 |
VP8_IDCT_ADD sse |

1163 | |

1164 |
;----------------------------------------------------------------------------- |

1165 |
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) |

1166 |
;----------------------------------------------------------------------------- |

1167 | |

1168 |
; scatter 8 words into the DC slot (first coefficient) of consecutive 4x4
; coefficient blocks, 16 DCTELEMs (2*16 bytes) apart: the 4 low words of
; m%1 go to blocks %3+0/4/8/12, the 4 low words of m%2 to blocks %3+1/5/9/13.
; %3 = column offset (0 or 2); clobbers r1d/r2d and shifts m%1/m%2.
%macro SCATTER_WHT 3
    movd  r1d, m%1
    movd  r2d, m%2
    mov [r0+2*16*(0+%3)], r1w
    mov [r0+2*16*(1+%3)], r2w
    shr   r1d, 16
    shr   r2d, 16
    psrlq m%1, 32                    ; advance to the next pair of words
    psrlq m%2, 32
    mov [r0+2*16*(4+%3)], r1w
    mov [r0+2*16*(5+%3)], r2w
    movd  r1d, m%1
    movd  r2d, m%2
    mov [r0+2*16*(8+%3)], r1w
    mov [r0+2*16*(9+%3)], r2w
    shr   r1d, 16
    shr   r2d, 16
    mov [r0+2*16*(12+%3)], r1w
    mov [r0+2*16*(13+%3)], r2w
%endmacro

1188 | |

1189 |
; one 1D pass of a 4-point Hadamard transform on 4 mm registers (in-place);
; the final SWAP restores natural output ordering in %1..%4
%macro HADAMARD4_1D 4
    SUMSUB_BADC m%2, m%1, m%4, m%3
    SUMSUB_BADC m%4, m%2, m%3, m%1
    SWAP %1, %4, %3
%endmacro

1194 | |

1195 |
INIT_MMX
; void vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16])
; inverse Walsh-Hadamard transform of the 4x4 luma DC coefficients in 'dc',
; then scatter the (rounded, >>3) results into the DC slot of each of the
; 16 luma coefficient blocks.
cglobal vp8_luma_dc_wht_mmx, 2,3
    movq          m0, [r1]
    movq          m1, [r1+8]
    movq          m2, [r1+16]
    movq          m3, [r1+24]
    HADAMARD4_1D  0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw         m0, [pw_3]        ; rounding bias for the final >>3
    HADAMARD4_1D  0, 1, 2, 3
    psraw         m0, 3
    psraw         m1, 3
    psraw         m2, 3
    psraw         m3, 3
    SCATTER_WHT   0, 1, 0           ; columns 0/1 of each block row
    SCATTER_WHT   2, 3, 2           ; columns 2/3
    RET

1212 | |

1213 |
;----------------------------------------------------------------------------- |

1214 |
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim); |

1215 |
;----------------------------------------------------------------------------- |

1216 | |

1217 |
; macro called with 7 mm register indexes as argument, and 4 regular registers
;
; first 4 mm registers will carry the transposed pixel data
; the other three are scratchspace (one would be sufficient, but this allows
; for more spreading/pipelining and thus faster execution on OOE CPUs)
;
; first two regular registers are buf+4*stride and buf+5*stride
; third is -stride, fourth is +stride
%macro READ_8x4_INTERLEAVED 11
    ; interleave 8 (A-H) rows of 4 pixels each
    movd      m%1, [%8+%10*4]       ; A0-3
    movd      m%5, [%9+%10*4]       ; B0-3
    movd      m%2, [%8+%10*2]       ; C0-3
    movd      m%6, [%8+%10]         ; D0-3
    movd      m%3, [%8]             ; E0-3
    movd      m%7, [%9]             ; F0-3
    movd      m%4, [%9+%11]         ; G0-3
    punpcklbw m%1, m%5              ; A/B interleaved
    movd      m%5, [%9+%11*2]       ; H0-3
    punpcklbw m%2, m%6              ; C/D interleaved
    punpcklbw m%3, m%7              ; E/F interleaved
    punpcklbw m%4, m%5              ; G/H interleaved
%endmacro

1240 | |

1241 |
; macro called with 7 mm register indexes as argument, and 5 regular registers
; first 11 mean the same as READ_8x4_INTERLEAVED above
; fifth regular register is scratchspace to reach the bottom 8 rows, it
; will be set to second regular register + 8*stride at the end
%macro READ_16x4_INTERLEAVED 12
    ; transpose 16 (A-P) rows of 4 pixels each
    lea       %12, [r0+8*r2]

    ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
    movd      m%1, [%8+%10*4]       ; A0-3
    movd      m%3, [%12+%10*4]      ; I0-3
    movd      m%2, [%8+%10*2]       ; C0-3
    movd      m%4, [%12+%10*2]      ; K0-3
    movd      m%6, [%8+%10]         ; D0-3
    movd      m%5, [%12+%10]        ; L0-3
    movd      m%7, [%12]            ; M0-3
    add       %12, %11              ; %12 now reaches rows J/N/O/P
    punpcklbw m%1, m%3              ; A/I
    movd      m%3, [%8]             ; E0-3
    punpcklbw m%2, m%4              ; C/K
    punpcklbw m%6, m%5              ; D/L
    punpcklbw m%3, m%7              ; E/M
    punpcklbw m%2, m%6              ; C/D/K/L interleaved

    ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
    movd      m%5, [%9+%10*4]       ; B0-3
    movd      m%4, [%12+%10*4]      ; J0-3
    movd      m%7, [%9]             ; F0-3
    movd      m%6, [%12]            ; N0-3
    punpcklbw m%5, m%4              ; B/J
    punpcklbw m%7, m%6              ; F/N
    punpcklbw m%1, m%5              ; A/B/I/J interleaved
    punpcklbw m%3, m%7              ; E/F/M/N interleaved
    movd      m%4, [%9+%11]         ; G0-3
    movd      m%6, [%12+%11]        ; O0-3
    movd      m%5, [%9+%11*2]       ; H0-3
    movd      m%7, [%12+%11*2]      ; P0-3
    punpcklbw m%4, m%6              ; G/O
    punpcklbw m%5, m%7              ; H/P
    punpcklbw m%4, m%5              ; G/H/O/P interleaved
%endmacro

1282 | |

1283 |
; write 4 mm registers of 2 dwords each
; first four arguments are mm register indexes containing source data
; last four are registers containing buf+4*stride, buf+5*stride,
; -stride and +stride
%macro WRITE_4x2D 8
    ; write out (2 dwords per register)
    movd      [%5+%7*4], m%1
    movd      [%5+%7*2], m%2
    movd      [%5],      m%3
    movd      [%6+%8],   m%4
    punpckhdq m%1, m%1              ; bring high dword into low half
    punpckhdq m%2, m%2
    punpckhdq m%3, m%3
    punpckhdq m%4, m%4
    movd      [%6+%7*4], m%1
    movd      [%5+%7],   m%2
    movd      [%6],      m%3
    movd      [%6+%8*2], m%4
%endmacro

1302 | |

1303 |
; write 4 xmm registers of 4 dwords each
; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
; we add 1*stride to the third regular registry in the process
; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
; same memory region), or 8 if they cover two separate buffers (third one points to
; a different memory region than the first two), allowing for more optimal code for
; the 16-width case
%macro WRITE_4x4D 10
    ; write out (4 dwords per register), start with dwords zero
    movd      [%5+%8*4], m%1
    movd      [%5],      m%2
    movd      [%7+%8*4], m%3
    movd      [%7],      m%4

    ; store dwords 1
    psrldq    m%1, 4
    psrldq    m%2, 4
    psrldq    m%3, 4
    psrldq    m%4, 4
    movd      [%6+%8*4], m%1
    movd      [%6],      m%2
%if %10 == 16
    movd      [%6+%9*4], m%3
%endif
    movd      [%7+%9],   m%4

    ; write dwords 2
    psrldq    m%1, 4
    psrldq    m%2, 4
%if %10 == 8
    movd      [%5+%8*2], m%1
    movd      %5, m%3               ; stash dword 2 of m%3 in a GPR; %5 is
                                    ; no longer needed as a pointer here
%endif
    psrldq    m%3, 4
    psrldq    m%4, 4
%if %10 == 16
    movd      [%5+%8*2], m%1
%endif
    movd      [%6+%9],   m%2
    movd      [%7+%8*2], m%3
    movd      [%7+%9*2], m%4
    add       %7, %9

    ; store dwords 3
    psrldq    m%1, 4
    psrldq    m%2, 4
    psrldq    m%3, 4
    psrldq    m%4, 4
%if %10 == 8
    mov       [%7+%8*4], %5d        ; write the dword stashed above
    movd      [%6+%8*2], m%1
%else
    movd      [%5+%8],   m%1
%endif
    movd      [%6+%9*2], m%2
    movd      [%7+%8*2], m%3
    movd      [%7+%9*2], m%4
%endmacro

1362 | |

1363 |
; splat the low byte of GPR %2 into every byte of mm/xmm register %1.
; %3 = optimization name (mmx/mmxext/sse2/ssse3); %4 (ssse3 only) is a
; register holding all-zero bytes, used as the pshufb shuffle mask.
%macro SPLATB_REG 3-4
    movd           %1, %2
%ifidn %3, ssse3
    pshufb         %1, %4           ; broadcast byte 0 to all lanes
%else
    punpcklbw      %1, %1
%if mmsize == 16 ; sse2
    pshuflw        %1, %1, 0x0
    punpcklqdq     %1, %1
%elifidn %3, mmx
    punpcklwd      %1, %1
    punpckldq      %1, %1
%else ; mmxext
    pshufw         %1, %1, 0x0
%endif
%endif
%endmacro

1380 | |

1381 |
; vp8_<%2>_loop_filter_simple_<%1>(uint8_t *dst, int stride, int flim)
; %1 = opt suffix, %2 = direction (v/h), %3 = number of GPRs used.
; filters one 16-pixel edge; mmx/mmxext process it as two passes of 8.
; h variants transpose 4 columns around the edge into rows, filter, and
; transpose back, using aligned stack space to hold p1/q1.
%macro SIMPLE_LOOPFILTER 3
cglobal vp8_%2_loop_filter_simple_%1, 3, %3
%ifidn %2, h
    mov r5, rsp ; backup stack pointer
    and rsp, ~(mmsize-1) ; align stack
%endif
%if mmsize == 8 ; mmx/mmxext
    mov r3, 2 ; loop counter: 2 passes of 8 pixels
%endif
%ifidn %1, ssse3
    pxor m0, m0 ; zero mask for SPLATB_REG's pshufb
%endif
    SPLATB_REG m7, r2, %1, m0 ; splat "flim" into register

    ; set up indexes to address 4 rows
    mov r2, r1 ; r2 = +stride
    neg r1 ; r1 = -stride
%ifidn %2, h
    lea r0, [r0+4*r2-2] ; point 2 px left of the edge, 4 rows down
    sub rsp, mmsize*2 ; (aligned) storage space for saving p1/q1
%endif

%if mmsize == 8 ; mmx / mmxext
.next8px
%endif
%ifidn %2, v
    ; read 4 half/full rows of pixels
    mova m0, [r0+r1*2] ; p1
    mova m1, [r0+r1] ; p0
    mova m2, [r0] ; q0
    mova m3, [r0+r2] ; q1
%else ; h
    lea r4, [r0+r2]

%if mmsize == 8 ; mmx/mmxext
    READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2
%else ; sse2
    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
%endif
    TRANSPOSE4x4W 0, 1, 2, 3, 4

    mova [rsp], m0 ; store p1
    mova [rsp+mmsize], m3 ; store q1
%endif

    ; simple_limit
    mova m5, m2 ; m5=backup of q0
    mova m6, m1 ; m6=backup of p0
    psubusb m1, m2 ; p0-q0
    psubusb m2, m6 ; q0-p0
    por m1, m2 ; FFABS(p0-q0)
    paddusb m1, m1 ; m1=FFABS(p0-q0)*2

    mova m4, m3
    mova m2, m0
    psubusb m3, m0 ; q1-p1
    psubusb m0, m4 ; p1-q1
    por m3, m0 ; FFABS(p1-q1)
    mova m0, [pb_80]
    pxor m2, m0 ; bias to signed range
    pxor m4, m0
    psubsb m2, m4 ; m2=p1-q1 (signed) backup for below
    pand m3, [pb_FE]
    psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed
    paddusb m3, m1
    psubusb m3, m7
    pxor m1, m1
    pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)

    ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
    mova m4, m5
    pxor m5, m0
    pxor m0, m6
    psubsb m5, m0 ; q0-p0 (signed)
    paddsb m2, m5
    paddsb m2, m5
    paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0)
    pand m2, m3 ; apply filter mask (m3)

    mova m3, [pb_F8]
    mova m1, m2
    paddsb m2, [pb_4] ; f1<<3=a+4
    paddsb m1, [pb_3] ; f2<<3=a+3
    pand m2, m3
    pand m1, m3 ; cache f2<<3

    ; apply f1 to q0; negative values are handled via a separate
    ; negated copy since psrlq is an unsigned (logical) shift
    pxor m0, m0
    pxor m3, m3
    pcmpgtb m0, m2 ; which values are <0?
    psubb m3, m2 ; -f1<<3
    psrlq m2, 3 ; +f1
    psrlq m3, 3 ; -f1
    pand m3, m0
    pandn m0, m2
    psubusb m4, m0
    paddusb m4, m3 ; q0-f1

    ; same scheme for f2 applied to p0
    pxor m0, m0
    pxor m3, m3
    pcmpgtb m0, m1 ; which values are <0?
    psubb m3, m1 ; -f2<<3
    psrlq m1, 3 ; +f2
    psrlq m3, 3 ; -f2
    pand m3, m0
    pandn m0, m1
    paddusb m6, m0
    psubusb m6, m3 ; p0+f2

    ; store
%ifidn %2, v
    mova [r0], m4
    mova [r0+r1], m6
%else ; h
    mova m0, [rsp] ; p1
    SWAP 2, 4 ; p0
    SWAP 1, 6 ; q0
    mova m3, [rsp+mmsize] ; q1

    TRANSPOSE4x4B 0, 1, 2, 3, 4
%if mmsize == 16 ; sse2
    add r3, r1 ; change from r4*8*stride to r0+8*stride
    WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2, 16
%else ; mmx/mmxext
    WRITE_4x2D 0, 1, 2, 3, r0, r4, r1, r2
%endif
%endif

%if mmsize == 8 ; mmx/mmxext
    ; next 8 pixels
%ifidn %2, v
    add r0, 8 ; advance 8 cols = pixels
%else ; h
    lea r0, [r0+r2*8] ; advance 8 rows = lines
%endif
    dec r3
    jg .next8px
%ifidn %2, v
    REP_RET
%else ; h
    mov rsp, r5 ; restore stack pointer
    RET
%endif
%else ; sse2
%ifidn %2, h
    mov rsp, r5 ; restore stack pointer
%endif
    RET
%endif
%endmacro

1530 | |

1531 |
; instantiate the simple loop filter for each ISA/direction;
; third argument is the number of GPRs each variant needs
INIT_MMX
SIMPLE_LOOPFILTER mmx, v, 4
SIMPLE_LOOPFILTER mmx, h, 6
SIMPLE_LOOPFILTER mmxext, v, 4
SIMPLE_LOOPFILTER mmxext, h, 6
INIT_XMM
SIMPLE_LOOPFILTER sse2, v, 3
SIMPLE_LOOPFILTER sse2, h, 6
SIMPLE_LOOPFILTER ssse3, v, 3
SIMPLE_LOOPFILTER ssse3, h, 6

1541 | |

1542 |
;----------------------------------------------------------------------------- |

1543 |
; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride, |

1544 |
; int flimE, int flimI, int hev_thr); |

1545 |
;----------------------------------------------------------------------------- |

1546 | |

1547 |
; VP8 "inner" (subblock-edge) loop filter.
; %1 = opt suffix, %2 = direction (v/h), %3 = number of GPRs,
; %4 = width (16 = luma, 8 = chroma U+V pair), %5 = number of XMM registers
;      used (x264asm cglobal argument; relevant for xmm clobbers)
; On x86-64 sse2 (m8 defined), per-call constants live in xmm9-12; otherwise
; they are spilled to aligned stack slots (flim_E/flim_I/hev_thr/...).
%macro INNER_LOOPFILTER 5
%if %4 == 8 ; chroma
cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5
%define dst8_reg r1 ; second (V) plane pointer
%define mstride_reg r2
%define E_reg r3
%define I_reg r4
%define hev_thr_reg r5
%else ; luma
cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
%define mstride_reg r1
%define E_reg r2
%define I_reg r3
%define hev_thr_reg r4
%ifdef m8 ; x86-64, sse2
%define dst8_reg r4
%elif mmsize == 16 ; x86-32, sse2
%define dst8_reg r5
%else ; x86-32, mmx/mmxext
%define cnt_reg r5
%endif
%endif
%define dst_reg r0
%define stride_reg E_reg
%define dst2_reg I_reg
%ifndef m8
%define stack_reg hev_thr_reg
%endif

%ifidn %1, ssse3
    pxor m7, m7 ; zero mask for SPLATB_REG's pshufb
%endif

%ifndef m8 ; mmx/mmxext or sse2 on x86-32
    ; splat function arguments
    SPLATB_REG m0, E_reg, %1, m7 ; E
    SPLATB_REG m1, I_reg, %1, m7 ; I
    SPLATB_REG m2, hev_thr_reg, %1, m7 ; hev_thresh

    ; align stack
    mov stack_reg, rsp ; backup stack pointer
    and rsp, ~(mmsize-1) ; align stack
%ifidn %2, v
    sub rsp, mmsize * 4 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
                        ; [3]=hev() result
%else ; h
    sub rsp, mmsize * 5 ; extra storage space for transposes
%endif

%define flim_E [rsp]
%define flim_I [rsp+mmsize]
%define hev_thr [rsp+mmsize*2]
%define mask_res [rsp+mmsize*3]
%define p0backup [rsp+mmsize*3]
%define q0backup [rsp+mmsize*4]

    mova flim_E, m0
    mova flim_I, m1
    mova hev_thr, m2

%else ; sse2 on x86-64

%define flim_E m9
%define flim_I m10
%define hev_thr m11
%define mask_res m12
%define p0backup m12
%define q0backup m8

    ; splat function arguments
    SPLATB_REG flim_E, E_reg, %1, m7 ; E
    SPLATB_REG flim_I, I_reg, %1, m7 ; I
    SPLATB_REG hev_thr, hev_thr_reg, %1, m7 ; hev_thresh
%endif

%if mmsize == 8 && %4 == 16 ; mmx/mmxext
    mov cnt_reg, 2 ; luma needs two passes of 8px with mm registers
%endif
    mov stride_reg, mstride_reg
    neg mstride_reg
%ifidn %2, h
    lea dst_reg, [dst_reg + stride_reg*4-4]
%if %4 == 8
    lea dst8_reg, [dst8_reg+ stride_reg*4-4]
%endif
%endif

%if mmsize == 8
.next8px
%endif
    ; read
    lea dst2_reg, [dst_reg + stride_reg]
%ifidn %2, v
%if %4 == 8 && mmsize == 16
%define movrow movh ; chroma/sse2: 8px rows, low half only
%else
%define movrow mova
%endif
    movrow m0, [dst_reg +mstride_reg*4] ; p3
    movrow m1, [dst2_reg+mstride_reg*4] ; p2
    movrow m2, [dst_reg +mstride_reg*2] ; p1
    movrow m5, [dst2_reg] ; q1
    movrow m6, [dst2_reg+ stride_reg] ; q2
    movrow m7, [dst2_reg+ stride_reg*2] ; q3
%if mmsize == 16 && %4 == 8
    ; load the V-plane rows into the high halves
    movhps m0, [dst8_reg+mstride_reg*4]
    movhps m2, [dst8_reg+mstride_reg*2]
    add dst8_reg, stride_reg
    movhps m1, [dst8_reg+mstride_reg*4]
    movhps m5, [dst8_reg]
    movhps m6, [dst8_reg+ stride_reg]
    movhps m7, [dst8_reg+ stride_reg*2]
    add dst8_reg, mstride_reg
%endif
%elif mmsize == 8 ; mmx/mmxext (h)
    ; read 8 rows of 8px each
    movu m0, [dst_reg +mstride_reg*4]
    movu m1, [dst2_reg+mstride_reg*4]
    movu m2, [dst_reg +mstride_reg*2]
    movu m3, [dst_reg +mstride_reg]
    movu m4, [dst_reg]
    movu m5, [dst2_reg]
    movu m6, [dst2_reg+ stride_reg]

    ; 8x8 transpose
    TRANSPOSE4x4B 0, 1, 2, 3, 7
    mova q0backup, m1
    movu m7, [dst2_reg+ stride_reg*2]
    TRANSPOSE4x4B 4, 5, 6, 7, 1
    SBUTTERFLY dq, 0, 4, 1 ; p3/p2
    SBUTTERFLY dq, 2, 6, 1 ; q0/q1
    SBUTTERFLY dq, 3, 7, 1 ; q2/q3
    mova m1, q0backup
    mova q0backup, m2 ; store q0
    SBUTTERFLY dq, 1, 5, 2 ; p1/p0
    mova p0backup, m5 ; store p0
    SWAP 1, 4
    SWAP 2, 4
    SWAP 6, 3
    SWAP 5, 3
%else ; sse2 (h)
%if %4 == 16
    lea dst8_reg, [dst_reg + stride_reg*8]
%endif

    ; read 16 rows of 8px each, interleave
    movh m0, [dst_reg +mstride_reg*4]
    movh m1, [dst8_reg+mstride_reg*4]
    movh m2, [dst_reg +mstride_reg*2]
    movh m5, [dst8_reg+mstride_reg*2]
    movh m3, [dst_reg +mstride_reg]
    movh m6, [dst8_reg+mstride_reg]
    movh m4, [dst_reg]
    movh m7, [dst8_reg]
    punpcklbw m0, m1 ; A/I
    punpcklbw m2, m5 ; C/K
    punpcklbw m3, m6 ; D/L
    punpcklbw m4, m7 ; E/M

    add dst8_reg, stride_reg
    movh m1, [dst2_reg+mstride_reg*4]
    movh m6, [dst8_reg+mstride_reg*4]
    movh m5, [dst2_reg]
    movh m7, [dst8_reg]
    punpcklbw m1, m6 ; B/J
    punpcklbw m5, m7 ; F/N
    movh m6, [dst2_reg+ stride_reg]
    movh m7, [dst8_reg+ stride_reg]
    punpcklbw m6, m7 ; G/O

    ; 8x16 transpose
    TRANSPOSE4x4B 0, 1, 2, 3, 7
%ifdef m8
    SWAP 1, 8
%else
    mova q0backup, m1
%endif
    movh m7, [dst2_reg+ stride_reg*2]
    movh m1, [dst8_reg+ stride_reg*2]
    punpcklbw m7, m1 ; H/P
    TRANSPOSE4x4B 4, 5, 6, 7, 1
    SBUTTERFLY dq, 0, 4, 1 ; p3/p2
    SBUTTERFLY dq, 2, 6, 1 ; q0/q1
    SBUTTERFLY dq, 3, 7, 1 ; q2/q3
%ifdef m8
    SWAP 1, 8
    SWAP 2, 8
%else
    mova m1, q0backup
    mova q0backup, m2 ; store q0
%endif
    SBUTTERFLY dq, 1, 5, 2 ; p1/p0
%ifdef m12
    SWAP 5, 12
%else
    mova p0backup, m5 ; store p0
%endif
    SWAP 1, 4
    SWAP 2, 4
    SWAP 6, 3
    SWAP 5, 3
%endif

    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
    mova m4, m1
    SWAP 4, 1
    psubusb m4, m0 ; p2-p3
    psubusb m0, m1 ; p3-p2
    por m0, m4 ; abs(p3-p2)

    mova m4, m2
    SWAP 4, 2
    psubusb m4, m1 ; p1-p2
    psubusb m1, m2 ; p2-p1
    por m1, m4 ; abs(p2-p1)

    mova m4, m6
    SWAP 4, 6
    psubusb m4, m7 ; q2-q3
    psubusb m7, m6 ; q3-q2
    por m7, m4 ; abs(q3-q2)

    mova m4, m5
    SWAP 4, 5
    psubusb m4, m6 ; q1-q2
    psubusb m6, m5 ; q2-q1
    por m6, m4 ; abs(q2-q1)

%ifidn %1, mmx
    ; no pmaxub on plain mmx: compare each difference against I separately
    mova m4, flim_I
    pxor m3, m3
    psubusb m0, m4
    psubusb m1, m4
    psubusb m7, m4
    psubusb m6, m4
    pcmpeqb m0, m3 ; abs(p3-p2) <= I
    pcmpeqb m1, m3 ; abs(p2-p1) <= I
    pcmpeqb m7, m3 ; abs(q3-q2) <= I
    pcmpeqb m6, m3 ; abs(q2-q1) <= I
    pand m0, m1
    pand m7, m6
    pand m0, m7
%else ; mmxext/sse2
    pmaxub m0, m1
    pmaxub m6, m7
    pmaxub m0, m6
%endif

    ; normal_limit and high_edge_variance for p1-p0, q1-q0
    SWAP 7, 3 ; now m7 is zero
%ifidn %2, v
    movrow m3, [dst_reg +mstride_reg] ; p0
%if mmsize == 16 && %4 == 8
    movhps m3, [dst8_reg+mstride_reg]
%endif
%elifdef m12
    SWAP 3, 12
%else
    mova m3, p0backup
%endif

    mova m1, m2
    SWAP 1, 2
    mova m6, m3
    SWAP 3, 6
    psubusb m1, m3 ; p1-p0
    psubusb m6, m2 ; p0-p1
    por m1, m6 ; abs(p1-p0)
%ifidn %1, mmx
    mova m6, m1
    psubusb m1, m4
    psubusb m6, hev_thr
    pcmpeqb m1, m7 ; abs(p1-p0) <= I
    pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
    pand m0, m1
    mova mask_res, m6
%else ; mmxext/sse2
    pmaxub m0, m1 ; max_I
    SWAP 1, 4 ; max_hev_thresh
%endif

    SWAP 6, 4 ; now m6 is I
%ifidn %2, v
    movrow m4, [dst_reg] ; q0
%if mmsize == 16 && %4 == 8
    movhps m4, [dst8_reg]
%endif
%elifdef m8
    SWAP 4, 8
%else
    mova m4, q0backup
%endif
    mova m1, m4
    SWAP 1, 4
    mova m7, m5
    SWAP 7, 5
    psubusb m1, m5 ; q0-q1
    psubusb m7, m4 ; q1-q0
    por m1, m7 ; abs(q1-q0)
%ifidn %1, mmx
    mova m7, m1
    psubusb m1, m6
    psubusb m7, hev_thr
    pxor m6, m6
    pcmpeqb m1, m6 ; abs(q1-q0) <= I
    pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
    mova m6, mask_res
    pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
    pand m6, m7
%else ; mmxext/sse2
    pxor m7, m7
    pmaxub m0, m1
    pmaxub m6, m1
    psubusb m0, flim_I
    psubusb m6, hev_thr
    pcmpeqb m0, m7 ; max(abs(..)) <= I
    pcmpeqb m6, m7 ; !(max(abs..) > thresh)
%endif
%ifdef m12
    SWAP 6, 12
%else
    mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
%endif

    ; simple_limit
    mova m1, m3
    SWAP 1, 3
    mova m6, m4 ; keep copies of p0/q0 around for later use
    SWAP 6, 4
    psubusb m1, m4 ; p0-q0
    psubusb m6, m3 ; q0-p0
    por m1, m6 ; abs(q0-p0)
    paddusb m1, m1 ; m1=2*abs(q0-p0)

    mova m7, m2
    SWAP 7, 2
    mova m6, m5
    SWAP 6, 5
    psubusb m7, m5 ; p1-q1
    psubusb m6, m2 ; q1-p1
    por m7, m6 ; abs(q1-p1)
    pxor m6, m6
    pand m7, [pb_FE]
    psrlq m7, 1 ; abs(q1-p1)/2
    paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
    psubusb m7, flim_E
    pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
    pand m0, m7 ; normal_limit result

    ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
%ifdef m8 ; x86-64 && sse2
    mova m8, [pb_80]
%define pb_80_var m8
%else ; x86-32 or mmx/mmxext
%define pb_80_var [pb_80]
%endif
    mova m1, m4
    mova m7, m3
    pxor m1, pb_80_var ; bias to signed range
    pxor m7, pb_80_var
    psubsb m1, m7 ; (signed) q0-p0
    mova m6, m2
    mova m7, m5
    pxor m6, pb_80_var
    pxor m7, pb_80_var
    psubsb m6, m7 ; (signed) p1-q1
    mova m7, mask_res
    pandn m7, m6 ; (p1-q1) only where hev() is false
    paddsb m7, m1
    paddsb m7, m1
    paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1)

    pand m7, m0
    mova m1, [pb_F8]
    mova m6, m7
    paddsb m7, [pb_3] ; f2<<3
    paddsb m6, [pb_4] ; f1<<3
    pand m7, m1
    pand m6, m1

    ; f2 >>= 3 with correct sign handling (psrlq is unsigned),
    ; then apply to p0
    pxor m1, m1
    pxor m0, m0
    pcmpgtb m1, m7
    psubb m0, m7
    psrlq m7, 3 ; +f2
    psrlq m0, 3 ; -f2
    pand m0, m1
    pandn m1, m7
    psubusb m3, m0
    paddusb m3, m1 ; p0+f2

    ; same for f1, applied to q0
    pxor m1, m1
    pxor m0, m0
    pcmpgtb m0, m6
    psubb m1, m6
    psrlq m6, 3 ; +f1
    psrlq m1, 3 ; -f1
    pand m1, m0
    pandn m0, m6
    psubusb m4, m0
    paddusb m4, m1 ; q0-f1

    ; a = (f1+1)>>1, applied to p1/q1 where hev() is false
%ifdef m12
    SWAP 6, 12
%else
    mova m6, mask_res
%endif
%ifidn %1, mmx
    mova m7, [pb_1]
%else ; mmxext/sse2
    pxor m7, m7
%endif
    pand m0, m6
    pand m1, m6
%ifidn %1, mmx
    paddusb m0, m7
    pand m1, [pb_FE]
    pandn m7, m0
    psrlq m1, 1
    psrlq m7, 1
    SWAP 0, 7
%else ; mmxext/sse2
    psubusb m1, [pb_1]
    pavgb m0, m7 ; a
    pavgb m1, m7 ; -a
%endif
    psubusb m5, m0
    psubusb m2, m1
    paddusb m5, m1 ; q1-a
    paddusb m2, m0 ; p1+a

    ; store
%ifidn %2, v
    movrow [dst_reg +mstride_reg*2], m2
    movrow [dst_reg +mstride_reg ], m3
    movrow [dst_reg], m4
    movrow [dst_reg + stride_reg ], m5
%if mmsize == 16 && %4 == 8
    movhps [dst8_reg+mstride_reg*2], m2
    movhps [dst8_reg+mstride_reg ], m3
    movhps [dst8_reg], m4
    movhps [dst8_reg+ stride_reg ], m5
%endif
%else ; h
    add dst_reg, 2
    add dst2_reg, 2

    ; 4x8/16 transpose
    TRANSPOSE4x4B 2, 3, 4, 5, 6

%if mmsize == 8 ; mmx/mmxext (h)
    WRITE_4x2D 2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg
%else ; sse2 (h)
    lea dst8_reg, [dst8_reg+mstride_reg+2]
    WRITE_4x4D 2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
%endif
%endif

%if mmsize == 8
%if %4 == 8 ; chroma
%ifidn %2, h
    sub dst_reg, 2
%endif
    ; loop a second time over the second (V) plane; mov does not
    ; modify flags, so jnz tests the cmp result
    cmp dst_reg, dst8_reg
    mov dst_reg, dst8_reg
    jnz .next8px
%else
%ifidn %2, h
    lea dst_reg, [dst_reg + stride_reg*8-2]
%else ; v
    add dst_reg, 8
%endif
    dec cnt_reg
    jg .next8px
%endif
%endif

%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
    mov rsp, stack_reg ; restore stack pointer
%endif
    RET
%endmacro

2029 | |

2030 |
; instantiate the inner loop filter: opt, direction, #GPRs, width, #XMMs.
; the h/luma sse2+ variants need one less GPR on x86-64 (m8 defined),
; where dst8_reg can reuse r4 instead of needing r5
INIT_MMX
INNER_LOOPFILTER mmx, v, 6, 16, 0
INNER_LOOPFILTER mmx, h, 6, 16, 0
INNER_LOOPFILTER mmxext, v, 6, 16, 0
INNER_LOOPFILTER mmxext, h, 6, 16, 0

INNER_LOOPFILTER mmx, v, 6, 8, 0
INNER_LOOPFILTER mmx, h, 6, 8, 0
INNER_LOOPFILTER mmxext, v, 6, 8, 0
INNER_LOOPFILTER mmxext, h, 6, 8, 0

INIT_XMM
INNER_LOOPFILTER sse2, v, 5, 16, 13
%ifdef m8
INNER_LOOPFILTER sse2, h, 5, 16, 13
%else
INNER_LOOPFILTER sse2, h, 6, 16, 13
%endif
INNER_LOOPFILTER sse2, v, 6, 8, 13
INNER_LOOPFILTER sse2, h, 6, 8, 13

INNER_LOOPFILTER ssse3, v, 5, 16, 13
%ifdef m8
INNER_LOOPFILTER ssse3, h, 5, 16, 13
%else
INNER_LOOPFILTER ssse3, h, 6, 16, 13
%endif
INNER_LOOPFILTER ssse3, v, 6, 8, 13
INNER_LOOPFILTER ssse3, h, 6, 8, 13

2059 | |

2060 |
;----------------------------------------------------------------------------- |

2061 |
; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride, |

2062 |
; int flimE, int flimI, int hev_thr); |

2063 |
;----------------------------------------------------------------------------- |

2064 | |

2065 |
; write 4 or 8 words in the mmx/xmm registers as 8 lines
; 1 and 2 are the registers to write, this can be the same (for SSE2)
; for pre-SSE4:
; 3 is a general-purpose register that we will clobber
; for SSE4:
; 3 is a pointer to the destination's 5th line
; 4 is a pointer to the destination's 4th line
; 5/6 is -stride and +stride
; 7 is optimization string
; note: the pre-SSE4 path also advances %4 as it goes (and for mmx
; rewinds it by one stride at the end)
%macro WRITE_8W 7
%ifidn %7, sse4
    pextrw    [%4+%5*4], %1, 0
    pextrw    [%3+%5*4], %1, 1
    pextrw    [%4+%5*2], %1, 2
    pextrw    [%4+%5  ], %1, 3
    pextrw    [%4     ], %1, 4
    pextrw    [%3     ], %1, 5
    pextrw    [%3+%6  ], %1, 6
    pextrw    [%3+%6*2], %1, 7
%else
    movd      %3, %1
%if mmsize == 8
    punpckhdq %1, %1                ; bring next dword into position
%else
    psrldq    %1, 4
%endif
    mov       [%4+%5*4], %3w
    shr       %3, 16
    add       %4, %6
    mov       [%4+%5*4], %3w

    movd      %3, %1
%if mmsize == 16
    psrldq    %1, 4
%endif
    add       %4, %5
    mov       [%4+%5*2], %3w
    shr       %3, 16
    mov       [%4+%5  ], %3w

    movd      %3, %2
%if mmsize == 8
    punpckhdq %2, %2
%else
    psrldq    %2, 4
%endif
    mov       [%4     ], %3w
    shr       %3, 16
    mov       [%4+%6  ], %3w

    movd      %3, %2
    add       %4, %6
    mov       [%4+%6  ], %3w
    shr       %3, 16
    mov       [%4+%6*2], %3w
%if mmsize == 8
    add       %4, %5
%endif
%endif
%endmacro

2125 | |

2126 |
;------------------------------------------------------------------------------
; Macroblock-edge loop filter (the strong VP8 filter run on MB boundaries).
;
; Expands to one of:
;   void vp8_%2_loop_filter16y_mbedge_%1(uint8_t *dst, int stride,
;                                        int flim_E, int flim_I, int hev_thresh)
;   void vp8_%2_loop_filter8uv_mbedge_%1(uint8_t *dstU, uint8_t *dstV,
;                                        int stride, int flim_E, int flim_I,
;                                        int hev_thresh)
;
; %1 = cpu extension name (mmx/mmxext/sse2/ssse3/sse4)
; %2 = direction: v (filters across a horizontal edge) or h (vertical edge)
; %3 = number of general-purpose registers declared to cglobal
; %4 = block width: 16 (luma) or 8 (chroma; on sse2+ the U and V rows are
;      packed into the low/high halves of one xmm register per row)
; %5 = number of xmm registers used (cglobal needs this for win64 prologues)
;
; Filters the 3 pixels on each side of the edge (p2..p0 / q0..q2), using
; p3/q3 as context, gated by the E/I thresholds and high-edge-variance test.
;------------------------------------------------------------------------------
%macro MBEDGE_LOOPFILTER 5
%if %4 == 8 ; chroma
cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5
%define dst8_reg    r1      ; second plane pointer (presumably V — confirm against caller)
%define mstride_reg r2
%define E_reg       r3
%define I_reg       r4
%define hev_thr_reg r5
%else ; luma
cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
%define mstride_reg r1
%define E_reg       r2
%define I_reg       r3
%define hev_thr_reg r4
%ifdef m8 ; x86-64, sse2
%define dst8_reg    r4      ; second-half pointer; safe to reuse r4 after the splats
%elif mmsize == 16 ; x86-32, sse2
%define dst8_reg    r5
%else ; x86-32, mmx/mmxext
%define cnt_reg     r5      ; loop counter: 16px block done as 2x 8px passes
%endif
%endif
%define dst_reg     r0
%define stride_reg  E_reg   ; E/I/hev_thr regs are dead after the splats below,
%define dst2_reg    I_reg   ; so they are recycled as stride/second-dst pointers
%ifndef m8
%define stack_reg   hev_thr_reg
%endif

%ifidn %1, ssse3
    pxor             m7, m7 ; SPLATB_REG's ssse3 variant needs a zero register
%endif

%ifndef m8 ; mmx/mmxext or sse2 on x86-32
    ; splat the byte-valued function arguments across a full register
    SPLATB_REG       m0, E_reg, %1, m7   ; E
    SPLATB_REG       m1, I_reg, %1, m7   ; I
    SPLATB_REG       m2, hev_thr_reg, %1, m7 ; hev_thresh

    ; not enough registers on x86-32: spill the constants and temporaries
    ; to an aligned scratch area on the stack
    mov       stack_reg, rsp             ; backup stack pointer
    and             rsp, ~(mmsize-1)     ; align stack
    sub             rsp, mmsize * 8      ; stack layout: [0]=E, [1]=I, [2]=hev_thr
                                         ;               [3]=hev() result
                                         ;               [4]=filter tmp result
                                         ;               [5]/[6] = p2/q2 backup
                                         ;               [7]=lim_res sign result

%define flim_E   [rsp]
%define flim_I   [rsp+mmsize]
%define hev_thr  [rsp+mmsize*2]
%define mask_res [rsp+mmsize*3]
%define lim_res  [rsp+mmsize*4]
%define p0backup [rsp+mmsize*3]          ; aliases mask_res (live ranges don't overlap)
%define q0backup [rsp+mmsize*4]          ; aliases lim_res
%define p2backup [rsp+mmsize*5]
%define q2backup [rsp+mmsize*6]
%define lim_sign [rsp+mmsize*7]

    mova         flim_E, m0
    mova         flim_I, m1
    mova        hev_thr, m2

%else ; sse2 on x86-64

    ; x86-64 has 16 xmm registers: keep everything in registers, no stack
%define flim_E   m9
%define flim_I   m10
%define hev_thr  m11
%define mask_res m12
%define lim_res  m8
%define p0backup m12                     ; aliases mask_res (live ranges don't overlap)
%define q0backup m8                      ; aliases lim_res
%define p2backup m13
%define q2backup m14
%define lim_sign m15

    ; splat the byte-valued function arguments across a full register
    SPLATB_REG   flim_E, E_reg, %1, m7   ; E
    SPLATB_REG   flim_I, I_reg, %1, m7   ; I
    SPLATB_REG  hev_thr, hev_thr_reg, %1, m7 ; hev_thresh
%endif

%if mmsize == 8 && %4 == 16 ; mmx/mmxext
    mov         cnt_reg, 2               ; luma as two 8-pixel halves
%endif
    mov      stride_reg, mstride_reg
    neg     mstride_reg                  ; mstride_reg = -stride from here on
%ifidn %2, h
    ; for the vertical-edge filter, start 4 rows down and 4 columns back so
    ; the 8 pixels straddling the edge can be loaded row by row
    lea         dst_reg, [dst_reg + stride_reg*4-4]
%if %4 == 8
    lea        dst8_reg, [dst8_reg+ stride_reg*4-4]
%endif
%endif

%if mmsize == 8
.next8px
%endif
    ; read: p3..p1 and q1..q3 (p0/q0 are loaded later, after the I tests)
    lea        dst2_reg, [dst_reg + stride_reg]
%ifidn %2, v
%if %4 == 8 && mmsize == 16
%define movrow movh                      ; chroma on sse2: 8 bytes per plane row
%else
%define movrow mova
%endif
    movrow           m0, [dst_reg +mstride_reg*4] ; p3
    movrow           m1, [dst2_reg+mstride_reg*4] ; p2
    movrow           m2, [dst_reg +mstride_reg*2] ; p1
    movrow           m5, [dst2_reg]               ; q1
    movrow           m6, [dst2_reg+ stride_reg]   ; q2
    movrow           m7, [dst2_reg+ stride_reg*2] ; q3
%if mmsize == 16 && %4 == 8
    ; second chroma plane goes into the high halves of the registers
    movhps           m0, [dst8_reg+mstride_reg*4]
    movhps           m2, [dst8_reg+mstride_reg*2]
    add        dst8_reg, stride_reg
    movhps           m1, [dst8_reg+mstride_reg*4]
    movhps           m5, [dst8_reg]
    movhps           m6, [dst8_reg+ stride_reg]
    movhps           m7, [dst8_reg+ stride_reg*2]
    add        dst8_reg, mstride_reg
%endif
%elif mmsize == 8 ; mmx/mmxext (h)
    ; read 8 rows of 8px each
    movu             m0, [dst_reg +mstride_reg*4]
    movu             m1, [dst2_reg+mstride_reg*4]
    movu             m2, [dst_reg +mstride_reg*2]
    movu             m3, [dst_reg +mstride_reg]
    movu             m4, [dst_reg]
    movu             m5, [dst2_reg]
    movu             m6, [dst2_reg+ stride_reg]

    ; 8x8 transpose: rows become the p3..q3 columns
    TRANSPOSE4x4B     0, 1, 2, 3, 7
    mova       q0backup, m1              ; free a register for the second 4x4
    movu             m7, [dst2_reg+ stride_reg*2]
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1         ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1         ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1         ; q2/q3
    mova             m1, q0backup
    mova       q0backup, m2              ; store q0
    SBUTTERFLY       dq, 1, 5, 2         ; p1/p0
    mova       p0backup, m5              ; store p0
    SWAP              1, 4               ; SWAPs only rename registers so the rest of
    SWAP              2, 4               ; the macro sees the canonical layout:
    SWAP              6, 3               ; m0..m2 = p3..p1, m5..m7 = q1..q3
    SWAP              5, 3
%else ; sse2 (h)
%if %4 == 16
    lea        dst8_reg, [dst_reg + stride_reg*8] ; lower half of the 16 rows
%endif

    ; read 16 rows of 8px each, interleaving row k with row k+8
    movh             m0, [dst_reg +mstride_reg*4]
    movh             m1, [dst8_reg+mstride_reg*4]
    movh             m2, [dst_reg +mstride_reg*2]
    movh             m5, [dst8_reg+mstride_reg*2]
    movh             m3, [dst_reg +mstride_reg]
    movh             m6, [dst8_reg+mstride_reg]
    movh             m4, [dst_reg]
    movh             m7, [dst8_reg]
    punpcklbw        m0, m1              ; A/I
    punpcklbw        m2, m5              ; C/K
    punpcklbw        m3, m6              ; D/L
    punpcklbw        m4, m7              ; E/M

    add        dst8_reg, stride_reg
    movh             m1, [dst2_reg+mstride_reg*4]
    movh             m6, [dst8_reg+mstride_reg*4]
    movh             m5, [dst2_reg]
    movh             m7, [dst8_reg]
    punpcklbw        m1, m6              ; B/J
    punpcklbw        m5, m7              ; F/N
    movh             m6, [dst2_reg+ stride_reg]
    movh             m7, [dst8_reg+ stride_reg]
    punpcklbw        m6, m7              ; G/O

    ; 8x16 transpose: rows become the p3..q3 columns
    TRANSPOSE4x4B     0, 1, 2, 3, 7
%ifdef m8
    SWAP              1, 8               ; park intermediate in a spare register
%else
    mova       q0backup, m1              ; ... or spill it on x86-32
%endif
    movh             m7, [dst2_reg+ stride_reg*2]
    movh             m1, [dst8_reg+ stride_reg*2]
    punpcklbw        m7, m1              ; H/P
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1         ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1         ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1         ; q2/q3
%ifdef m8
    SWAP              1, 8
    SWAP              2, 8               ; q0 kept in m8 (q0backup)
%else
    mova             m1, q0backup
    mova       q0backup, m2              ; store q0
%endif
    SBUTTERFLY       dq, 1, 5, 2         ; p1/p0
%ifdef m12
    SWAP              5, 12              ; p0 kept in m12 (p0backup)
%else
    mova       p0backup, m5              ; store p0
%endif
    SWAP              1, 4               ; rename into canonical layout:
    SWAP              2, 4               ; m0..m2 = p3..p1, m5..m7 = q1..q3
    SWAP              6, 3
    SWAP              5, 3
%endif

    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
    ; (unsigned saturating subtract both ways + por = byte-wise abs-diff)
    mova             m4, m1
    SWAP              4, 1
    psubusb          m4, m0              ; p2-p3
    psubusb          m0, m1              ; p3-p2
    por              m0, m4              ; abs(p3-p2)

    mova             m4, m2
    SWAP              4, 2
    psubusb          m4, m1              ; p1-p2
    mova       p2backup, m1              ; p2 is needed again by filter_mbedge
    psubusb          m1, m2              ; p2-p1
    por              m1, m4              ; abs(p2-p1)

    mova             m4, m6
    SWAP              4, 6
    psubusb          m4, m7              ; q2-q3
    psubusb          m7, m6              ; q3-q2
    por              m7, m4              ; abs(q3-q2)

    mova             m4, m5
    SWAP              4, 5
    psubusb          m4, m6              ; q1-q2
    mova       q2backup, m6              ; q2 is needed again by filter_mbedge
    psubusb          m6, m5              ; q2-q1
    por              m6, m4              ; abs(q2-q1)

%ifidn %1, mmx
    ; no pmaxub on plain mmx: test each difference against I separately
    mova             m4, flim_I
    pxor             m3, m3
    psubusb          m0, m4
    psubusb          m1, m4
    psubusb          m7, m4
    psubusb          m6, m4
    pcmpeqb          m0, m3              ; abs(p3-p2) <= I
    pcmpeqb          m1, m3              ; abs(p2-p1) <= I
    pcmpeqb          m7, m3              ; abs(q3-q2) <= I
    pcmpeqb          m6, m3              ; abs(q2-q1) <= I
    pand             m0, m1
    pand             m7, m6
    pand             m0, m7
%else ; mmxext/sse2
    ; fold all differences into one max, compared against I only once below
    pmaxub           m0, m1
    pmaxub           m6, m7
    pmaxub           m0, m6
%endif

    ; normal_limit and high_edge_variance for p1-p0, q1-q0
    SWAP              7, 3               ; now m7 is zero
%ifidn %2, v
    movrow           m3, [dst_reg +mstride_reg] ; p0
%if mmsize == 16 && %4 == 8
    movhps           m3, [dst8_reg+mstride_reg]
%endif
%elifdef m12
    SWAP              3, 12              ; p0 was parked in m12 by the transpose
%else
    mova             m3, p0backup
%endif

    mova             m1, m2
    SWAP              1, 2
    mova             m6, m3
    SWAP              3, 6
    psubusb          m1, m3              ; p1-p0
    psubusb          m6, m2              ; p0-p1
    por              m1, m6              ; abs(p1-p0)
%ifidn %1, mmx
    mova             m6, m1
    psubusb          m1, m4
    psubusb          m6, hev_thr
    pcmpeqb          m1, m7              ; abs(p1-p0) <= I
    pcmpeqb          m6, m7              ; abs(p1-p0) <= hev_thresh
    pand             m0, m1
    mova       mask_res, m6
%else ; mmxext/sse2
    pmaxub           m0, m1              ; max_I
    SWAP              1, 4               ; max_hev_thresh
%endif

    SWAP              6, 4               ; now m6 is I
%ifidn %2, v
    movrow           m4, [dst_reg]       ; q0
%if mmsize == 16 && %4 == 8
    movhps           m4, [dst8_reg]
%endif
%elifdef m8
    SWAP              4, 8               ; q0 was parked in m8 by the transpose
%else
    mova             m4, q0backup
%endif
    mova             m1, m4
    SWAP              1, 4
    mova             m7, m5
    SWAP              7, 5
    psubusb          m1, m5              ; q0-q1
    psubusb          m7, m4              ; q1-q0
    por              m1, m7              ; abs(q1-q0)
%ifidn %1, mmx
    mova             m7, m1
    psubusb          m1, m6
    psubusb          m7, hev_thr
    pxor             m6, m6
    pcmpeqb          m1, m6              ; abs(q1-q0) <= I
    pcmpeqb          m7, m6              ; abs(q1-q0) <= hev_thresh
    mova             m6, mask_res
    pand             m0, m1              ; abs([pq][321]-[pq][210]) <= I
    pand             m6, m7
%else ; mmxext/sse2
    pxor             m7, m7
    pmaxub           m0, m1
    pmaxub           m6, m1
    psubusb          m0, flim_I
    psubusb          m6, hev_thr
    pcmpeqb          m0, m7              ; max(abs(..)) <= I
    pcmpeqb          m6, m7              ; !(max(abs..) > thresh)
%endif
%ifdef m12
    SWAP              6, 12              ; hev mask kept in m12 (mask_res)
%else
    mova       mask_res, m6              ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
%endif

    ; simple_limit: abs(q0-p0)*2 + abs(q1-p1)/2 <= E
    mova             m1, m3
    SWAP              1, 3
    mova             m6, m4              ; keep copies of p0/q0 around for later use
    SWAP              6, 4
    psubusb          m1, m4              ; p0-q0
    psubusb          m6, m3              ; q0-p0
    por              m1, m6              ; abs(q0-p0)
    paddusb          m1, m1              ; m1=2*abs(q0-p0)

    mova             m7, m2
    SWAP              7, 2
    mova             m6, m5
    SWAP              6, 5
    psubusb          m7, m5              ; p1-q1
    psubusb          m6, m2              ; q1-p1
    por              m7, m6              ; abs(q1-p1)
    pxor             m6, m6
    pand             m7, [pb_FE]         ; drop lsb so the byte-wise /2 below is exact
    psrlq            m7, 1               ; abs(q1-p1)/2
    paddusb          m7, m1              ; abs(q0-p0)*2+abs(q1-p1)/2
    psubusb          m7, flim_E
    pcmpeqb          m7, m6              ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
    pand             m0, m7              ; normal_limit result

    ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
%ifdef m8 ; x86-64 && sse2
    mova             m8, [pb_80]
%define pb_80_var m8
%else ; x86-32 or mmx/mmxext
%define pb_80_var [pb_80]
%endif
    ; xor with 0x80 converts unsigned pixels to signed for the arithmetic below
    mova             m1, m4
    mova             m7, m3
    pxor             m1, pb_80_var
    pxor             m7, pb_80_var
    psubsb           m1, m7              ; (signed) q0-p0
    mova             m6, m2
    mova             m7, m5
    pxor             m6, pb_80_var
    pxor             m7, pb_80_var
    psubsb           m6, m7              ; (signed) p1-q1
    mova             m7, mask_res
    paddsb           m6, m1
    paddsb           m6, m1
    paddsb           m6, m1              ; 3*(q0-p0)+(p1-q1)
    pand             m6, m0
%ifdef m8
    mova        lim_res, m6              ; 3*(q0-p0)+(p1-q1) masked for filter_mbedge
    pand        lim_res, m7
%else
    mova             m0, m6
    pand             m0, m7
    mova        lim_res, m0
%endif
    pandn            m7, m6              ; 3*(q0-p0)+(p1-q1) masked for filter_common

    mova             m1, [pb_F8]
    mova             m6, m7
    paddsb           m7, [pb_3]
    paddsb           m6, [pb_4]
    pand             m7, m1              ; (w+3) & ~7
    pand             m6, m1              ; (w+4) & ~7

    ; signed >>3 emulated with unsigned shifts: shift the value and its
    ; negation, then select by sign (pcmpgtb mask)
    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m1, m7
    psubb            m0, m7
    psrlq            m7, 3               ; +f2
    psrlq            m0, 3               ; -f2
    pand             m0, m1
    pandn            m1, m7
    psubusb          m3, m0
    paddusb          m3, m1              ; p0+f2

    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m0, m6
    psubb            m1, m6
    psrlq            m6, 3               ; +f1
    psrlq            m1, 3               ; -f1
    pand             m1, m0
    pandn            m0, m6
    psubusb          m4, m0
    paddusb          m4, m1              ; q0-f1

    ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
    ; three taps a0/a1/a2 = (w*27+63)>>7, (w*18+63)>>7, (w*9+63)>>7,
    ; applied to p0/q0, p1/q1 and p2/q2 respectively
    mova             m7, [pw_63]
%ifdef m8
    SWAP              1, 8
%else
    mova             m1, lim_res
%endif
    pxor             m0, m0
    mova             m6, m1
    pcmpgtb          m0, m1              ; which are negative
    punpcklbw        m6, m0              ; signed byte->word
    punpckhbw        m1, m0
    mova       lim_sign, m0
    mova       mask_res, m6              ; backup for later in filter
    mova        lim_res, m1
    pmullw           m6, [pw_27]
    pmullw           m1, [pw_27]
    paddw            m6, m7
    paddw            m1, m7
    psraw            m6, 7
    psraw            m1, 7
    packsswb         m6, m1              ; a0
    pxor             m1, m1
    psubb            m1, m6
    pand             m1, m0              ; -a0
    pandn            m0, m6              ; +a0
    psubusb          m3, m1
    paddusb          m4, m1
    paddusb          m3, m0              ; p0+a0
    psubusb          m4, m0              ; q0-a0

    mova             m6, mask_res
    mova             m1, lim_res
    mova             m0, lim_sign
    pmullw           m6, [pw_18]
    pmullw           m1, [pw_18]
    paddw            m6, m7
    paddw            m1, m7
    psraw            m6, 7
    psraw            m1, 7
    packsswb         m6, m1              ; a1
    pxor             m1, m1
    psubb            m1, m6
    pand             m1, m0              ; -a1
    pandn            m0, m6              ; +a1
    psubusb          m2, m1
    paddusb          m5, m1
    paddusb          m2, m0              ; p1+a1
    psubusb          m5, m0              ; q1-a1

%ifdef m8
    SWAP              6, 12
    SWAP              1, 8
%else
    mova             m6, mask_res
    mova             m1, lim_res
%endif
    pmullw           m6, [pw_9]
    pmullw           m1, [pw_9]
    paddw            m6, m7
    paddw            m1, m7
%ifdef m15
    SWAP              7, 15
%else
    mova             m7, lim_sign
%endif
    psraw            m6, 7
    psraw            m1, 7
    packsswb         m6, m1              ; a2 (third tap)
%ifdef m8
    SWAP              1, 13
    SWAP              6, 14
%else
    mova             m1, p2backup
    mova             m6, q2backup
%endif
    ; NOTE(review): the sign selection below reuses the a-value still in m6
    ; via pandn; operands were verified against the a0/a1 blocks above
    pxor             m0, m0
    psubb            m0, m6
    pand             m0, m7              ; -a2
    pandn            m7, m6              ; +a2
    psubusb          m1, m0
    paddusb          m6, m0
    paddusb          m1, m7              ; p2+a2 (m1/m6 hold the p2/q2 backups)
    psubusb          m6, m7              ; q2-a2

    ; store p2..q2 (m1,m2,m3 / m4,m5,m6) back to the image
%ifidn %2, v
    movrow [dst2_reg+mstride_reg*4], m1  ; p2
    movrow [dst_reg +mstride_reg*2], m2  ; p1
    movrow [dst_reg +mstride_reg ], m3   ; p0
    movrow [dst_reg], m4                 ; q0
    movrow [dst2_reg], m5                ; q1
    movrow [dst2_reg+ stride_reg ], m6   ; q2
%if mmsize == 16 && %4 == 8
    ; second chroma plane from the high register halves
    add        dst8_reg, mstride_reg
    movhps [dst8_reg+mstride_reg*2], m1
    movhps [dst8_reg+mstride_reg ], m2
    movhps [dst8_reg], m3
    add        dst8_reg, stride_reg
    movhps [dst8_reg], m4
    movhps [dst8_reg+ stride_reg ], m5
    movhps [dst8_reg+ stride_reg*2], m6
%endif
%else ; h
    inc         dst_reg                  ; skip p3: only p2..q2 were modified
    inc        dst2_reg

    ; 4x8/16 transpose back to rows
    TRANSPOSE4x4B     1, 2, 3, 4, 0
    SBUTTERFLY       bw, 5, 6, 0

%if mmsize == 8 ; mmx/mmxext (h)
    WRITE_4x2D        1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg
    add         dst_reg, 4
    WRITE_8W         m5, m6, dst2_reg, dst_reg,  mstride_reg, stride_reg, %4
%else ; sse2 (h)
    lea        dst8_reg, [dst8_reg+mstride_reg+1]
    WRITE_4x4D        1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
    lea         dst_reg, [dst2_reg+mstride_reg+4]
    lea        dst8_reg, [dst8_reg+mstride_reg+4]
    ; NOTE(review): %2 is the direction token (v/h), never "sse4", so passing
    ; it as WRITE_8W's last argument and testing it below means the sse4
    ; specialization can never be selected; this looks like it was meant to
    ; be %1 (the isa name) — confirm against WRITE_8W's parameter contract
    WRITE_8W         m5, m5, dst2_reg, dst_reg,  mstride_reg, stride_reg, %2
%ifidn %2, sse4
    lea         dst_reg, [dst8_reg+ stride_reg]
%endif
    WRITE_8W         m6, m6, dst2_reg, dst8_reg, mstride_reg, stride_reg, %2
%endif
%endif

%if mmsize == 8
%if %4 == 8 ; chroma
%ifidn %2, h
    sub         dst_reg, 5               ; undo the write-pointer advance
%endif
    ; first pass filtered the U plane; if dst != dst8 we still have to do V
    cmp         dst_reg, dst8_reg
    mov         dst_reg, dst8_reg
    jnz .next8px
%else
%ifidn %2, h
    lea         dst_reg, [dst_reg + stride_reg*8-5] ; next 8 rows, back to col -4
%else ; v
    add         dst_reg, 8               ; next 8 columns
%endif
    dec         cnt_reg
    jg .next8px
%endif
%endif

%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
    mov             rsp, stack_reg       ; restore stack pointer
%endif
    RET
%endmacro

2697 | |

2698 |
; Instantiate the macroblock-edge loop filters for every cpu flavour.
; MBEDGE_LOOPFILTER arguments:
;   %1=isa suffix, %2=direction (v/h), %3=number of GPRs,
;   %4=block width (16=luma, 8=chroma), %5=number of XMM regs (win64)
INIT_MMX
MBEDGE_LOOPFILTER mmx,    v, 6, 16, 0
MBEDGE_LOOPFILTER mmx,    h, 6, 16, 0
MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0
MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0

MBEDGE_LOOPFILTER mmx,    v, 6, 8, 0
MBEDGE_LOOPFILTER mmx,    h, 6, 8, 0
MBEDGE_LOOPFILTER mmxext, v, 6, 8, 0
MBEDGE_LOOPFILTER mmxext, h, 6, 8, 0

INIT_XMM
MBEDGE_LOOPFILTER sse2,   v, 5, 16, 16
%ifdef m8 ; x86-64: all temporaries fit in registers, one GPR fewer needed
MBEDGE_LOOPFILTER sse2,   h, 5, 16, 16
%else     ; x86-32: an extra GPR is needed to back up the stack pointer
MBEDGE_LOOPFILTER sse2,   h, 6, 16, 16
%endif
MBEDGE_LOOPFILTER sse2,   v, 6, 8, 16
MBEDGE_LOOPFILTER sse2,   h, 6, 8, 16

MBEDGE_LOOPFILTER ssse3,  v, 5, 16, 16
%ifdef m8
MBEDGE_LOOPFILTER ssse3,  h, 5, 16, 16
%else
MBEDGE_LOOPFILTER ssse3,  h, 6, 16, 16
%endif
MBEDGE_LOOPFILTER ssse3,  v, 6, 8, 16
MBEDGE_LOOPFILTER ssse3,  h, 6, 8, 16

; sse4 only changes the horizontal store path (WRITE_8W), so no v variants
%ifdef m8
MBEDGE_LOOPFILTER sse4,   h, 5, 16, 16
%else
MBEDGE_LOOPFILTER sse4,   h, 6, 16, 16
%endif
MBEDGE_LOOPFILTER sse4,   h, 6, 8, 16