## ffmpeg / libavcodec / x86 / vp8dsp.asm @ 565344e7

History | View | Annotate | Download (27.8 KB)

1 |
;****************************************************************************** |
---|---|

2 |
;* VP8 MMXEXT optimizations |

3 |
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> |

4 |
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> |

5 |
;* |

6 |
;* This file is part of FFmpeg. |

7 |
;* |

8 |
;* FFmpeg is free software; you can redistribute it and/or |

9 |
;* modify it under the terms of the GNU Lesser General Public |

10 |
;* License as published by the Free Software Foundation; either |

11 |
;* version 2.1 of the License, or (at your option) any later version. |

12 |
;* |

13 |
;* FFmpeg is distributed in the hope that it will be useful, |

14 |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |

15 |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |

16 |
;* Lesser General Public License for more details. |

17 |
;* |

18 |
;* You should have received a copy of the GNU Lesser General Public |

19 |
;* License along with FFmpeg; if not, write to the Free Software |

20 |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |

21 |
;****************************************************************************** |

22 | |

23 |
%include "x86inc.asm" |

24 |
%include "x86util.asm" |

25 | |

26 |
SECTION_RODATA |

27 | |

; 4-tap filter coefficients as interleaved signed words (the pmaddwd users in
; this file read them as tap pairs). Every row is 16 bytes; one filter takes
; two consecutive rows: (F0,F1) followed by (F2,F3).
fourtap_filter_hw_m: times 4 dw  -6, 123
                     times 4 dw  12,  -1
                     times 4 dw  -9,  93
                     times 4 dw  50,  -6
                     times 4 dw  -6,  50
                     times 4 dw  93,  -9
                     times 4 dw  -1,  12
                     times 4 dw 123,  -6

36 | |

; 6-tap filter coefficients as interleaved signed words. Every row is 16 bytes;
; one filter takes three consecutive rows: (F0,F1), (F2,F3), (F4,F5).
sixtap_filter_hw_m:  times 4 dw   2, -11
                     times 4 dw 108,  36
                     times 4 dw  -8,   1
                     times 4 dw   3, -16
                     times 4 dw  77,  77
                     times 4 dw -16,   3
                     times 4 dw   1,  -8
                     times 4 dw  36, 108
                     times 4 dw -11,   2

46 | |

; 4-tap filter coefficients as interleaved signed bytes (read by the pmaddubsw
; users in this file). Every row is 16 bytes; one filter takes two consecutive
; rows: the outer taps (F0,F3) followed by the inner taps (F1,F2).
fourtap_filter_hb_m: times 8 db  -6,  -1
                     times 8 db 123,  12
                     times 8 db  -9,  -6
                     times 8 db  93,  50
                     times 8 db  -6,  -9
                     times 8 db  50,  93
                     times 8 db  -1,  -6
                     times 8 db  12, 123

55 | |

; 6-tap filter coefficients as interleaved signed bytes. Every row is 16 bytes;
; one filter takes three consecutive rows: (F0,F5), (F1,F2), (F3,F4).
sixtap_filter_hb_m:  times 8 db   2,   1
                     times 8 db -11, 108
                     times 8 db  36,  -8
                     times 8 db   3,   3
                     times 8 db -16,  77
                     times 8 db  77, -16
                     times 8 db   1,   2
                     times 8 db  -8,  36
                     times 8 db 108, -11

65 | |

; 4-tap filter coefficients for the vertical filters: each tap is broadcast to
; eight words (one 16-byte row per tap), four rows per filter (F0..F3).
fourtap_filter_v_m:  times 8 dw  -6
                     times 8 dw 123
                     times 8 dw  12
                     times 8 dw  -1

                     times 8 dw  -9
                     times 8 dw  93
                     times 8 dw  50
                     times 8 dw  -6

                     times 8 dw  -6
                     times 8 dw  50
                     times 8 dw  93
                     times 8 dw  -9

                     times 8 dw  -1
                     times 8 dw  12
                     times 8 dw 123
                     times 8 dw  -6

82 | |

; 6-tap filter coefficients for the vertical filters: each tap is broadcast to
; eight words (one 16-byte row per tap), six rows per filter (F0..F5).
sixtap_filter_v_m:   times 8 dw   2
                     times 8 dw -11
                     times 8 dw 108
                     times 8 dw  36
                     times 8 dw  -8
                     times 8 dw   1

                     times 8 dw   3
                     times 8 dw -16
                     times 8 dw  77
                     times 8 dw  77
                     times 8 dw -16
                     times 8 dw   3

                     times 8 dw   1
                     times 8 dw  -8
                     times 8 dw  36
                     times 8 dw 108
                     times 8 dw -11
                     times 8 dw   2

101 | |

; Bilinear weights 1..7, each broadcast to eight words (one 16-byte row per
; weight). Row n-1 holds weight n.
bilinear_filter_vw_m: times 8 dw 1
                      times 8 dw 2
                      times 8 dw 3
                      times 8 dw 4
                      times 8 dw 5
                      times 8 dw 6
                      times 8 dw 7

109 | |

; Bilinear weights as interleaved byte pairs (8-n, n) for n = 1..7, one
; 16-byte row per n.
bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 6, 2
                      times 8 db 5, 3
                      times 8 db 4, 4
                      times 8 db 3, 5
                      times 8 db 2, 6
                      times 8 db 1, 7

117 | |

; Table aliases used inside effective addresses.
; Without PIC each alias is simply the table's label. With PIC every alias
; expands to r11, so a function must first load the wanted table's address
; into r11 (lea r11, [<table>_m]) before using the alias.
%ifndef PIC
%define fourtap_filter_hw  fourtap_filter_hw_m
%define sixtap_filter_hw   sixtap_filter_hw_m
%define fourtap_filter_hb  fourtap_filter_hb_m
%define sixtap_filter_hb   sixtap_filter_hb_m
%define fourtap_filter_v   fourtap_filter_v_m
%define sixtap_filter_v    sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
%else
%define fourtap_filter_hw  r11
%define sixtap_filter_hw   r11
%define fourtap_filter_hb  r11
%define sixtap_filter_hb   r11
%define fourtap_filter_v   r11
%define sixtap_filter_v    r11
%define bilinear_filter_vw r11
%define bilinear_filter_vb r11
%endif

137 | |

; pshufb control masks that gather source-byte pairs for pmaddubsw.
; h2: byte i paired with byte i+1
; h4: byte i paired with byte i+3
filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4,  5,  5,  6,  6,  7,  7,  8
filter_h4_shuf:  db 0, 3, 1, 4, 2, 5, 3, 6, 4,  7,  5,  8,  6,  9,  7, 10

; h6 shuf1: byte i with byte i+5; shuf2: i+1 with i+2; shuf3: i+3 with i+4
filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4,  9,  5, 10,  6, 11,  7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5,  6,  6,  7,  7,  8,  8,  9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7,  8,  8,  9,  9, 10, 10, 11

144 | |

; IDCT multiplier constants, four words each (8 bytes; loaded with movq).
pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734

147 | |

148 |
cextern pw_3 |

149 |
cextern pw_4 |

150 |
cextern pw_64 |

151 | |

152 |
SECTION .text |

153 | |

154 |
;----------------------------------------------------------------------------- |

155 |
; subpel MC functions: |

156 |
; |

157 |
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride, |

158 |
; uint8_t *src, int srcstride, |

159 |
; int height, int mx, int my); |

160 |
;----------------------------------------------------------------------------- |

161 | |

; 4-pixel-wide block, horizontal-only 4-tap subpel filter (MMXEXT).
; After cglobal: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;                r4 = number of rows, r5 = mx (selects the filter).
; Loop constants: mm4 = F0/F1 tap pairs, mm5 = F2/F3 tap pairs,
;                 mm6 = 0, mm7 = rounding term (pw_64).
cglobal put_vp8_epel4_h4_mmxext, 6, 6
    shl       r5d, 4                       ; mx*16: offset of the filter's second table row
%ifdef PIC
    lea       r11, [fourtap_filter_hw_m]   ; fourtap_filter_hw expands to r11 under PIC
%endif
    movq      mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words (F0/F1)
    movq      mm5, [fourtap_filter_hw+r5]    ; (F2/F3)
    movq      mm7, [pw_64]
    pxor      mm6, mm6

.nextrow:
    movq      mm1, [r2-1]                  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm6                     ; byte->word ABCD
    pshufw    mm0, mm2, 9                  ; byte CDEF..
    punpcklbw mm0, mm6                     ; byte->word CDEF
    pshufw    mm3, mm1, 0x94               ; word ABBC
    pshufw    mm1, mm0, 0x94               ; word CDDE
    pmaddwd   mm3, mm4                     ; multiply 2px with F0/F1
    movq      mm0, mm1                     ; backup for second set of pixels
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm3, mm1                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6                     ; byte->word EFGH
    pmaddwd   mm0, mm4                     ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm0, mm1                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0                     ; merge dword->word (4px)
    paddsw    mm3, mm7                     ; rounding (+64)
    psraw     mm3, 7
    packuswb  mm3, mm6                     ; clip and word->bytes
    movd     [r0], mm3                     ; store 4 pixels

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec       r4d                          ; next row; height is an int, so count in 32 bits
    jg .nextrow
    REP_RET

208 | |

; 4-pixel-wide block, horizontal-only 6-tap subpel filter (MMXEXT).
; After cglobal: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;                r4 = number of rows, r5 = mx (selects the filter).
; Loop constants: mm4 = F0/F1, mm5 = F2/F3, mm6 = F4/F5 tap pairs,
;                 mm7 = rounding term (pw_64).
; mm3 doubles as the zero register and as a temporary; it is re-zeroed inside
; the loop before it is needed as zero again.
cglobal put_vp8_epel4_h6_mmxext, 6, 6
    lea       r5d, [r5*3]                  ; mx*3; scaled by 8 below -> 48 bytes per filter
%ifdef PIC
    lea       r11, [sixtap_filter_hw_m]    ; sixtap_filter_hw expands to r11 under PIC
%endif
    movq      mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words (F0/F1)
    movq      mm5, [sixtap_filter_hw+r5*8-32] ; (F2/F3)
    movq      mm6, [sixtap_filter_hw+r5*8-16] ; (F4/F5)
    movq      mm7, [pw_64]
    pxor      mm3, mm3

.nextrow:
    movq      mm1, [r2-2]                  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm3                     ; byte->word ABCD
    pshufw    mm0, mm2, 0x9                ; byte CDEF..
    punpckhbw mm2, mm3                     ; byte->word EFGH
    punpcklbw mm0, mm3                     ; byte->word CDEF
    pshufw    mm1, mm1, 0x94               ; word ABBC
    pshufw    mm2, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm4                     ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94               ; word CDDE
    movq      mm0, mm3                     ; backup for second set of pixels
    pmaddwd   mm3, mm5                     ; multiply 2px with F2/F3
    paddd     mm1, mm3                     ; add to 1st 2px cache
    movq      mm3, mm2                     ; backup for second set of pixels
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm1, mm2                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [r2+3]                  ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4                     ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5                     ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3                     ; add to 2nd 2px cache
    pxor      mm3, mm3                     ; mm3 is the zero register again from here on
    punpcklbw mm2, mm3                     ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9               ; word GHHI
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm0, mm2                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0                     ; merge dword->word (4px)
    paddsw    mm1, mm7                     ; rounding (+64)
    psraw     mm1, 7
    packuswb  mm1, mm3                     ; clip and word->bytes
    movd     [r0], mm1                     ; store 4 pixels

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec       r4d                          ; next row; height is an int, so count in 32 bits
    jg .nextrow
    REP_RET

265 | |

; 8-pixel-wide block, horizontal-only 4-tap subpel filter (SSE2).
; After cglobal: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;                r4 = number of rows, r5 = mx (selects the filter).
; Loop constants: m5 = F0/F1 tap pairs, m6 = F2/F3 tap pairs, m7 = 0.
; INIT_XMM stays in effect for the functions that follow until the next INIT_*.
INIT_XMM
cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
    shl      r5d, 4                        ; mx*16: offset of the filter's second table row
%ifdef PIC
    lea      r11, [fourtap_filter_hw_m]    ; fourtap_filter_hw expands to r11 under PIC
%endif
    mova      m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words (F0/F1)
    mova      m6, [fourtap_filter_hw+r5]    ; (F2/F3)
    pxor      m7, m7

.nextrow:
    ; output pixels 0..3, from the 8 source bytes starting at src-1
    movh      m0, [r2-1]
    punpcklbw m0, m7                       ; ABCDEFGH
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2                        ; BCDEFGH
    psrldq    m2, 4                        ; CDEFGH
    psrldq    m3, 6                        ; DEFGH
    punpcklwd m0, m1                       ; ABBCCDDE
    punpcklwd m2, m3                       ; CDDEEFFG
    pmaddwd   m0, m5
    pmaddwd   m2, m6
    paddd     m0, m2

    ; output pixels 4..7: same steps on the 8 source bytes starting at src+3
    movh      m1, [r2+3]
    punpcklbw m1, m7                       ; ABCDEFGH (relative to src+3)
    mova      m2, m1
    mova      m3, m1
    mova      m4, m1
    psrldq    m2, 2                        ; BCDEFGH
    psrldq    m3, 4                        ; CDEFGH
    psrldq    m4, 6                        ; DEFGH
    punpcklwd m1, m2                       ; ABBCCDDE
    punpcklwd m3, m4                       ; CDDEEFFG
    pmaddwd   m1, m5
    pmaddwd   m3, m6
    paddd     m1, m3

    ; merge, round, clip, store
    packssdw  m0, m1                       ; dword->word, 8 pixels
    paddsw    m0, [pw_64]                  ; rounding
    psraw     m0, 7
    packuswb  m0, m7                       ; clip and word->bytes
    movh    [r0], m0                       ; store 8 pixels

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                           ; next row; height is an int, so count in 32 bits
    jg .nextrow
    REP_RET

318 | |

; 8-pixel-wide block, horizontal-only 6-tap subpel filter (SSE2).
; After cglobal: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;                r4 = number of rows, r5 = mx (selects the filter).
; r5 is turned into a pointer just past the selected filter's three table rows,
; so the tap pairs are read at [r5-48] (F0/F1), [r5-32] (F2/F3), [r5-16] (F4/F5).
; m7 = 0 throughout.
cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
    lea      r5d, [r5*3]                   ; mx*3; scaled by 8 below -> 48 bytes per filter
%ifdef PIC
    lea      r11, [sixtap_filter_hw_m]     ; sixtap_filter_hw expands to r11 under PIC
%endif
    lea       r5, [sixtap_filter_hw+r5*8]
    pxor      m7, m7

.nextrow:
    ; output pixels 0..3
    movu      m0, [r2-2]                   ; 16 source bytes starting at src-2
    mova      m6, m0                       ; keep a byte copy for pixels 4..7
    mova      m4, m0
    punpcklbw m0, m7                       ; ABCDEFGHI
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2                        ; BCDEFGH
    psrldq    m2, 4                        ; CDEFGH
    psrldq    m3, 6                        ; DEFGH
    psrldq    m4, 4                        ; byte shift: start at E
    punpcklbw m4, m7                       ; EFGH
    mova      m5, m4
    psrldq    m5, 2                        ; FGH
    punpcklwd m0, m1                       ; ABBCCDDE
    punpcklwd m2, m3                       ; CDDEEFFG
    punpcklwd m4, m5                       ; EFFGGHHI
    pmaddwd   m0, [r5-48]
    pmaddwd   m2, [r5-32]
    pmaddwd   m4, [r5-16]
    paddd     m0, m2
    paddd     m0, m4

    ; output pixels 4..7: same steps on the source bytes shifted by 4
    psrldq    m6, 4
    mova      m4, m6
    punpcklbw m6, m7                       ; ABCDEFGHI (relative to the shifted start)
    mova      m1, m6
    mova      m2, m6
    mova      m3, m6
    psrldq    m1, 2                        ; BCDEFGH
    psrldq    m2, 4                        ; CDEFGH
    psrldq    m3, 6                        ; DEFGH
    psrldq    m4, 4
    punpcklbw m4, m7                       ; EFGH
    mova      m5, m4
    psrldq    m5, 2                        ; FGH
    punpcklwd m6, m1                       ; ABBCCDDE
    punpcklwd m2, m3                       ; CDDEEFFG
    punpcklwd m4, m5                       ; EFFGGHHI
    pmaddwd   m6, [r5-48]
    pmaddwd   m2, [r5-32]
    pmaddwd   m4, [r5-16]
    paddd     m6, m2
    paddd     m6, m4

    ; merge, round, clip, store
    packssdw  m0, m6                       ; dword->word, 8 pixels
    paddsw    m0, [pw_64]                  ; rounding
    psraw     m0, 7
    packuswb  m0, m7                       ; clip and word->bytes
    movh    [r0], m0                       ; store 8 pixels

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                           ; next row; height is an int, so count in 32 bits
    jg .nextrow
    REP_RET

385 | |

; 8-pixel-wide block, horizontal-only 4-tap subpel filter (SSSE3).
; After cglobal: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;                r4 = number of rows, r5 = mx (selects the filter).
; Loop constants: m2 = rounding term (pw_64),
;                 m3 = shuffle pairing byte i with i+3, m5 = (F0,F3) byte taps,
;                 m4 = shuffle pairing byte i+1 with i+2, m6 = (F1,F2) byte taps.
cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7
    shl      r5d, 4                        ; mx*16: offset of the filter's second table row
    mova      m2, [pw_64]
    mova      m3, [filter_h4_shuf]
    mova      m4, [filter_h6_shuf2]
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]    ; fourtap_filter_hb expands to r11 under PIC
%endif
    mova      m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+r5]

.nextrow:
    movu      m0, [r2-1]                   ; 16 source bytes starting at src-1
    mova      m1, m0
    pshufb    m0, m3                       ; outer-tap byte pairs
    pshufb    m1, m4                       ; inner-tap byte pairs
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    paddsw    m0, m2                       ; rounding
    paddsw    m0, m1
    psraw     m0, 7
    packuswb  m0, m0                       ; clip and word->bytes
    movh    [r0], m0                       ; store 8 pixels

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                           ; next row; height is an int, so count in 32 bits
    jg .nextrow
    REP_RET

416 | |

; 8-pixel-wide block, horizontal-only 6-tap subpel filter (SSSE3).
; After cglobal: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;                r4 = number of rows, r5 = mx (selects the filter).
; Loop constants: m3/m4 = first two pshufb masks (the third is used from memory),
;                 m5 = (F0,F5), m6 = (F1,F2), m7 = (F3,F4) byte taps.
cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8
    lea      r5d, [r5*3]                   ; mx*3; scaled by 8 below -> 48 bytes per filter
    mova      m3, [filter_h6_shuf1]
    mova      m4, [filter_h6_shuf2]
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]     ; sixtap_filter_hb expands to r11 under PIC
%endif
    mova      m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+r5*8-32]
    mova      m7, [sixtap_filter_hb+r5*8-16]

.nextrow:
    movu      m0, [r2-2]                   ; 16 source bytes starting at src-2
    mova      m1, m0
    mova      m2, m0
    pshufb    m0, m3                       ; byte pairs (i, i+5)
    pshufb    m1, m4                       ; byte pairs (i+1, i+2)
    pshufb    m2, [filter_h6_shuf3]        ; byte pairs (i+3, i+4)
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    paddsw    m0, [pw_64]                  ; rounding
    psraw     m0, 7
    packuswb  m0, m0                       ; clip and word->bytes
    movh    [r0], m0                       ; store 8 pixels

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                           ; next row; height is an int, so count in 32 bits
    jg .nextrow
    REP_RET

451 | |

; Generates the vertical-only 4-tap and 6-tap subpel filters.
;   %1 = suffix of the generated function names (instruction set)
;   %2 = block width in pixels, as used in the function names
;   %3 = third cglobal argument (number of XMM registers)
; In both functions: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;                    r4 = number of rows, r6 = my (selects the filter).
%macro FILTER_V 3
; %2-pixel-wide block, V-only 4-tap filter
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
    shl      r6d, 5                        ; my*32
%ifdef PIC
    lea      r11, [fourtap_filter_v_m]     ; fourtap_filter_v expands to r11 under PIC
%endif
    lea       r6, [fourtap_filter_v+r6-32] ; r6 -> tap rows of the selected filter
    mova      m6, [pw_64]
    pxor      m7, m7
    mova      m5, [r6+48]                  ; last tap (F3) stays in a register

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [r2+2*r3]                ; read new row
    punpcklbw m4, m7
    mova      m3, m4                       ; keep the unscaled new row for the next iteration
    pmullw    m0, [r6+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps (each row also slides up one register)
    mova      m0, m1
    pmullw    m1, [r6+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh    [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                           ; next row; height is an int, so count in 32 bits
    jg .nextrow
    REP_RET


; %2-pixel-wide block, V-only 6-tap filter
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
    shl      r6d, 4
    lea       r6, [r6*3]                   ; my*48
%ifdef PIC
    lea      r11, [sixtap_filter_v_m]      ; sixtap_filter_v expands to r11 under PIC
%endif
    lea       r6, [sixtap_filter_v+r6-96]  ; r6 -> tap rows of the selected filter
    pxor      m7, m7

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [r6+16]
    mova      m6, m4
    pmullw    m6, [r6+64]
    paddsw    m6, m5

    ; then calculate positive taps (each row also slides up one register)
    movh      m5, [r2+2*r3]                ; read new row
    punpcklbw m5, m7
    pmullw    m0, [r6+0]
    paddsw    m6, m0
    mova      m0, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [r6+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [r6+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh    [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                           ; next row; height is an int, so count in 32 bits
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_V mmxext, 4, 0
INIT_XMM
FILTER_V sse2,   8, 8

575 | |

; 8-pixel-wide block, vertical-only 4-tap subpel filter (SSSE3).
; After cglobal: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;                r4 = number of rows, r6 = my (selects the filter).
; Loop constants: m5 = (F0,F3) byte taps, m6 = (F1,F2) byte taps,
;                 m7 = rounding term (pw_64).
; m0..m2 carry the three most recent source rows as bytes between iterations.
cglobal put_vp8_epel8_v4_ssse3, 7, 7, 8
    shl      r6d, 4                        ; my*16: offset of the filter's second table row
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]    ; fourtap_filter_hb expands to r11 under PIC
%endif
    mova      m5, [fourtap_filter_hb+r6-16]
    mova      m6, [fourtap_filter_hb+r6]
    mova      m7, [pw_64]

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3

.nextrow:
    movh      m3, [r2+2*r3]                ; read new row
    mova      m4, m0
    mova      m0, m1                       ; slide the row window up by one
    punpcklbw m4, m3                       ; interleave first and new row (outer taps)
    punpcklbw m1, m2                       ; interleave the two middle rows (inner taps)
    pmaddubsw m4, m5
    pmaddubsw m1, m6
    paddsw    m4, m1
    mova      m1, m2
    paddsw    m4, m7                       ; rounding
    mova      m2, m3
    psraw     m4, 7
    packuswb  m4, m4                       ; clip and word->bytes
    movh    [r0], m4                       ; store 8 pixels

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                           ; next row; height is an int, so count in 32 bits
    jg .nextrow
    REP_RET

614 | |

; 8-pixel-wide block, vertical-only 6-tap subpel filter (SSSE3).
; After cglobal: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;                r4 = number of rows, r6 = my (selects the filter).
; r6 is turned into a pointer just past the selected filter's three table rows:
; [r6-48] = (F0,F5), [r6-32] = (F1,F2), [r6-16] = (F3,F4) byte taps.
; m0..m4 carry the five most recent source rows as bytes between iterations.
cglobal put_vp8_epel8_v6_ssse3, 7, 7, 8
    lea      r6d, [r6*3]                   ; my*3; scaled by 8 below -> 48 bytes per filter
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]     ; sixtap_filter_hb expands to r11 under PIC
%endif
    lea       r6, [sixtap_filter_hb+r6*8]

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]

.nextrow:
    movh      m5, [r2+2*r3]                ; read new row
    mova      m6, m0
    punpcklbw m6, m5                       ; interleave first and new row
    mova      m0, m1                       ; slide the row window up by one
    punpcklbw m1, m2                       ; interleave rows 2 and 3 of the window
    mova      m7, m3
    punpcklbw m7, m4                       ; interleave rows 4 and 5 of the window
    pmaddubsw m6, [r6-48]
    pmaddubsw m1, [r6-32]
    pmaddubsw m7, [r6-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2
    paddsw    m6, [pw_64]                  ; rounding
    mova      m2, m3
    psraw     m6, 7
    mova      m3, m4
    packuswb  m6, m6                       ; clip and word->bytes
    mova      m4, m5
    movh    [r0], m6                       ; store 8 pixels

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                           ; next row; height is an int, so count in 32 bits
    jg .nextrow
    REP_RET

661 | |

; Generates the vertical and horizontal bilinear filters (word arithmetic).
;   %1 = suffix of the generated function names (instruction set)
;   %2 = block width in pixels, as used in the function names
;   %3 = third cglobal argument (number of XMM registers)
; In both functions: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;                    r4 = number of rows, r5 = mx, r6 = my.
; Two output rows are produced per loop iteration.
; The result is (a*(8-f) + b*f) >> 2 followed by pavgw against zero, which adds
; the final rounding and shifts right by one more bit.
%macro FILTER_BILINEAR 3
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
    mov      r5d, 8*16
    shl      r6d, 4
    sub      r5d, r6d                      ; r5 = (8-my)*16, r6 = my*16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]   ; bilinear_filter_vw expands to r11 under PIC
%endif
    pxor      m6, m6
    mova      m4, [bilinear_filter_vw+r5-16] ; weight 8-my
    mova      m5, [bilinear_filter_vw+r6-16] ; weight my
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m3, [r2+r3*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1                       ; middle row feeds both output rows
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2                       ; both rows in one register
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2                        ; height is an int, so count in 32 bits
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
    mov      r6d, 8*16
    shl      r5d, 4
    sub      r6d, r5d                      ; r6 = (8-mx)*16, r5 = mx*16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]   ; bilinear_filter_vw expands to r11 under PIC
%endif
    pxor      m6, m6
    mova      m4, [bilinear_filter_vw+r6-16] ; weight 8-mx
    mova      m5, [bilinear_filter_vw+r5-16] ; weight mx
.nextrow:
    movh      m0, [r2+r3*0+0]
    movh      m1, [r2+r3*0+1]              ; same row, one pixel to the right
    movh      m2, [r2+r3*1+0]
    movh      m3, [r2+r3*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2                       ; both rows in one register
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2                        ; height is an int, so count in 32 bits
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_BILINEAR mmxext, 4, 0
INIT_XMM
FILTER_BILINEAR   sse2, 8, 7

759 | |

; 8-pixel-wide vertical bilinear filter (SSSE3), two output rows per iteration.
; After cglobal: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;                r4 = number of rows, r6 = my.
; Loop constants: m3 = byte weight pairs (8-my, my), m4 = 0.
cglobal put_vp8_bilinear8_v_ssse3, 7,7,5
    shl      r6d, 4                        ; my*16
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]   ; bilinear_filter_vb expands to r11 under PIC
%endif
    pxor      m4, m4
    mova      m3, [bilinear_filter_vb+r6-16]
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m2, [r2+r3*2]
    punpcklbw m0, m1                       ; interleave rows 0 and 1
    punpcklbw m1, m2                       ; interleave rows 1 and 2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4                       ; average with zero: round and shift one more bit
    pavgw     m1, m4
    packuswb  m0, m1                       ; both rows in one register
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2                        ; height is an int, so count in 32 bits
    jg .nextrow
    REP_RET

788 | |

; 8-pixel-wide horizontal bilinear filter (SSSE3), two output rows per iteration.
; After cglobal: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;                r4 = number of rows, r5 = mx.
; Loop constants: m2 = shuffle pairing each byte with its right neighbour,
;                 m3 = byte weight pairs (8-mx, mx), m4 = 0.
cglobal put_vp8_bilinear8_h_ssse3, 7,7,5
    shl      r5d, 4                        ; mx*16
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]   ; bilinear_filter_vb expands to r11 under PIC
%endif
    pxor      m4, m4
    mova      m2, [filter_h2_shuf]
    mova      m3, [bilinear_filter_vb+r5-16]
.nextrow:
    movu      m0, [r2+r3*0]
    movu      m1, [r2+r3*1]
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4                       ; average with zero: round and shift one more bit
    pavgw     m1, m4
    packuswb  m0, m1                       ; both rows in one register
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2                        ; height is an int, so count in 32 bits
    jg .nextrow
    REP_RET

817 | |

; Straight copy of an 8-byte-wide block, two rows per iteration (MMX).
; After cglobal: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;                r4 = number of rows.
cglobal put_vp8_pixels8_mmx, 5,5
.copy_rows:
    ; fetch both source rows before writing anything
    movq  mm0, [r2+r3*0]
    movq  mm1, [r2+r3*1]
    movq [r0+r1*0], mm0
    movq [r0+r1*1], mm1
    ; advance both pointers by two rows (lea leaves the flags alone)
    lea    r2, [r2+r3*2]
    lea    r0, [r0+r1*2]
    sub   r4d, 2
    jg .copy_rows
    REP_RET

829 | |

; Straight copy of a 16-byte-wide block, two rows per iteration, using four
; 8-byte MMX moves per iteration.
; After cglobal: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;                r4 = number of rows.
cglobal put_vp8_pixels16_mmx, 5,5
.copy_rows:
    ; fetch both source rows before writing anything
    movq  mm0, [r2+r3*0+0]
    movq  mm1, [r2+r3*0+8]
    movq  mm2, [r2+r3*1+0]
    movq  mm3, [r2+r3*1+8]
    movq [r0+r1*0+0], mm0
    movq [r0+r1*0+8], mm1
    movq [r0+r1*1+0], mm2
    movq [r0+r1*1+8], mm3
    ; advance both pointers by two rows (lea leaves the flags alone)
    lea    r2, [r2+r3*2]
    lea    r0, [r0+r1*2]
    sub   r4d, 2
    jg .copy_rows
    REP_RET

845 | |

; Straight copy of a 16-byte-wide block, two rows per iteration (SSE).
; Source rows are read with unaligned loads; destination rows are written with
; aligned stores (movaps), so dst rows must be 16-byte aligned.
; After cglobal: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;                r4 = number of rows.
cglobal put_vp8_pixels16_sse, 5,5,2
.copy_rows:
    ; fetch both source rows before writing anything
    movups xmm0, [r2+r3*0]
    movups xmm1, [r2+r3*1]
    movaps [r0+r1*0], xmm0
    movaps [r0+r1*1], xmm1
    ; advance both pointers by two rows (lea leaves the flags alone)
    lea      r2, [r2+r3*2]
    lea      r0, [r0+r1*2]
    sub     r4d, 2
    jg .copy_rows
    REP_RET

857 | |

858 |
;----------------------------------------------------------------------------- |

859 |
; IDCT functions: |

860 |
; |

861 |
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |

862 |
;----------------------------------------------------------------------------- |

863 | |

; DC-only IDCT: adds (block[0] + 4) >> 3 to a 4x4 block of pixels (MMX).
; After cglobal: r0 = dst, r1 = block, r2 = stride. r1 is reused as dst+2*stride.
; The signed DC is applied as two unsigned saturating steps: mm0 holds the DC
; clamped to 0..255 (added), mm1 holds -DC clamped to 0..255 (subtracted).
cglobal vp8_idct_dc_add_mmx, 3, 3
    ; dc = (block[0] + 4) >> 3
    movd      mm0, [r1]
    paddw     mm0, [pw_4]
    psraw     mm0, 3

    ; mm1 = -dc
    pxor      mm1, mm1
    psubw     mm1, mm0

    ; clamp both to unsigned bytes and replicate into the low four bytes
    packuswb  mm0, mm0
    packuswb  mm1, mm1
    punpcklbw mm0, mm0
    punpcklbw mm1, mm1
    punpcklwd mm0, mm0
    punpcklwd mm1, mm1

    ; load the four destination rows
    lea        r1, [r0+r2*2]
    movd      mm2, [r0]
    movd      mm3, [r0+r2]
    movd      mm4, [r1]
    movd      mm5, [r1+r2]

    ; dst = sat(sat(dst + max(dc,0)) - max(-dc,0))
    paddusb   mm2, mm0
    paddusb   mm3, mm0
    paddusb   mm4, mm0
    paddusb   mm5, mm0
    psubusb   mm2, mm1
    psubusb   mm3, mm1
    psubusb   mm4, mm1
    psubusb   mm5, mm1

    ; write the rows back
    movd  [r0],    mm2
    movd  [r0+r2], mm3
    movd  [r1],    mm4
    movd  [r1+r2], mm5
    RET

899 | |

; DC-only IDCT: adds (block[0] + 4) >> 3 to a 4x4 block of pixels (SSE4).
; After cglobal: r0 = dst, r1 = block, r2 = stride. r1 is reused as dst+2*stride.
; All four rows are widened to words, summed with the broadcast DC and packed
; back with unsigned saturation in a single register.
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
    ; load the coefficient, then repurpose r1 as the pointer to rows 2/3
    movd       xmm0, [r1]
    lea          r1, [r0+r2*2]
    pxor       xmm1, xmm1                  ; zero, for byte->word widening
    movq       xmm2, [pw_4]

    ; dc = (block[0] + 4) >> 3, interleaved with the destination row loads
    paddw      xmm0, xmm2
    movd       xmm2, [r0]
    movd       xmm3, [r0+r2]
    movd       xmm4, [r1]
    movd       xmm5, [r1+r2]
    psraw      xmm0, 3
    pshuflw    xmm0, xmm0, 0               ; dc in the low four words
    punpcklqdq xmm0, xmm0                  ; dc in all eight words

    ; rows 0/1 -> xmm2, rows 2/3 -> xmm4, as words
    punpckldq  xmm2, xmm3
    punpckldq  xmm4, xmm5
    punpcklbw  xmm2, xmm1
    punpcklbw  xmm4, xmm1

    ; add dc, clip to bytes: xmm2 = rows 0..3, one dword per row
    paddw      xmm2, xmm0
    paddw      xmm4, xmm0
    packuswb   xmm2, xmm4

    ; write the rows back
    movd   [r0],    xmm2
    pextrd [r0+r2], xmm2, 1
    pextrd [r1],    xmm2, 2
    pextrd [r1+r2], xmm2, 3
    RET

928 | |

929 |
;----------------------------------------------------------------------------- |

930 |
; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |

931 |
;----------------------------------------------------------------------------- |

932 | |

; In-place rotation used by the IDCT:
;   %1 = mul_35468(%1) - mul_20091(%2)
;   %2 = mul_20091(%1) + mul_35468(%2)      (using the incoming %1 and %2)
; %3 and %4 are scratch registers and are clobbered.
; Requires m6 = words of 20091 and m7 = words of 17734. mul_20091(x) is formed
; as x + pmulhw(x, 20091); mul_35468(x) as pmulhw(2*x, 17734).
%macro VP8_MULTIPLY_SUMSUB 4
    mova      %3, %1
    mova      %4, %2
    pmulhw    %3, m6                       ; high half of 20091 * %1
    pmulhw    %4, m6                       ; high half of 20091 * %2
    paddw     %3, %1                       ; %3 = mul_20091(%1)
    paddw     %4, %2                       ; %4 = mul_20091(%2)
    paddw     %1, %1                       ; double the inputs so that 17734
    paddw     %2, %2                       ; stands in for 35468
    pmulhw    %1, m7                       ; %1 = mul_35468(%1)
    pmulhw    %2, m7                       ; %2 = mul_35468(%2)
    psubw     %1, %4
    paddw     %2, %3
%endmacro

949 | |

; One 1-D pass of the 4x4 IDCT over register numbers %1..%4:
;   x0 = %1 + %3                             x1 = %1 - %3
;   x2 = mul_35468(%2) - mul_20091(%4)       x3 = mul_20091(%2) + mul_35468(%4)
;   %1 = x0 + x3 (tmp0)    %2 = x1 + x2 (tmp1)
;   %3 = x1 - x2 (tmp2)    %4 = x0 - x3 (tmp3)
; %5 and %6 are scratch register numbers.
; Requires m6 = words of 20091 and m7 = words of 17734.
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA           m%3, m%1, m%5       ; t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5, m%6  ; t2, t3
    SUMSUB_BA           m%4, m%3, m%5       ; tmp0, tmp3
    SUMSUB_BA           m%2, m%1, m%5       ; tmp1, tmp2
    ; rename registers so the results end up under numbers %1..%4 in order
    SWAP                 %4,  %1
    SWAP                 %4,  %3
%endmacro

963 | |

; Full 4x4 IDCT with the result added to the destination pixels (MMX).
; After cglobal: r0 = dst, r1 = block, r2 = stride. r1 is reused as dst+2*stride.
INIT_MMX
cglobal vp8_idct_add_mmx, 3, 3
    ; multiplier constants expected by the transform macro
    movq       m6, [pw_20091]
    movq       m7, [pw_17734]

    ; load the four rows of coefficients
    movq       m0, [r1]
    movq       m1, [r1+8]
    movq       m2, [r1+16]
    movq       m3, [r1+24]

    ; two 1-D passes with a transpose after each; the rounding term (pw_4) is
    ; added to m0 before the second pass
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4
    paddw      m0, [pw_4]
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4

    ; store: rows 0/1 at r0, rows 2/3 at r1 (m6/m7 are free to be scratch now)
    pxor       m4, m4
    lea        r1, [r0+2*r2]
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2

    RET

988 | |

989 |
;----------------------------------------------------------------------------- |

990 |
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) |

991 |
;----------------------------------------------------------------------------- |

992 | |

; Stores word number %1 of m0, m1, m2 and m3 to [r0], [r0+32], [r0+64] and
; [r0+96] respectively (a step of 2*16 bytes). Clobbers r1 and r2.
%macro SCATTER_WHT 1
    ; words from m0/m1
    pextrw r1d, m0, %1
    pextrw r2d, m1, %1
    mov [r0+2*16*0], r1w
    mov [r0+2*16*1], r2w
    ; words from m2/m3
    pextrw r1d, m2, %1
    pextrw r2d, m3, %1
    mov [r0+2*16*2], r1w
    mov [r0+2*16*3], r2w
%endmacro

1003 | |

; One 1-D pass of a 4-point Hadamard transform over register numbers %1..%4:
; two sum/difference stages followed by a register renaming.
%macro HADAMARD4_1D 4
    SUMSUB_BADC m%2, m%1, m%4, m%3
    SUMSUB_BADC m%4, m%2, m%3, m%1
    SWAP %1, %4, %3
%endmacro

1009 | |

; Inverse Walsh-Hadamard transform of the 16 luma DC values (MMXEXT).
; After cglobal: r0 = block (output), r1 = dc (input). r1 and r2 serve as
; scratch once the input has been loaded.
; The 16 results are scattered 32 bytes apart: SCATTER_WHT writes four of them,
; then r0 advances by 2*16*4 bytes for the next four.
INIT_MMX
cglobal vp8_luma_dc_wht_mmxext, 2,3
    ; load the 4x4 input words
    movq          m0, [r1]
    movq          m1, [r1+8]
    movq          m2, [r1+16]
    movq          m3, [r1+24]

    ; first pass, transpose, add rounding term (pw_3), second pass
    HADAMARD4_1D  0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw         m0, [pw_3]
    HADAMARD4_1D  0, 1, 2, 3

    ; scale down
    psraw         m0, 3
    psraw         m1, 3
    psraw         m2, 3
    psraw         m3, 3

    ; scatter word k of every register, k = 0..3
    SCATTER_WHT   0
    add           r0, 2*16*4
    SCATTER_WHT   1
    add           r0, 2*16*4
    SCATTER_WHT   2
    add           r0, 2*16*4
    SCATTER_WHT   3
    RET