## ffmpeg / libavcodec / x86 / vp8dsp.asm @ 004cda8e

History | View | Annotate | Download (25.8 KB)

1 |
;****************************************************************************** |
---|---|

2 |
;* VP8 MMXEXT optimizations |

3 |
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> |

4 |
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> |

5 |
;* |

6 |
;* This file is part of FFmpeg. |

7 |
;* |

8 |
;* FFmpeg is free software; you can redistribute it and/or |

9 |
;* modify it under the terms of the GNU Lesser General Public |

10 |
;* License as published by the Free Software Foundation; either |

11 |
;* version 2.1 of the License, or (at your option) any later version. |

12 |
;* |

13 |
;* FFmpeg is distributed in the hope that it will be useful, |

14 |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |

15 |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |

16 |
;* Lesser General Public License for more details. |

17 |
;* |

18 |
;* You should have received a copy of the GNU Lesser General Public |

19 |
;* License along with FFmpeg; if not, write to the Free Software |

20 |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |

21 |
;****************************************************************************** |

22 | |

23 |
%include "x86inc.asm" |

24 |
%include "x86util.asm" |

25 | |

26 |
SECTION_RODATA |

27 | |

28 |
; Horizontal filter taps stored as signed words, pre-paired for pmaddwd.
; Every row is one coefficient pair repeated four times (16 bytes).
;
; 4-tap: two consecutive rows per filter, {F0,F1} then {F2,F3}.
fourtap_filter_hw_m: times 4 dw  -6, 123
                     times 4 dw  12,  -1
                     times 4 dw  -9,  93
                     times 4 dw  50,  -6
                     times 4 dw  -6,  50
                     times 4 dw  93,  -9
                     times 4 dw  -1,  12
                     times 4 dw 123,  -6

; 6-tap: three consecutive rows per filter, {F0,F1}, {F2,F3}, {F4,F5}.
sixtap_filter_hw_m:  times 4 dw   2, -11
                     times 4 dw 108,  36
                     times 4 dw  -8,   1
                     times 4 dw   3, -16
                     times 4 dw  77,  77
                     times 4 dw -16,   3
                     times 4 dw   1,  -8
                     times 4 dw  36, 108
                     times 4 dw -11,   2

46 | |

47 |
; Filter taps stored as signed bytes, pre-paired for pmaddubsw.
; Every row is one coefficient pair repeated eight times (16 bytes).
;
; 4-tap: two rows per filter. The pairing is outer taps {F0,F3} then inner
; taps {F1,F2}, matching how the SSSE3 code pairs pixels before multiplying.
fourtap_filter_hb_m: times 8 db  -6,  -1
                     times 8 db 123,  12
                     times 8 db  -9,  -6
                     times 8 db  93,  50
                     times 8 db  -6,  -9
                     times 8 db  50,  93
                     times 8 db  -1,  -6
                     times 8 db  12, 123

; 6-tap: three rows per filter, paired as {F0,F5}, {F1,F2}, {F3,F4}.
sixtap_filter_hb_m:  times 8 db   2,   1
                     times 8 db -11, 108
                     times 8 db  36,  -8
                     times 8 db   3,   3
                     times 8 db -16,  77
                     times 8 db  77, -16
                     times 8 db   1,   2
                     times 8 db  -8,  36
                     times 8 db 108, -11

65 | |

66 |
; Vertical filter taps for pmullw: each row is a single tap broadcast to
; eight words (16 bytes).
;
; 4-tap: four consecutive rows (F0..F3) per filter.
fourtap_filter_v_m:  times 8 dw  -6
                     times 8 dw 123
                     times 8 dw  12
                     times 8 dw  -1
                     times 8 dw  -9
                     times 8 dw  93
                     times 8 dw  50
                     times 8 dw  -6
                     times 8 dw  -6
                     times 8 dw  50
                     times 8 dw  93
                     times 8 dw  -9
                     times 8 dw  -1
                     times 8 dw  12
                     times 8 dw 123
                     times 8 dw  -6

; 6-tap: six consecutive rows (F0..F5) per filter.
sixtap_filter_v_m:   times 8 dw   2
                     times 8 dw -11
                     times 8 dw 108
                     times 8 dw  36
                     times 8 dw  -8
                     times 8 dw   1
                     times 8 dw   3
                     times 8 dw -16
                     times 8 dw  77
                     times 8 dw  77
                     times 8 dw -16
                     times 8 dw   3
                     times 8 dw   1
                     times 8 dw  -8
                     times 8 dw  36
                     times 8 dw 108
                     times 8 dw -11
                     times 8 dw   2

101 | |

102 |
; Bilinear weights as words: row k-1 holds the weight k (1..7) broadcast to
; eight words. Callers fetch both k and 8-k from this one table.
bilinear_filter_vw_m: times 8 dw 1
                      times 8 dw 2
                      times 8 dw 3
                      times 8 dw 4
                      times 8 dw 5
                      times 8 dw 6
                      times 8 dw 7

; Bilinear weights as byte pairs for pmaddubsw: row k-1 holds {8-k, k}
; repeated eight times.
bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 6, 2
                      times 8 db 5, 3
                      times 8 db 4, 4
                      times 8 db 3, 5
                      times 8 db 2, 6
                      times 8 db 1, 7

117 | |

118 |
; Table aliases used in [table+index] operands.
; PIC build: every alias is r11, which each function loads with the address of
; the table it needs (lea r11, [<table>_m]) before the first indexed access.
; Non-PIC build: the alias is simply the table label itself.
%ifdef PIC
%define fourtap_filter_hw    r11
%define sixtap_filter_hw     r11
%define fourtap_filter_hb    r11
%define sixtap_filter_hb     r11
%define fourtap_filter_v     r11
%define sixtap_filter_v      r11
%define bilinear_filter_vw   r11
%define bilinear_filter_vb   r11
%else
%define fourtap_filter_hw    fourtap_filter_hw_m
%define sixtap_filter_hw     sixtap_filter_hw_m
%define fourtap_filter_hb    fourtap_filter_hb_m
%define sixtap_filter_hb     sixtap_filter_hb_m
%define fourtap_filter_v     fourtap_filter_v_m
%define sixtap_filter_v      sixtap_filter_v_m
%define bilinear_filter_vw   bilinear_filter_vw_m
%define bilinear_filter_vb   bilinear_filter_vb_m
%endif

137 | |

138 |
; pshufb masks that gather source bytes into the pairs expected by the
; byte-pair coefficient tables (indices are byte offsets into the loaded row).
; h2: neighbouring pixels (n, n+1) for the bilinear filter.
filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4,  5,  5,  6,  6,  7,  7,  8
; h4: outer taps of the 4-tap filter (n, n+3).
filter_h4_shuf:  db 0, 3, 1, 4, 2, 5, 3, 6, 4,  7,  5,  8,  6,  9,  7, 10

; h6 shuf1: outer taps of the 6-tap filter (n, n+5).
; h6 shuf2: pairs (n+1, n+2); also used for the inner taps of the 4-tap filter.
; h6 shuf3: pairs (n+3, n+4).
filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4,  9,  5, 10,  6, 11,  7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5,  6,  6,  7,  7,  8,  8,  9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7,  8,  8,  9,  9, 10, 10, 11

; Word constants defined in another object file.
cextern pw_3
cextern pw_4
cextern pw_64

148 | |

149 |
SECTION .text |

150 | |

151 |
;----------------------------------------------------------------------------- |

152 |
; subpel MC functions: |

153 |
; |

154 |
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride, |

155 |
; uint8_t *src, int srcstride, |

156 |
; int height, int mx, int my); |

157 |
;----------------------------------------------------------------------------- |

158 | |

159 |
; 4-pixel-wide block, horizontal-only 4-tap filter; the row count comes from
; the height argument.
; Arguments (see prototype above): r0 = dst, r1 = dst stride, r2 = src,
; r3 = src stride, r4 = height, r5 = mx.
; Loop invariants: mm4 = F0/F1 word pairs, mm5 = F2/F3 word pairs, mm6 = 0,
; mm7 = pw_64 (rounding term added before the >>7).
; NOTE(review): the strides are declared int but are used as full-width
; registers; confirm callers pass them sign-extended on 64-bit targets.
cglobal put_vp8_epel4_h4_mmxext, 6, 6
    shl       r5d, 4                       ; mx*16: byte offset of the filter's 2nd row
%ifdef PIC
    lea       r11, [fourtap_filter_hw_m]
%endif
    movq      mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+r5]
    movq      mm7, [pw_64]
    pxor      mm6, mm6

.nextrow:
    movq      mm1, [r2-1]                  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm6                     ; byte->word ABCD
    pshufw    mm0, mm2, 9                  ; byte CDEF..
    punpcklbw mm0, mm6                     ; byte->word CDEF
    pshufw    mm3, mm1, 0x94               ; word ABBC
    pshufw    mm1, mm0, 0x94               ; word CDDE
    pmaddwd   mm3, mm4                     ; multiply 2px with F0/F1
    movq      mm0, mm1                     ; backup for second set of pixels
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm3, mm1                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6                     ; byte->word EFGH
    pmaddwd   mm0, mm4                     ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm0, mm1                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0                     ; merge dword->word (4px)
    paddsw    mm3, mm7                     ; rounding
    psraw     mm3, 7
    packuswb  mm3, mm6                     ; clip and word->bytes
    movd     [r0], mm3                     ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec       r4d                          ; height is an int: count in 32 bits
    jg .nextrow
    REP_RET

205 | |

206 |
; 4-pixel-wide block, horizontal-only 6-tap filter; the row count comes from
; the height argument.
; Arguments (see prototype above): r0 = dst, r1 = dst stride, r2 = src,
; r3 = src stride, r4 = height, r5 = mx.
; Loop invariants: mm4 = F0/F1, mm5 = F2/F3, mm6 = F4/F5 word pairs,
; mm7 = pw_64. mm3 doubles as the zero register and as scratch, so it is
; re-zeroed inside the loop before the second unpack.
cglobal put_vp8_epel4_h6_mmxext, 6, 6
    lea       r5d, [r5*3]                  ; mx*3; scaled by 8 below => mx*24
%ifdef PIC
    lea       r11, [sixtap_filter_hw_m]
%endif
    movq      mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+r5*8-32]
    movq      mm6, [sixtap_filter_hw+r5*8-16]
    movq      mm7, [pw_64]
    pxor      mm3, mm3

.nextrow:
    movq      mm1, [r2-2]                  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm3                     ; byte->word ABCD
    pshufw    mm0, mm2, 0x9                ; byte CDEF..
    punpckhbw mm2, mm3                     ; byte->word EFGH
    punpcklbw mm0, mm3                     ; byte->word CDEF
    pshufw    mm1, mm1, 0x94               ; word ABBC
    pshufw    mm2, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm4                     ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94               ; word CDDE
    movq      mm0, mm3                     ; backup for second set of pixels
    pmaddwd   mm3, mm5                     ; multiply 2px with F2/F3
    paddd     mm1, mm3                     ; add to 1st 2px cache
    movq      mm3, mm2                     ; backup for second set of pixels
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm1, mm2                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [r2+3]                  ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4                     ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5                     ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3                     ; add to 2nd 2px cache
    pxor      mm3, mm3                     ; mm3 was used as scratch: zero it again
    punpcklbw mm2, mm3                     ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9               ; word GHHI
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm0, mm2                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0                     ; merge dword->word (4px)
    paddsw    mm1, mm7                     ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3                     ; clip and word->bytes
    movd     [r0], mm1                     ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec       r4d                          ; height is an int: count in 32 bits
    jg .nextrow
    REP_RET

262 | |

263 |
; 8-pixel-wide block, horizontal-only 4-tap filter (SSE2).
; Arguments (see prototype above): r0 = dst, r1 = dst stride, r2 = src,
; r3 = src stride, r4 = height, r5 = mx.
; Loop invariants: m5 = F0/F1 word pairs, m6 = F2/F3 word pairs, m7 = 0.
; Each row is produced as two halves of four pixels; letters in the comments
; name consecutive source pixels starting at src-1 (A).
INIT_XMM
cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
    shl      r5d, 4                        ; mx*16: byte offset of the filter's 2nd row
%ifdef PIC
    lea      r11, [fourtap_filter_hw_m]
%endif
    mova      m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    mova      m6, [fourtap_filter_hw+r5]
    pxor      m7, m7

.nextrow:
    ; output pixels 0-3
    movh      m0, [r2-1]
    punpcklbw m0, m7                       ; ABCDEFGH
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2                        ; BCDEFGH
    psrldq    m2, 4                        ; CDEFGH
    psrldq    m3, 6                        ; DEFGH
    punpcklwd m0, m1                       ; ABBCCDDE
    punpcklwd m2, m3                       ; CDDEEFFG
    pmaddwd   m0, m5
    pmaddwd   m2, m6
    paddd     m0, m2

    ; output pixels 4-7: same steps on the row shifted by four pixels
    movh      m1, [r2+3]
    punpcklbw m1, m7                       ; EFGHIJKL
    mova      m2, m1
    mova      m3, m1
    mova      m4, m1
    psrldq    m2, 2                        ; FGHIJKL
    psrldq    m3, 4                        ; GHIJKL
    psrldq    m4, 6                        ; HIJKL
    punpcklwd m1, m2                       ; EFFGGHHI
    punpcklwd m3, m4                       ; GHHIIJJK
    pmaddwd   m1, m5
    pmaddwd   m3, m6
    paddd     m1, m3

    ; merge halves, round, shift, clip to bytes, store 8 pixels
    packssdw  m0, m1
    paddsw    m0, [pw_64]
    psraw     m0, 7
    packuswb  m0, m7
    movh    [r0], m0                       ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                           ; height is an int: count in 32 bits
    jg .nextrow
    REP_RET

315 | |

316 |
; 8-pixel-wide block, horizontal-only 6-tap filter (SSE2).
; Arguments (see prototype above): r0 = dst, r1 = dst stride, r2 = src,
; r3 = src stride, r4 = height, r5 = mx.
; After setup r5 points 48 bytes past the selected filter, so the three
; coefficient rows are read at [r5-48], [r5-32] and [r5-16]. m7 = 0.
; Letters in the comments name consecutive source pixels starting at src-2 (A).
cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
    lea      r5d, [r5*3]                   ; mx*3; scaled by 8 below => mx*24
%ifdef PIC
    lea      r11, [sixtap_filter_hw_m]
%endif
    lea       r5, [sixtap_filter_hw+r5*8]
    pxor      m7, m7

.nextrow:
    ; output pixels 0-3
    movu      m0, [r2-2]                   ; 16 source bytes A..P
    mova      m6, m0                       ; copy kept for the second half
    mova      m4, m0
    punpcklbw m0, m7                       ; ABCDEFGH
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2                        ; BCDEFGH
    psrldq    m2, 4                        ; CDEFGH
    psrldq    m3, 6                        ; DEFGH
    psrldq    m4, 4                        ; bytes, starting at E
    punpcklbw m4, m7                       ; EFGHIJKL
    mova      m5, m4
    psrldq    m5, 2                        ; FGHIJKL
    punpcklwd m0, m1                       ; ABBCCDDE
    punpcklwd m2, m3                       ; CDDEEFFG
    punpcklwd m4, m5                       ; EFFGGHHI
    pmaddwd   m0, [r5-48]
    pmaddwd   m2, [r5-32]
    pmaddwd   m4, [r5-16]
    paddd     m0, m2
    paddd     m0, m4

    ; output pixels 4-7: same steps on the row shifted by four pixels
    psrldq    m6, 4                        ; bytes, starting at E
    mova      m4, m6
    punpcklbw m6, m7                       ; EFGHIJKL
    mova      m1, m6
    mova      m2, m6
    mova      m3, m6
    psrldq    m1, 2                        ; FGHIJKL
    psrldq    m2, 4                        ; GHIJKL
    psrldq    m3, 6                        ; HIJKL
    psrldq    m4, 4                        ; bytes, starting at I
    punpcklbw m4, m7                       ; IJKLMNOP (last four are zero fill)
    mova      m5, m4
    psrldq    m5, 2                        ; JKLMNOP
    punpcklwd m6, m1                       ; EFFGGHHI
    punpcklwd m2, m3                       ; GHHIIJJK
    punpcklwd m4, m5                       ; IJJKKLLM
    pmaddwd   m6, [r5-48]
    pmaddwd   m2, [r5-32]
    pmaddwd   m4, [r5-16]
    paddd     m6, m2
    paddd     m6, m4

    ; merge halves, round, shift, clip to bytes, store 8 pixels
    packssdw  m0, m6
    paddsw    m0, [pw_64]
    psraw     m0, 7
    packuswb  m0, m7
    movh    [r0], m0                       ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                           ; height is an int: count in 32 bits
    jg .nextrow
    REP_RET

382 | |

383 |
; 8-pixel-wide block, horizontal-only 4-tap filter (SSSE3).
; Arguments (see prototype above): r0 = dst, r1 = dst stride, r2 = src,
; r3 = src stride, r4 = height, r5 = mx.
; Loop invariants: m2 = pw_64, m3 = mask pairing pixels (n, n+3),
; m4 = mask pairing pixels (n+1, n+2), m5 = {F0,F3} byte pairs,
; m6 = {F1,F2} byte pairs.
cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7
    shl      r5d, 4                        ; mx*16: byte offset of the filter's 2nd row
    mova      m2, [pw_64]
    mova      m3, [filter_h4_shuf]
    mova      m4, [filter_h6_shuf2]
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+r5]

.nextrow:
    movu      m0, [r2-1]
    mova      m1, m0
    pshufb    m0, m3                       ; outer-tap pixel pairs
    pshufb    m1, m4                       ; inner-tap pixel pairs
    pmaddubsw m0, m5                       ; pixels (unsigned) * taps (signed)
    pmaddubsw m1, m6
    paddsw    m0, m2                       ; rounding
    paddsw    m0, m1
    psraw     m0, 7
    packuswb  m0, m0                       ; clip and word->bytes
    movh    [r0], m0                       ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                           ; height is an int: count in 32 bits
    jg .nextrow
    REP_RET

413 | |

414 |
; 8-pixel-wide block, horizontal-only 6-tap filter (SSSE3).
; Arguments (see prototype above): r0 = dst, r1 = dst stride, r2 = src,
; r3 = src stride, r4 = height, r5 = mx.
; Loop invariants: m3/m4 = shuffle masks 1 and 2 (mask 3 is read from memory
; each iteration because all eight registers are in use),
; m5 = {F0,F5}, m6 = {F1,F2}, m7 = {F3,F4} byte pairs.
cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8
    lea      r5d, [r5*3]                   ; mx*3; scaled by 8 below => mx*24
    mova      m3, [filter_h6_shuf1]
    mova      m4, [filter_h6_shuf2]
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]
%endif
    mova      m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+r5*8-32]
    mova      m7, [sixtap_filter_hb+r5*8-16]

.nextrow:
    movu      m0, [r2-2]
    mova      m1, m0
    mova      m2, m0
    pshufb    m0, m3                       ; pixel pairs (n,   n+5)
    pshufb    m1, m4                       ; pixel pairs (n+1, n+2)
    pshufb    m2, [filter_h6_shuf3]        ; pixel pairs (n+3, n+4)
    pmaddubsw m0, m5                       ; pixels (unsigned) * taps (signed)
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    paddsw    m0, [pw_64]                  ; rounding
    psraw     m0, 7
    packuswb  m0, m0                       ; clip and word->bytes
    movh    [r0], m0                       ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                           ; height is an int: count in 32 bits
    jg .nextrow
    REP_RET

448 | |

449 |
; FILTER_V opt, width, nregs
; Emits the vertical-only 4-tap and 6-tap subpel functions
; put_vp8_epel<width>_v4_<opt> and put_vp8_epel<width>_v6_<opt>.
;   %1 = instruction-set suffix used in the function name
;   %2 = block width used in the function name
;   %3 = third numeric argument handed to cglobal
; The register width (and so the pixels handled per row) comes from the
; INIT_MMX / INIT_XMM that precedes each instantiation.
; Arguments (see prototype above): r0 = dst, r1 = dst stride, r2 = src,
; r3 = src stride, r4 = height, r6 = my.
%macro FILTER_V 3
; %2-pixel-wide block, V-only 4-tap filter
; Source rows are kept unpacked to words in m0..m2 and rotated each iteration.
; r6 points at the filter's first tap row; m5 = 4th tap, m6 = pw_64, m7 = 0.
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
    shl      r6d, 5                        ; my*32
%ifdef PIC
    lea      r11, [fourtap_filter_v_m]
%endif
    lea       r6, [fourtap_filter_v+r6-32]
    mova      m6, [pw_64]
    pxor      m7, m7
    mova      m5, [r6+48]

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [r2+2*r3]                ; read new row
    punpcklbw m4, m7
    mova      m3, m4                       ; keep the new row for the next iteration
    pmullw    m0, [r6+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps
    mova      m0, m1                       ; rotate rows while multiplying
    pmullw    m1, [r6+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh    [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                           ; height is an int: count in 32 bits
    jg .nextrow
    REP_RET


; %2-pixel-wide block, V-only 6-tap filter
; Source rows are kept unpacked to words in m0..m4 and rotated each iteration.
; r6 points at the filter's first tap row; m7 = 0.
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
    shl      r6d, 4
    lea       r6, [r6*3]                   ; my*48
%ifdef PIC
    lea      r11, [sixtap_filter_v_m]
%endif
    lea       r6, [sixtap_filter_v+r6-96]
    pxor      m7, m7

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [r6+16]
    mova      m6, m4
    pmullw    m6, [r6+64]
    paddsw    m6, m5

    ; then calculate positive taps
    movh      m5, [r2+2*r3]                ; read new row
    punpcklbw m5, m7
    pmullw    m0, [r6+0]
    paddsw    m6, m0
    mova      m0, m1                       ; rotate rows while multiplying
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [r6+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [r6+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh    [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                           ; height is an int: count in 32 bits
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_V mmxext, 4, 0
INIT_XMM
FILTER_V sse2,   8, 8

572 | |

573 |
; 8-pixel-wide block, vertical-only 4-tap filter (SSSE3).
; Arguments (see prototype above): r0 = dst, r1 = dst stride, r2 = src,
; r3 = src stride, r4 = height, r6 = my.
; Rows are kept as bytes in m0..m2 and interleaved in pairs so pmaddubsw can
; apply two taps at once. m5 = first coefficient row (paired with the
; outermost rows), m6 = second coefficient row (paired with the two middle
; rows), m7 = pw_64.
cglobal put_vp8_epel8_v4_ssse3, 7, 7, 8
    shl      r6d, 4                        ; my*16: byte offset of the filter's 2nd row
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r6-16]
    mova      m6, [fourtap_filter_hb+r6]
    mova      m7, [pw_64]

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3

.nextrow:
    movh      m3, [r2+2*r3]                ; read new row
    mova      m4, m0
    mova      m0, m1                       ; rotate rows for the next iteration
    punpcklbw m4, m3                       ; oldest row interleaved with newest
    punpcklbw m1, m2                       ; the two middle rows interleaved
    pmaddubsw m4, m5
    pmaddubsw m1, m6
    paddsw    m4, m1
    mova      m1, m2
    paddsw    m4, m7                       ; rounding
    mova      m2, m3
    psraw     m4, 7
    packuswb  m4, m4                       ; clip and word->bytes
    movh    [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                           ; height is an int: count in 32 bits
    jg .nextrow
    REP_RET

611 | |

612 |
; 8-pixel-wide block, vertical-only 6-tap filter (SSSE3).
; Arguments (see prototype above): r0 = dst, r1 = dst stride, r2 = src,
; r3 = src stride, r4 = height, r6 = my.
; Rows are kept as bytes in m0..m4 and interleaved in pairs so pmaddubsw can
; apply two taps at once. After setup r6 points 48 bytes past the selected
; filter; its three coefficient rows are read at [r6-48], [r6-32], [r6-16].
cglobal put_vp8_epel8_v6_ssse3, 7, 7, 8
    lea      r6d, [r6*3]                   ; my*3; scaled by 8 below => my*24
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]
%endif
    lea       r6, [sixtap_filter_hb+r6*8]

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]

.nextrow:
    movh      m5, [r2+2*r3]                ; read new row
    mova      m6, m0
    punpcklbw m6, m5                       ; oldest row interleaved with newest
    mova      m0, m1                       ; rotate rows for the next iteration
    punpcklbw m1, m2                       ; rows 2 and 3 of the window
    mova      m7, m3
    punpcklbw m7, m4                       ; rows 4 and 5 of the window
    pmaddubsw m6, [r6-48]
    pmaddubsw m1, [r6-32]
    pmaddubsw m7, [r6-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2
    paddsw    m6, [pw_64]                  ; rounding
    mova      m2, m3
    psraw     m6, 7
    mova      m3, m4
    packuswb  m6, m6                       ; clip and word->bytes
    mova      m4, m5
    movh    [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                           ; height is an int: count in 32 bits
    jg .nextrow
    REP_RET

658 | |

659 |
; FILTER_BILINEAR opt, width, nregs
; Emits put_vp8_bilinear<width>_v_<opt> and put_vp8_bilinear<width>_h_<opt>.
;   %1 = instruction-set suffix (also selects the store sequence below)
;   %2 = block width used in the function name
;   %3 = third numeric argument handed to cglobal
; Both functions compute (a*(8-f) + b*f) with word multiplies, then >>2 and
; pavgw against zero, and write two output rows per loop iteration.
; NOTE(review): the loop counts down by 2, so an odd height would process one
; extra row - presumably callers only pass even heights; confirm.
%macro FILTER_BILINEAR 3
; Vertical: a = current row, b = row below. r6 = my; r5 is used as scratch.
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
    mov      r5d, 8*16
    shl      r6d, 4                        ; my*16
    sub      r5d, r6d                      ; (8-my)*16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m4, [bilinear_filter_vw+r5-16] ; weight 8-my
    mova      m5, [bilinear_filter_vw+r6-16] ; weight my
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m3, [r2+r3*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1                       ; middle row feeds both output rows
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6                       ; (x+1)>>1 completes the rounded shift
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2                       ; both rows in one register
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2                        ; height is an int: count in 32 bits
    jg .nextrow
    REP_RET

; Horizontal: a = pixel n, b = pixel n+1. r5 = mx; r6 is used as scratch.
cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
    mov      r6d, 8*16
    shl      r5d, 4                        ; mx*16
    sub      r6d, r5d                      ; (8-mx)*16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m4, [bilinear_filter_vw+r6-16] ; weight 8-mx
    mova      m5, [bilinear_filter_vw+r5-16] ; weight mx
.nextrow:
    movh      m0, [r2+r3*0+0]
    movh      m1, [r2+r3*0+1]
    movh      m2, [r2+r3*1+0]
    movh      m3, [r2+r3*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6                       ; (x+1)>>1 completes the rounded shift
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2                       ; both rows in one register
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2                        ; height is an int: count in 32 bits
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_BILINEAR mmxext, 4, 0
INIT_XMM
FILTER_BILINEAR   sse2, 8, 7

756 | |

757 |
; 8-pixel-wide vertical bilinear filter (SSSE3), two output rows per iteration.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4 = height, r6 = my.
; m3 = {8-my, my} byte pairs, m4 = 0.
; NOTE(review): the loop counts down by 2 - presumably height is always even.
cglobal put_vp8_bilinear8_v_ssse3, 7,7,5
    shl      r6d, 4                        ; my*16
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m3, [bilinear_filter_vb+r6-16]
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m2, [r2+r3*2]
    punpcklbw m0, m1                       ; row 0 interleaved with row 1
    punpcklbw m1, m2                       ; row 1 interleaved with row 2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4                       ; (x+1)>>1 completes the rounded shift
    pavgw     m1, m4
    packuswb  m0, m1                       ; both rows in one register
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2                        ; height is an int: count in 32 bits
    jg .nextrow
    REP_RET

785 | |

786 |
; 8-pixel-wide horizontal bilinear filter (SSSE3), two rows per iteration.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4 = height, r5 = mx.
; m2 = mask pairing pixels (n, n+1), m3 = {8-mx, mx} byte pairs, m4 = 0.
; Each row is fetched with a 16-byte unaligned load although the shuffle mask
; only references the first 9 bytes.
; NOTE(review): the loop counts down by 2 - presumably height is always even.
cglobal put_vp8_bilinear8_h_ssse3, 7,7,5
    shl      r5d, 4                        ; mx*16
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m2, [filter_h2_shuf]
    mova      m3, [bilinear_filter_vb+r5-16]
.nextrow:
    movu      m0, [r2+r3*0]
    movu      m1, [r2+r3*1]
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4                       ; (x+1)>>1 completes the rounded shift
    pavgw     m1, m4
    packuswb  m0, m1                       ; both rows in one register
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2                        ; height is an int: count in 32 bits
    jg .nextrow
    REP_RET

814 | |

815 |
; Straight copy of an 8-byte-wide block, two rows per iteration.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4 = row count.
cglobal put_vp8_pixels8_mmx, 5,5
.nextrow:
    ; fetch two source rows, then write them out
    movq      mm0, [r2]
    movq      mm1, [r2+r3]
    movq     [r0], mm0
    movq  [r0+r1], mm1

    ; advance both pointers by two rows (lea leaves the flags alone)
    lea        r2, [r2+r3*2]
    lea        r0, [r0+r1*2]
    sub       r4d, 2
    jg .nextrow
    REP_RET

826 | |

827 |
; Straight copy of a 16-byte-wide block using 8-byte MMX moves, two rows per
; iteration.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4 = row count.
cglobal put_vp8_pixels16_mmx, 5,5
.nextrow:
    ; fetch two source rows as four 8-byte halves, then write them out
    movq        mm0, [r2]
    movq        mm1, [r2+8]
    movq        mm2, [r2+r3]
    movq        mm3, [r2+r3+8]
    movq       [r0], mm0
    movq     [r0+8], mm1
    movq    [r0+r1], mm2
    movq  [r0+r1+8], mm3

    ; advance both pointers by two rows (lea leaves the flags alone)
    lea          r2, [r2+r3*2]
    lea          r0, [r0+r1*2]
    sub         r4d, 2
    jg .nextrow
    REP_RET

842 | |

843 |
; Straight copy of a 16-byte-wide block using SSE moves, two rows per
; iteration. Loads are unaligned (movups); stores use movaps, so dst and the
; dst stride must keep every row 16-byte aligned.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4 = row count.
cglobal put_vp8_pixels16_sse, 5,5,2
.nextrow:
    ; fetch two source rows, then write them out
    movups     xmm0, [r2]
    movups     xmm1, [r2+r3]
    movaps     [r0], xmm0
    movaps  [r0+r1], xmm1

    ; advance both pointers by two rows (lea leaves the flags alone)
    lea          r2, [r2+r3*2]
    lea          r0, [r0+r1*2]
    sub         r4d, 2
    jg .nextrow
    REP_RET

854 | |

855 |
;----------------------------------------------------------------------------- |

856 |
; IDCT functions: |

857 |
; |

858 |
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |

859 |
;----------------------------------------------------------------------------- |

860 | |

861 |
; DC-only inverse transform added to a 4x4 block of pixels (MMX).
; r0 = dst, r1 = block, r2 = stride (see prototype above).
; The DC term is split into a non-negative part (mm0) and a negated part (mm1),
; both clamped to bytes, so it can be applied with unsigned saturating byte
; add followed by subtract; one of the two is always zero.
cglobal vp8_idct_dc_add_mmx, 3, 3
    ; load data
    movd       mm0, [r1]                   ; word 0 is the DC coefficient

    ; calculate DC: dc = (block[0] + 4) >> 3
    paddw      mm0, [pw_4]
    pxor       mm1, mm1
    psraw      mm0, 3
    psubw      mm1, mm0                    ; mm1 = -dc
    packuswb   mm0, mm0                    ; clamp  dc to 0..255
    packuswb   mm1, mm1                    ; clamp -dc to 0..255
    punpcklbw  mm0, mm0                    ; replicate byte 0 ...
    punpcklbw  mm1, mm1
    punpcklwd  mm0, mm0                    ; ... across the low four bytes
    punpcklwd  mm1, mm1

    ; add DC
    lea         r1, [r0+r2*2]              ; r1 = address of row 2
    movd       mm2, [r0]
    movd       mm3, [r0+r2]
    movd       mm4, [r1]
    movd       mm5, [r1+r2]
    paddusb    mm2, mm0
    paddusb    mm3, mm0
    paddusb    mm4, mm0
    paddusb    mm5, mm0
    psubusb    mm2, mm1
    psubusb    mm3, mm1
    psubusb    mm4, mm1
    psubusb    mm5, mm1
    movd      [r0], mm2
    movd   [r0+r2], mm3
    movd      [r1], mm4
    movd   [r1+r2], mm5
    RET

896 | |

897 |
; DC-only inverse transform added to a 4x4 block of pixels (SSE4).
; r0 = dst, r1 = block, r2 = stride (see prototype above).
; All 16 pixels are widened to words in two registers, the broadcast DC is
; added with signed word arithmetic, and packuswb clamps back to bytes.
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
    ; load data
    movd       xmm0, [r1]                  ; word 0 is the DC coefficient
    lea          r1, [r0+r2*2]             ; r1 = address of row 2
    pxor       xmm1, xmm1
    movq       xmm2, [pw_4]

    ; calculate DC: dc = (block[0] + 4) >> 3, interleaved with the pixel loads
    paddw      xmm0, xmm2
    movd       xmm2, [r0]
    movd       xmm3, [r0+r2]
    movd       xmm4, [r1]
    movd       xmm5, [r1+r2]
    psraw      xmm0, 3
    pshuflw    xmm0, xmm0, 0               ; broadcast dc to the low four words
    punpcklqdq xmm0, xmm0                  ; ... and to all eight
    punpckldq  xmm2, xmm3                  ; rows 0 and 1 side by side
    punpckldq  xmm4, xmm5                  ; rows 2 and 3 side by side
    punpcklbw  xmm2, xmm1                  ; byte->word
    punpcklbw  xmm4, xmm1
    paddw      xmm2, xmm0
    paddw      xmm4, xmm0
    packuswb   xmm2, xmm4                  ; clamp; one row per dword

    ; store the four rows
    movd       [r0], xmm2
    pextrd  [r0+r2], xmm2, 1
    pextrd     [r1], xmm2, 2
    pextrd  [r1+r2], xmm2, 3
    RET

925 | |

926 |
;----------------------------------------------------------------------------- |

927 |
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) |

928 |
;----------------------------------------------------------------------------- |

929 | |

930 |
; SCATTER_WHT lane
; Stores word <lane> of m0..m3 to [r0], [r0+32], [r0+64] and [r0+96], i.e. at
; a spacing of 2*16 bytes (16 word-sized coefficients apart).
; Clobbers r1 and r2, which serve as scratch for the extracted words.
%macro SCATTER_WHT 1
    pextrw          r1d, m0, %1
    pextrw          r2d, m1, %1
    mov [r0+2*16*0], r1w
    mov [r0+2*16*1], r2w
    pextrw          r1d, m2, %1
    pextrw          r2d, m3, %1
    mov [r0+2*16*2], r1w
    mov [r0+2*16*3], r2w
%endmacro

940 | |

941 |
; HADAMARD4_1D a, b, c, d
; One 4-point Hadamard pass over registers m<a>..m<d>, built from two
; SUMSUB_BADC steps (macro defined outside this file - presumably paired
; sum/difference butterflies; confirm against x86util.asm). The trailing SWAP
; renames registers so the results are back in a..d order without data moves.
%macro HADAMARD4_1D 4
    SUMSUB_BADC m%2, m%1, m%4, m%3
    SUMSUB_BADC m%4, m%2, m%3, m%1
    SWAP        %1, %4, %3
%endmacro

946 | |

947 |
; Inverse Walsh-Hadamard transform of the 16 luma DC coefficients.
; r0 = block, r1 = dc (see prototype above); r2 is requested only as scratch
; for SCATTER_WHT.
; The 4x4 dc matrix is transformed in one direction, transposed, and
; transformed again; the results are then >>3 (with +3 added to m0 before the
; second pass as the rounding term) and scattered one word per 32 bytes into
; block.
INIT_MMX
cglobal vp8_luma_dc_wht_mmxext, 2,3
    ; load the four rows of dc[]
    movq          m0, [r1]
    movq          m1, [r1+8]
    movq          m2, [r1+16]
    movq          m3, [r1+24]

    ; first pass, transpose, rounding bias, second pass
    HADAMARD4_1D  0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw         m0, [pw_3]
    HADAMARD4_1D  0, 1, 2, 3

    ; scale
    psraw         m0, 3
    psraw         m1, 3
    psraw         m2, 3
    psraw         m3, 3

    ; scatter: each SCATTER_WHT writes 4 words, then r0 skips 4*32 bytes
    SCATTER_WHT   0
    add           r0, 2*16*4
    SCATTER_WHT   1
    add           r0, 2*16*4
    SCATTER_WHT   2
    add           r0, 2*16*4
    SCATTER_WHT   3
    RET