## ffmpeg / libavcodec / x86 / vp8dsp.asm @ 565344e7

History | View | Annotate | Download (27.8 KB)

1 | 0178d14f | Jason Garrett-Glaser | ;****************************************************************************** |
---|---|---|---|

2 | ;* VP8 MMX/MMXEXT/SSE/SSE2/SSSE3/SSE4 optimizations |
||

3 | ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> |
||

4 | ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> |
||

5 | ;* |
||

6 | ;* This file is part of FFmpeg. |
||

7 | ;* |
||

8 | ;* FFmpeg is free software; you can redistribute it and/or |
||

9 | ;* modify it under the terms of the GNU Lesser General Public |
||

10 | ;* License as published by the Free Software Foundation; either |
||

11 | ;* version 2.1 of the License, or (at your option) any later version. |
||

12 | ;* |
||

13 | ;* FFmpeg is distributed in the hope that it will be useful, |
||

14 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||

15 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||

16 | ;* Lesser General Public License for more details. |
||

17 | ;* |
||

18 | ;* You should have received a copy of the GNU Lesser General Public |
||

19 | ;* License along with FFmpeg; if not, write to the Free Software |
||

20 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||

21 | ;****************************************************************************** |
||

22 | |||

23 | %include "x86inc.asm" |
||

24 | 004cda8e | Jason Garrett-Glaser | %include "x86util.asm" |

25 | 0178d14f | Jason Garrett-Glaser | |

26 | SECTION_RODATA |
||

27 | |||

28 | fourtap_filter_hw_m: times 4 dw -6, 123 |
||

29 | times 4 dw 12, -1 |
||

30 | times 4 dw -9, 93 |
||

31 | times 4 dw 50, -6 |
||

32 | times 4 dw -6, 50 |
||

33 | times 4 dw 93, -9 |
||

34 | times 4 dw -1, 12 |
||

35 | times 4 dw 123, -6 |
||

36 | |||

37 | sixtap_filter_hw_m: times 4 dw 2, -11 |
||

38 | times 4 dw 108, 36 |
||

39 | times 4 dw -8, 1 |
||

40 | times 4 dw 3, -16 |
||

41 | times 4 dw 77, 77 |
||

42 | times 4 dw -16, 3 |
||

43 | times 4 dw 1, -8 |
||

44 | times 4 dw 36, 108 |
||

45 | times 4 dw -11, 2 |
||

46 | |||

47 | fourtap_filter_hb_m: times 8 db -6, -1 |
||

48 | times 8 db 123, 12 |
||

49 | times 8 db -9, -6 |
||

50 | times 8 db 93, 50 |
||

51 | times 8 db -6, -9 |
||

52 | times 8 db 50, 93 |
||

53 | times 8 db -1, -6 |
||

54 | times 8 db 12, 123 |
||

55 | |||

56 | sixtap_filter_hb_m: times 8 db 2, 1 |
||

57 | times 8 db -11, 108 |
||

58 | times 8 db 36, -8 |
||

59 | times 8 db 3, 3 |
||

60 | times 8 db -16, 77 |
||

61 | times 8 db 77, -16 |
||

62 | times 8 db 1, 2 |
||

63 | times 8 db -8, 36 |
||

64 | times 8 db 108, -11 |
||

65 | |||

66 | fourtap_filter_v_m: times 8 dw -6 |
||

67 | times 8 dw 123 |
||

68 | times 8 dw 12 |
||

69 | times 8 dw -1 |
||

70 | times 8 dw -9 |
||

71 | times 8 dw 93 |
||

72 | times 8 dw 50 |
||

73 | times 8 dw -6 |
||

74 | times 8 dw -6 |
||

75 | times 8 dw 50 |
||

76 | times 8 dw 93 |
||

77 | times 8 dw -9 |
||

78 | times 8 dw -1 |
||

79 | times 8 dw 12 |
||

80 | times 8 dw 123 |
||

81 | times 8 dw -6 |
||

82 | |||

83 | sixtap_filter_v_m: times 8 dw 2 |
||

84 | times 8 dw -11 |
||

85 | times 8 dw 108 |
||

86 | times 8 dw 36 |
||

87 | times 8 dw -8 |
||

88 | times 8 dw 1 |
||

89 | times 8 dw 3 |
||

90 | times 8 dw -16 |
||

91 | times 8 dw 77 |
||

92 | times 8 dw 77 |
||

93 | times 8 dw -16 |
||

94 | times 8 dw 3 |
||

95 | times 8 dw 1 |
||

96 | times 8 dw -8 |
||

97 | times 8 dw 36 |
||

98 | times 8 dw 108 |
||

99 | times 8 dw -11 |
||

100 | times 8 dw 2 |
||

101 | |||

102 | a173aa89 | Jason Garrett-Glaser | bilinear_filter_vw_m: times 8 dw 1 |

103 | times 8 dw 2 |
||

104 | times 8 dw 3 |
||

105 | times 8 dw 4 |
||

106 | times 8 dw 5 |
||

107 | times 8 dw 6 |
||

108 | times 8 dw 7 |
||

109 | |||

110 | bilinear_filter_vb_m: times 8 db 7, 1 |
||

111 | times 8 db 6, 2 |
||

112 | times 8 db 5, 3 |
||

113 | times 8 db 4, 4 |
||

114 | times 8 db 3, 5 |
||

115 | times 8 db 2, 6 |
||

116 | times 8 db 1, 7 |
||

117 | |||

118 | 0178d14f | Jason Garrett-Glaser | %ifdef PIC |

119 | a173aa89 | Jason Garrett-Glaser | %define fourtap_filter_hw r11 |

120 | %define sixtap_filter_hw r11 |
||

121 | %define fourtap_filter_hb r11 |
||

122 | %define sixtap_filter_hb r11 |
||

123 | %define fourtap_filter_v r11 |
||

124 | %define sixtap_filter_v r11 |
||

125 | %define bilinear_filter_vw r11 |
||

126 | %define bilinear_filter_vb r11 |
||

127 | 0178d14f | Jason Garrett-Glaser | %else |

128 | %define fourtap_filter_hw fourtap_filter_hw_m |
||

129 | %define sixtap_filter_hw sixtap_filter_hw_m |
||

130 | %define fourtap_filter_hb fourtap_filter_hb_m |
||

131 | %define sixtap_filter_hb sixtap_filter_hb_m |
||

132 | %define fourtap_filter_v fourtap_filter_v_m |
||

133 | %define sixtap_filter_v sixtap_filter_v_m |
||

134 | a173aa89 | Jason Garrett-Glaser | %define bilinear_filter_vw bilinear_filter_vw_m |

135 | %define bilinear_filter_vb bilinear_filter_vb_m |
||

136 | 0178d14f | Jason Garrett-Glaser | %endif |

137 | |||

138 | a173aa89 | Jason Garrett-Glaser | filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |

139 | filter_h4_shuf: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10 |
||

140 | 0178d14f | Jason Garrett-Glaser | |

141 | a173aa89 | Jason Garrett-Glaser | filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 |

142 | filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 |
||

143 | filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 |
||

144 | 0178d14f | Jason Garrett-Glaser | |

145 | 2dd2f716 | Ronald S. Bultje | pw_20091: times 4 dw 20091 |

146 | pw_17734: times 4 dw 17734 |
||

147 | |||

148 | 004cda8e | Jason Garrett-Glaser | cextern pw_3 |

149 | 0178d14f | Jason Garrett-Glaser | cextern pw_4 |

150 | cextern pw_64 |
||

151 | |||

152 | SECTION .text |
||

153 | |||

154 | ;----------------------------------------------------------------------------- |
||

155 | ; subpel MC functions: |
||

156 | ; |
||

157 | ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride, |
||

158 | ; uint8_t *src, int srcstride, |
||

159 | ; int height, int mx, int my); |
||

160 | ;----------------------------------------------------------------------------- |
||

161 | |||

162 | ; 4x4 block, H-only 4-tap filter |
||

163 | cglobal put_vp8_epel4_h4_mmxext, 6, 6 |
||

164 | shl r5d, 4 |
||

165 | %ifdef PIC |
||

166 | lea r11, [fourtap_filter_hw_m] |
||

167 | %endif |
||

168 | movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words |
||

169 | movq mm5, [fourtap_filter_hw+r5] |
||

170 | movq mm7, [pw_64] |
||

171 | pxor mm6, mm6 |
||

172 | |||

173 | .nextrow |
||

174 | movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels |
||

175 | |||

176 | ; first set of 2 pixels |
||

177 | movq mm2, mm1 ; byte ABCD.. |
||

178 | punpcklbw mm1, mm6 ; byte->word ABCD |
||

179 | pshufw mm0, mm2, 9 ; byte CDEF.. |
||

180 | punpcklbw mm0, mm6 ; byte->word CDEF |
||

181 | pshufw mm3, mm1, 0x94 ; word ABBC |
||

182 | pshufw mm1, mm0, 0x94 ; word CDDE |
||

183 | pmaddwd mm3, mm4 ; multiply 2px with F0/F1 |
||

184 | movq mm0, mm1 ; backup for second set of pixels |
||

185 | pmaddwd mm1, mm5 ; multiply 2px with F2/F3 |
||

186 | paddd mm3, mm1 ; finish 1st 2px |
||

187 | |||

188 | ; second set of 2 pixels, use backup of above |
||

189 | punpckhbw mm2, mm6 ; byte->word EFGH |
||

190 | pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1 |
||

191 | pshufw mm1, mm2, 0x94 ; word EFFG |
||

192 | pmaddwd mm1, mm5 ; multiply 2px with F2/F3 |
||

193 | paddd mm0, mm1 ; finish 2nd 2px |
||

194 | |||

195 | ; merge two sets of 2 pixels into one set of 4, round/clip/store |
||

196 | packssdw mm3, mm0 ; merge dword->word (4px) |
||

197 | paddsw mm3, mm7 ; rounding |
||

198 | psraw mm3, 7 |
||

199 | packuswb mm3, mm6 ; clip and word->bytes |
||

200 | movd [r0], mm3 ; store |
||

201 | |||

202 | ; go to next line |
||

203 | add r0, r1 |
||

204 | add r2, r3 |
||

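205 | dec r4 ; next row -- NOTE(review): height is declared int; this 64-bit dec relies on clean upper bits, put_vp8_pixels* uses r4d -- confirm |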
||

206 | jg .nextrow |
||

207 | REP_RET |
||

208 | |||

209 | ; 4x4 block, H-only 6-tap filter |
||

210 | cglobal put_vp8_epel4_h6_mmxext, 6, 6 |
||

211 | lea r5d, [r5*3] |
||

212 | %ifdef PIC |
||

213 | lea r11, [sixtap_filter_hw_m] |
||

214 | %endif |
||

215 | movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words |
||

216 | movq mm5, [sixtap_filter_hw+r5*8-32] |
||

217 | movq mm6, [sixtap_filter_hw+r5*8-16] |
||

218 | movq mm7, [pw_64] |
||

219 | pxor mm3, mm3 |
||

220 | |||

221 | .nextrow |
||

222 | movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels |
||

223 | |||

224 | ; first set of 2 pixels |
||

225 | movq mm2, mm1 ; byte ABCD.. |
||

226 | punpcklbw mm1, mm3 ; byte->word ABCD |
||

227 | pshufw mm0, mm2, 0x9 ; byte CDEF.. |
||

228 | punpckhbw mm2, mm3 ; byte->word EFGH |
||

229 | punpcklbw mm0, mm3 ; byte->word CDEF |
||

230 | pshufw mm1, mm1, 0x94 ; word ABBC |
||

231 | pshufw mm2, mm2, 0x94 ; word EFFG |
||

232 | pmaddwd mm1, mm4 ; multiply 2px with F0/F1 |
||

233 | pshufw mm3, mm0, 0x94 ; word CDDE |
||

234 | movq mm0, mm3 ; backup for second set of pixels |
||

235 | pmaddwd mm3, mm5 ; multiply 2px with F2/F3 |
||

236 | paddd mm1, mm3 ; add to 1st 2px cache |
||

237 | movq mm3, mm2 ; backup for second set of pixels |
||

238 | pmaddwd mm2, mm6 ; multiply 2px with F4/F5 |
||

239 | paddd mm1, mm2 ; finish 1st 2px |
||

240 | |||

241 | ; second set of 2 pixels, use backup of above |
||

242 | movd mm2, [r2+3] ; byte FGHI (prevent overreads) |
||

243 | pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1 |
||

244 | pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3 |
||

245 | paddd mm0, mm3 ; add to 2nd 2px cache |
||

246 | pxor mm3, mm3 |
||

247 | punpcklbw mm2, mm3 ; byte->word FGHI |
||

248 | pshufw mm2, mm2, 0xE9 ; word GHHI |
||

249 | pmaddwd mm2, mm6 ; multiply 2px with F4/F5 |
||

250 | paddd mm0, mm2 ; finish 2nd 2px |
||

251 | |||

252 | ; merge two sets of 2 pixels into one set of 4, round/clip/store |
||

253 | packssdw mm1, mm0 ; merge dword->word (4px) |
||

254 | paddsw mm1, mm7 ; rounding |
||

255 | psraw mm1, 7 |
||

256 | packuswb mm1, mm3 ; clip and word->bytes |
||

257 | movd [r0], mm1 ; store |
||

258 | |||

259 | ; go to next line |
||

260 | add r0, r1 |
||

261 | add r2, r3 |
||

262 | dec r4 ; next row |
||

263 | jg .nextrow |
||

264 | REP_RET |
||

265 | |||

266 | ; 8-pixel-wide block, H-only 4-tap filter |
||

267 | INIT_XMM |
||

268 | cglobal put_vp8_epel8_h4_sse2, 6, 6, 8 |
||

269 | shl r5d, 4 |
||

270 | %ifdef PIC |
||

271 | lea r11, [fourtap_filter_hw_m] |
||

272 | %endif |
||

273 | mova m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words |
||

274 | mova m6, [fourtap_filter_hw+r5] |
||

275 | pxor m7, m7 |
||

276 | |||

277 | .nextrow |
||

278 | movh m0, [r2-1] |
||

279 | punpcklbw m0, m7 ; ABCDEFGH |
||

280 | mova m1, m0 |
||

281 | mova m2, m0 |
||

282 | mova m3, m0 |
||

283 | psrldq m1, 2 ; BCDEFGH |
||

284 | psrldq m2, 4 ; CDEFGH |
||

285 | psrldq m3, 6 ; DEFGH |
||

286 | punpcklwd m0, m1 ; ABBCCDDE |
||

287 | punpcklwd m2, m3 ; CDDEEFFG |
||

288 | pmaddwd m0, m5 |
||

289 | pmaddwd m2, m6 |
||

290 | paddd m0, m2 |
||

291 | |||

292 | movh m1, [r2+3] |
||

293 | punpcklbw m1, m7 ; EFGHIJKL |
||

294 | mova m2, m1 |
||

295 | mova m3, m1 |
||

296 | mova m4, m1 |
||

297 | psrldq m2, 2 ; FGHIJKL |
||

298 | psrldq m3, 4 ; GHIJKL |
||

299 | psrldq m4, 6 ; HIJKL |
||

300 | punpcklwd m1, m2 ; EFFGGHHI |
||

301 | punpcklwd m3, m4 ; GHHIIJJK |
||

302 | pmaddwd m1, m5 |
||

303 | pmaddwd m3, m6 |
||

304 | paddd m1, m3 |
||

305 | |||

306 | packssdw m0, m1 |
||

307 | paddsw m0, [pw_64] |
||

308 | psraw m0, 7 |
||

309 | packuswb m0, m7 |
||

310 | movh [r0], m0 ; store |
||

311 | |||

312 | ; go to next line |
||

313 | add r0, r1 |
||

314 | add r2, r3 |
||

315 | dec r4 ; next row |
||

316 | jg .nextrow |
||

317 | REP_RET |
||

318 | |||

319 | cglobal put_vp8_epel8_h6_sse2, 6, 6, 8 |
||

320 | lea r5d, [r5*3] |
||

321 | %ifdef PIC |
||

322 | lea r11, [sixtap_filter_hw_m] |
||

323 | %endif |
||

324 | lea r5, [sixtap_filter_hw+r5*8] |
||

325 | pxor m7, m7 |
||

326 | |||

327 | .nextrow |
||

328 | movu m0, [r2-2] |
||

329 | mova m6, m0 |
||

330 | mova m4, m0 |
||

331 | punpcklbw m0, m7 ; ABCDEFGHI |
||

332 | mova m1, m0 |
||

333 | mova m2, m0 |
||

334 | mova m3, m0 |
||

335 | psrldq m1, 2 ; BCDEFGH |
||

336 | psrldq m2, 4 ; CDEFGH |
||

337 | psrldq m3, 6 ; DEFGH |
||

338 | psrldq m4, 4 |
||

339 | punpcklbw m4, m7 ; EFGH |
||

340 | mova m5, m4 |
||

341 | psrldq m5, 2 ; FGH |
||

342 | punpcklwd m0, m1 ; ABBCCDDE |
||

343 | punpcklwd m2, m3 ; CDDEEFFG |
||

344 | punpcklwd m4, m5 ; EFFGGHHI |
||

345 | pmaddwd m0, [r5-48] |
||

346 | pmaddwd m2, [r5-32] |
||

347 | pmaddwd m4, [r5-16] |
||

348 | paddd m0, m2 |
||

349 | paddd m0, m4 |
||

350 | |||

351 | psrldq m6, 4 |
||

352 | mova m4, m6 |
||

353 | punpcklbw m6, m7 ; EFGHIJKL |
||

354 | mova m1, m6 |
||

355 | mova m2, m6 |
||

356 | mova m3, m6 |
||

357 | psrldq m1, 2 ; FGHIJKL |
||

358 | psrldq m2, 4 ; GHIJKL |
||

359 | psrldq m3, 6 ; HIJKL |
||

360 | psrldq m4, 4 |
||

361 | punpcklbw m4, m7 ; IJKL |
||

362 | mova m5, m4 |
||

363 | psrldq m5, 2 ; JKL |
||

364 | punpcklwd m6, m1 ; EFFGGHHI |
||

365 | punpcklwd m2, m3 ; GHHIIJJK |
||

366 | punpcklwd m4, m5 ; IJJKKLLM |
||

367 | pmaddwd m6, [r5-48] |
||

368 | pmaddwd m2, [r5-32] |
||

369 | pmaddwd m4, [r5-16] |
||

370 | paddd m6, m2 |
||

371 | paddd m6, m4 |
||

372 | |||

373 | packssdw m0, m6 |
||

374 | paddsw m0, [pw_64] |
||

375 | psraw m0, 7 |
||

376 | packuswb m0, m7 |
||

377 | movh [r0], m0 ; store |
||

378 | |||

379 | ; go to next line |
||

380 | add r0, r1 |
||

381 | add r2, r3 |
||

382 | dec r4 ; next row |
||

383 | jg .nextrow |
||

384 | REP_RET |
||

385 | |||

386 | cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7 |
||

387 | shl r5d, 4 |
||

388 | mova m2, [pw_64] |
||

389 | a173aa89 | Jason Garrett-Glaser | mova m3, [filter_h4_shuf] |

390 | mova m4, [filter_h6_shuf2] |
||

391 | 0178d14f | Jason Garrett-Glaser | %ifdef PIC |

392 | lea r11, [fourtap_filter_hb_m] |
||

393 | %endif |
||

394 | mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes |
||

395 | mova m6, [fourtap_filter_hb+r5] |
||

396 | |||

397 | .nextrow |
||

398 | movu m0, [r2-1] |
||

399 | mova m1, m0 |
||

400 | pshufb m0, m3 |
||

401 | pshufb m1, m4 |
||

402 | pmaddubsw m0, m5 |
||

403 | pmaddubsw m1, m6 |
||

404 | paddsw m0, m2 |
||

405 | paddsw m0, m1 |
||

406 | psraw m0, 7 |
||

407 | packuswb m0, m0 |
||

408 | movh [r0], m0 ; store |
||

409 | |||

410 | ; go to next line |
||

411 | add r0, r1 |
||

412 | add r2, r3 |
||

413 | dec r4 ; next row |
||

414 | jg .nextrow |
||

415 | REP_RET |
||

416 | |||

417 | cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8 |
||

418 | lea r5d, [r5*3] |
||

419 | a173aa89 | Jason Garrett-Glaser | mova m3, [filter_h6_shuf1] |

420 | mova m4, [filter_h6_shuf2] |
||

421 | 0178d14f | Jason Garrett-Glaser | %ifdef PIC |

422 | lea r11, [sixtap_filter_hb_m] |
||

423 | %endif |
||

424 | mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes |
||

425 | mova m6, [sixtap_filter_hb+r5*8-32] |
||

426 | mova m7, [sixtap_filter_hb+r5*8-16] |
||

427 | |||

428 | .nextrow |
||

429 | movu m0, [r2-2] |
||

430 | mova m1, m0 |
||

431 | mova m2, m0 |
||

432 | pshufb m0, m3 |
||

433 | pshufb m1, m4 |
||

434 | a173aa89 | Jason Garrett-Glaser | pshufb m2, [filter_h6_shuf3] |

435 | 0178d14f | Jason Garrett-Glaser | pmaddubsw m0, m5 |

436 | pmaddubsw m1, m6 |
||

437 | pmaddubsw m2, m7 |
||

438 | paddsw m0, m1 |
||

439 | paddsw m0, m2 |
||

440 | paddsw m0, [pw_64] |
||

441 | psraw m0, 7 |
||

442 | packuswb m0, m0 |
||

443 | movh [r0], m0 ; store |
||

444 | |||

445 | ; go to next line |
||

446 | add r0, r1 |
||

447 | add r2, r3 |
||

448 | dec r4 ; next row |
||

449 | jg .nextrow |
||

450 | REP_RET |
||

451 | |||

452 | %macro FILTER_V 3 |
||

453 | ; block of macro-selected width (4 or 8 pixels), V-only 4-tap filter |
||

454 | cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 |
||

455 | shl r6d, 5 |
||

456 | %ifdef PIC |
||

457 | lea r11, [fourtap_filter_v_m] |
||

458 | %endif |
||

459 | lea r6, [fourtap_filter_v+r6-32] |
||

460 | mova m6, [pw_64] |
||

461 | pxor m7, m7 |
||

462 | mova m5, [r6+48] |
||

463 | |||

464 | ; read 3 lines |
||

465 | sub r2, r3 |
||

466 | movh m0, [r2] |
||

467 | movh m1, [r2+ r3] |
||

468 | movh m2, [r2+2*r3] |
||

469 | add r2, r3 |
||

470 | punpcklbw m0, m7 |
||

471 | punpcklbw m1, m7 |
||

472 | punpcklbw m2, m7 |
||

473 | |||

474 | .nextrow |
||

475 | ; first calculate negative taps (to prevent losing positive overflows) |
||

476 | movh m4, [r2+2*r3] ; read new row |
||

477 | punpcklbw m4, m7 |
||

478 | mova m3, m4 |
||

479 | pmullw m0, [r6+0] |
||

480 | pmullw m4, m5 |
||

481 | paddsw m4, m0 |
||

482 | |||

483 | ; then calculate positive taps |
||

484 | mova m0, m1 |
||

485 | pmullw m1, [r6+16] |
||

486 | paddsw m4, m1 |
||

487 | mova m1, m2 |
||

488 | pmullw m2, [r6+32] |
||

489 | paddsw m4, m2 |
||

490 | mova m2, m3 |
||

491 | |||

492 | ; round/clip/store |
||

493 | paddsw m4, m6 |
||

494 | psraw m4, 7 |
||

495 | packuswb m4, m7 |
||

496 | movh [r0], m4 |
||

497 | |||

498 | ; go to next line |
||

499 | add r0, r1 |
||

500 | add r2, r3 |
||

501 | dec r4 ; next row |
||

502 | jg .nextrow |
||

503 | REP_RET |
||

504 | |||

505 | |||

506 | ; block of macro-selected width (4 or 8 pixels), V-only 6-tap filter |
||

507 | cglobal put_vp8_epel%2_v6_%1, 7, 7, %3 |
||

508 | shl r6d, 4 |
||

509 | lea r6, [r6*3] |
||

510 | %ifdef PIC |
||

511 | lea r11, [sixtap_filter_v_m] |
||

512 | %endif |
||

513 | lea r6, [sixtap_filter_v+r6-96] |
||

514 | pxor m7, m7 |
||

515 | |||

516 | ; read 5 lines |
||

517 | sub r2, r3 |
||

518 | sub r2, r3 |
||

519 | movh m0, [r2] |
||

520 | movh m1, [r2+r3] |
||

521 | movh m2, [r2+r3*2] |
||

522 | lea r2, [r2+r3*2] |
||

523 | add r2, r3 |
||

524 | movh m3, [r2] |
||

525 | movh m4, [r2+r3] |
||

526 | punpcklbw m0, m7 |
||

527 | punpcklbw m1, m7 |
||

528 | punpcklbw m2, m7 |
||

529 | punpcklbw m3, m7 |
||

530 | punpcklbw m4, m7 |
||

531 | |||

532 | .nextrow |
||

533 | ; first calculate negative taps (to prevent losing positive overflows) |
||

534 | mova m5, m1 |
||

535 | pmullw m5, [r6+16] |
||

536 | mova m6, m4 |
||

537 | pmullw m6, [r6+64] |
||

538 | paddsw m6, m5 |
||

539 | |||

540 | ; then calculate positive taps |
||

541 | movh m5, [r2+2*r3] ; read new row |
||

542 | punpcklbw m5, m7 |
||

543 | pmullw m0, [r6+0] |
||

544 | paddsw m6, m0 |
||

545 | mova m0, m1 |
||

546 | mova m1, m2 |
||

547 | pmullw m2, [r6+32] |
||

548 | paddsw m6, m2 |
||

549 | mova m2, m3 |
||

550 | pmullw m3, [r6+48] |
||

551 | paddsw m6, m3 |
||

552 | mova m3, m4 |
||

553 | mova m4, m5 |
||

554 | pmullw m5, [r6+80] |
||

555 | paddsw m6, m5 |
||

556 | |||

557 | ; round/clip/store |
||

558 | paddsw m6, [pw_64] |
||

559 | psraw m6, 7 |
||

560 | packuswb m6, m7 |
||

561 | movh [r0], m6 |
||

562 | |||

563 | ; go to next line |
||

564 | add r0, r1 |
||

565 | add r2, r3 |
||

566 | dec r4 ; next row |
||

567 | jg .nextrow |
||

568 | REP_RET |
||

569 | %endmacro |
||

570 | |||

571 | INIT_MMX |
||

572 | FILTER_V mmxext, 4, 0 |
||

573 | INIT_XMM |
||

574 | FILTER_V sse2, 8, 8 |
||

575 | |||

576 | cglobal put_vp8_epel8_v4_ssse3, 7, 7, 8 |
||

577 | shl r6d, 4 |
||

578 | %ifdef PIC |
||

579 | lea r11, [fourtap_filter_hb_m] |
||

580 | %endif |
||

581 | mova m5, [fourtap_filter_hb+r6-16] |
||

582 | mova m6, [fourtap_filter_hb+r6] |
||

583 | mova m7, [pw_64] |
||

584 | |||

585 | ; read 3 lines |
||

586 | sub r2, r3 |
||

587 | movh m0, [r2] |
||

588 | movh m1, [r2+ r3] |
||

589 | movh m2, [r2+2*r3] |
||

590 | add r2, r3 |
||

591 | |||

592 | .nextrow |
||

593 | movh m3, [r2+2*r3] ; read new row |
||

594 | mova m4, m0 |
||

595 | mova m0, m1 |
||

596 | punpcklbw m4, m3 |
||

597 | punpcklbw m1, m2 |
||

598 | pmaddubsw m4, m5 |
||

599 | pmaddubsw m1, m6 |
||

600 | paddsw m4, m1 |
||

601 | mova m1, m2 |
||

602 | paddsw m4, m7 |
||

603 | mova m2, m3 |
||

604 | psraw m4, 7 |
||

605 | packuswb m4, m4 |
||

606 | movh [r0], m4 |
||

607 | |||

608 | ; go to next line |
||

609 | add r0, r1 |
||

610 | add r2, r3 |
||

611 | dec r4 ; next row |
||

612 | jg .nextrow |
||

613 | REP_RET |
||

614 | |||

615 | cglobal put_vp8_epel8_v6_ssse3, 7, 7, 8 |
||

616 | lea r6d, [r6*3] |
||

617 | %ifdef PIC |
||

618 | lea r11, [sixtap_filter_hb_m] |
||

619 | %endif |
||

620 | lea r6, [sixtap_filter_hb+r6*8] |
||

621 | |||

622 | ; read 5 lines |
||

623 | sub r2, r3 |
||

624 | sub r2, r3 |
||

625 | movh m0, [r2] |
||

626 | movh m1, [r2+r3] |
||

627 | movh m2, [r2+r3*2] |
||

628 | lea r2, [r2+r3*2] |
||

629 | add r2, r3 |
||

630 | movh m3, [r2] |
||

631 | movh m4, [r2+r3] |
||

632 | |||

633 | .nextrow |
||

634 | movh m5, [r2+2*r3] ; read new row |
||

635 | mova m6, m0 |
||

636 | punpcklbw m6, m5 |
||

637 | mova m0, m1 |
||

638 | punpcklbw m1, m2 |
||

639 | mova m7, m3 |
||

640 | punpcklbw m7, m4 |
||

641 | pmaddubsw m6, [r6-48] |
||

642 | pmaddubsw m1, [r6-32] |
||

643 | pmaddubsw m7, [r6-16] |
||

644 | paddsw m6, m1 |
||

645 | paddsw m6, m7 |
||

646 | mova m1, m2 |
||

647 | paddsw m6, [pw_64] |
||

648 | mova m2, m3 |
||

649 | psraw m6, 7 |
||

650 | mova m3, m4 |
||

651 | packuswb m6, m6 |
||

652 | mova m4, m5 |
||

653 | movh [r0], m6 |
||

654 | |||

655 | ; go to next line |
||

656 | add r0, r1 |
||

657 | add r2, r3 |
||

658 | dec r4 ; next row |
||

659 | jg .nextrow |
||

660 | REP_RET |
||

661 | |||

662 | a173aa89 | Jason Garrett-Glaser | %macro FILTER_BILINEAR 3 |

663 | cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 |
||

664 | mov r5d, 8*16 |
||

665 | shl r6d, 4 |
||

666 | sub r5d, r6d |
||

667 | %ifdef PIC |
||

668 | lea r11, [bilinear_filter_vw_m] |
||

669 | %endif |
||

670 | pxor m6, m6 |
||

671 | a912da76 | Jason Garrett-Glaser | mova m4, [bilinear_filter_vw+r5-16] |

672 | mova m5, [bilinear_filter_vw+r6-16] |
||

673 | a173aa89 | Jason Garrett-Glaser | .nextrow |

674 | movh m0, [r2+r3*0] |
||

675 | movh m1, [r2+r3*1] |
||

676 | movh m3, [r2+r3*2] |
||

677 | punpcklbw m0, m6 |
||

678 | punpcklbw m1, m6 |
||

679 | punpcklbw m3, m6 |
||

680 | mova m2, m1 |
||

681 | pmullw m0, m4 |
||

682 | pmullw m1, m5 |
||

683 | pmullw m2, m4 |
||

684 | pmullw m3, m5 |
||

685 | paddsw m0, m1 |
||

686 | paddsw m2, m3 |
||

687 | psraw m0, 2 |
||

688 | psraw m2, 2 |
||

689 | pavgw m0, m6 |
||

690 | pavgw m2, m6 |
||

691 | %ifidn %1, mmxext |
||

692 | packuswb m0, m0 |
||

693 | packuswb m2, m2 |
||

694 | movh [r0+r1*0], m0 |
||

695 | movh [r0+r1*1], m2 |
||

696 | %else |
||

697 | packuswb m0, m2 |
||

698 | movh [r0+r1*0], m0 |
||

699 | movhps [r0+r1*1], m0 |
||

700 | %endif |
||

701 | |||

702 | lea r0, [r0+r1*2] |
||

703 | lea r2, [r2+r3*2] |
||

704 | sub r4, 2 |
||

705 | jg .nextrow |
||

706 | REP_RET |
||

707 | |||

708 | cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 |
||

709 | mov r6d, 8*16 |
||

710 | shl r5d, 4 |
||

711 | sub r6d, r5d |
||

712 | %ifdef PIC |
||

713 | lea r11, [bilinear_filter_vw_m] |
||

714 | %endif |
||

715 | pxor m6, m6 |
||

716 | a912da76 | Jason Garrett-Glaser | mova m4, [bilinear_filter_vw+r6-16] |

717 | mova m5, [bilinear_filter_vw+r5-16] |
||

718 | a173aa89 | Jason Garrett-Glaser | .nextrow |

719 | movh m0, [r2+r3*0+0] |
||

720 | movh m1, [r2+r3*0+1] |
||

721 | movh m2, [r2+r3*1+0] |
||

722 | movh m3, [r2+r3*1+1] |
||

723 | punpcklbw m0, m6 |
||

724 | punpcklbw m1, m6 |
||

725 | punpcklbw m2, m6 |
||

726 | punpcklbw m3, m6 |
||

727 | pmullw m0, m4 |
||

728 | pmullw m1, m5 |
||

729 | pmullw m2, m4 |
||

730 | pmullw m3, m5 |
||

731 | paddsw m0, m1 |
||

732 | paddsw m2, m3 |
||

733 | psraw m0, 2 |
||

734 | psraw m2, 2 |
||

735 | pavgw m0, m6 |
||

736 | pavgw m2, m6 |
||

737 | %ifidn %1, mmxext |
||

738 | packuswb m0, m0 |
||

739 | packuswb m2, m2 |
||

740 | movh [r0+r1*0], m0 |
||

741 | movh [r0+r1*1], m2 |
||

742 | %else |
||

743 | packuswb m0, m2 |
||

744 | movh [r0+r1*0], m0 |
||

745 | movhps [r0+r1*1], m0 |
||

746 | %endif |
||

747 | |||

748 | lea r0, [r0+r1*2] |
||

749 | lea r2, [r2+r3*2] |
||

750 | sub r4, 2 |
||

751 | jg .nextrow |
||

752 | REP_RET |
||

753 | %endmacro |
||

754 | |||

755 | INIT_MMX |
||

756 | FILTER_BILINEAR mmxext, 4, 0 |
||

757 | INIT_XMM |
||

758 | FILTER_BILINEAR sse2, 8, 7 |
||

759 | |||

760 | cglobal put_vp8_bilinear8_v_ssse3, 7,7,5 |
||

761 | shl r6d, 4 |
||

762 | %ifdef PIC |
||

763 | lea r11, [bilinear_filter_vb_m] |
||

764 | %endif |
||

765 | pxor m4, m4 |
||

766 | a912da76 | Jason Garrett-Glaser | mova m3, [bilinear_filter_vb+r6-16] |

767 | a173aa89 | Jason Garrett-Glaser | .nextrow |

768 | movh m0, [r2+r3*0] |
||

769 | movh m1, [r2+r3*1] |
||

770 | movh m2, [r2+r3*2] |
||

771 | punpcklbw m0, m1 |
||

772 | punpcklbw m1, m2 |
||

773 | pmaddubsw m0, m3 |
||

774 | pmaddubsw m1, m3 |
||

775 | psraw m0, 2 |
||

776 | psraw m1, 2 |
||

777 | pavgw m0, m4 |
||

778 | pavgw m1, m4 |
||

779 | packuswb m0, m1 |
||

780 | movh [r0+r1*0], m0 |
||

781 | movhps [r0+r1*1], m0 |
||

782 | |||

783 | lea r0, [r0+r1*2] |
||

784 | lea r2, [r2+r3*2] |
||

785 | sub r4, 2 |
||

786 | jg .nextrow |
||

787 | REP_RET |
||

788 | |||

789 | cglobal put_vp8_bilinear8_h_ssse3, 7,7,5 |
||

790 | shl r5d, 4 |
||

791 | %ifdef PIC |
||

792 | lea r11, [bilinear_filter_vb_m] |
||

793 | %endif |
||

794 | pxor m4, m4 |
||

795 | mova m2, [filter_h2_shuf] |
||

796 | a912da76 | Jason Garrett-Glaser | mova m3, [bilinear_filter_vb+r5-16] |

797 | a173aa89 | Jason Garrett-Glaser | .nextrow |

798 | movu m0, [r2+r3*0] |
||

799 | movu m1, [r2+r3*1] |
||

800 | pshufb m0, m2 |
||

801 | pshufb m1, m2 |
||

802 | pmaddubsw m0, m3 |
||

803 | pmaddubsw m1, m3 |
||

804 | psraw m0, 2 |
||

805 | psraw m1, 2 |
||

806 | pavgw m0, m4 |
||

807 | pavgw m1, m4 |
||

808 | packuswb m0, m1 |
||

809 | movh [r0+r1*0], m0 |
||

810 | movhps [r0+r1*1], m0 |
||

811 | |||

812 | lea r0, [r0+r1*2] |
||

813 | lea r2, [r2+r3*2] |
||

814 | sub r4, 2 |
||

815 | jg .nextrow |
||

816 | REP_RET |
||

817 | |||

818 | 0fecad09 | Jason Garrett-Glaser | cglobal put_vp8_pixels8_mmx, 5,5 |

819 | .nextrow: |
||

820 | movq mm0, [r2+r3*0] |
||

821 | movq mm1, [r2+r3*1] |
||

822 | lea r2, [r2+r3*2] |
||

823 | movq [r0+r1*0], mm0 |
||

824 | movq [r0+r1*1], mm1 |
||

825 | lea r0, [r0+r1*2] |
||

826 | sub r4d, 2 |
||

827 | jg .nextrow |
||

828 | REP_RET |
||

829 | |||

830 | cglobal put_vp8_pixels16_mmx, 5,5 |
||

831 | .nextrow: |
||

832 | movq mm0, [r2+r3*0+0] |
||

833 | movq mm1, [r2+r3*0+8] |
||

834 | movq mm2, [r2+r3*1+0] |
||

835 | movq mm3, [r2+r3*1+8] |
||

836 | lea r2, [r2+r3*2] |
||

837 | movq [r0+r1*0+0], mm0 |
||

838 | movq [r0+r1*0+8], mm1 |
||

839 | movq [r0+r1*1+0], mm2 |
||

840 | movq [r0+r1*1+8], mm3 |
||

841 | lea r0, [r0+r1*2] |
||

842 | sub r4d, 2 |
||

843 | jg .nextrow |
||

844 | REP_RET |
||

845 | |||

846 | cglobal put_vp8_pixels16_sse, 5,5,2 |
||

847 | .nextrow: |
||

848 | movups xmm0, [r2+r3*0] |
||

849 | movups xmm1, [r2+r3*1] |
||

850 | lea r2, [r2+r3*2] |
||

851 | movaps [r0+r1*0], xmm0 |
||

852 | movaps [r0+r1*1], xmm1 |
||

853 | lea r0, [r0+r1*2] |
||

854 | sub r4d, 2 |
||

855 | jg .nextrow |
||

856 | REP_RET |
||

857 | |||

858 | 0178d14f | Jason Garrett-Glaser | ;----------------------------------------------------------------------------- |

859 | ; IDCT functions: |
||

860 | ; |
||

861 | ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |
||

862 | ;----------------------------------------------------------------------------- |
||

863 | |||

864 | cglobal vp8_idct_dc_add_mmx, 3, 3 |
||

865 | ; load data |
||

866 | movd mm0, [r1] |
||

867 | |||

868 | ; calculate DC |
||

869 | paddw mm0, [pw_4] |
||

870 | pxor mm1, mm1 |
||

871 | psraw mm0, 3 |
||

872 | psubw mm1, mm0 |
||

873 | packuswb mm0, mm0 |
||

874 | packuswb mm1, mm1 |
||

875 | punpcklbw mm0, mm0 |
||

876 | punpcklbw mm1, mm1 |
||

877 | punpcklwd mm0, mm0 |
||

878 | punpcklwd mm1, mm1 |
||

879 | |||

880 | ; add DC |
||

881 | lea r1, [r0+r2*2] |
||

882 | movd mm2, [r0] |
||

883 | movd mm3, [r0+r2] |
||

884 | movd mm4, [r1] |
||

885 | movd mm5, [r1+r2] |
||

886 | paddusb mm2, mm0 |
||

887 | paddusb mm3, mm0 |
||

888 | paddusb mm4, mm0 |
||

889 | paddusb mm5, mm0 |
||

890 | psubusb mm2, mm1 |
||

891 | psubusb mm3, mm1 |
||

892 | psubusb mm4, mm1 |
||

893 | psubusb mm5, mm1 |
||

894 | movd [r0], mm2 |
||

895 | movd [r0+r2], mm3 |
||

896 | movd [r1], mm4 |
||

897 | movd [r1+r2], mm5 |
||

898 | RET |
||

899 | |||

900 | cglobal vp8_idct_dc_add_sse4, 3, 3, 6 |
||

901 | ; load data |
||

902 | movd xmm0, [r1] |
||

903 | lea r1, [r0+r2*2] |
||

904 | pxor xmm1, xmm1 |
||

905 | movq xmm2, [pw_4] |
||

906 | |||

907 | ; calculate DC |
||

908 | paddw xmm0, xmm2 |
||

909 | movd xmm2, [r0] |
||

910 | movd xmm3, [r0+r2] |
||

911 | movd xmm4, [r1] |
||

912 | movd xmm5, [r1+r2] |
||

913 | psraw xmm0, 3 |
||

914 | pshuflw xmm0, xmm0, 0 |
||

915 | punpcklqdq xmm0, xmm0 |
||

916 | punpckldq xmm2, xmm3 |
||

917 | punpckldq xmm4, xmm5 |
||

918 | punpcklbw xmm2, xmm1 |
||

919 | punpcklbw xmm4, xmm1 |
||

920 | paddw xmm2, xmm0 |
||

921 | paddw xmm4, xmm0 |
||

922 | packuswb xmm2, xmm4 |
||

923 | movd [r0], xmm2 |
||

924 | pextrd [r0+r2], xmm2, 1 |
||

925 | pextrd [r1], xmm2, 2 |
||

926 | pextrd [r1+r2], xmm2, 3 |
||

927 | RET |
||

928 | 004cda8e | Jason Garrett-Glaser | |

929 | ;----------------------------------------------------------------------------- |
||

930 | 2dd2f716 | Ronald S. Bultje | ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |

931 | ;----------------------------------------------------------------------------- |
||

932 | |||

933 | ; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2) |
||

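934 | ; this macro assumes that m6/m7 have words for 20091/17734 loaded (17734 = 35468/2: inputs are doubled before pmulhw); the 3rd/4th macro arguments are scratch registers |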
||

935 | %macro VP8_MULTIPLY_SUMSUB 4 |
||

936 | mova %3, %1 |
||

937 | mova %4, %2 |
||

938 | pmulhw %3, m6 ;20091(1) |
||

939 | pmulhw %4, m6 ;20091(2) |
||

940 | paddw %3, %1 |
||

941 | paddw %4, %2 |
||

942 | psllw %1, 1 |
||

943 | psllw %2, 1 |
||

944 | pmulhw %1, m7 ;35468(1) |
||

945 | pmulhw %2, m7 ;35468(2) |
||

946 | psubw %1, %4 |
||

947 | paddw %2, %3 |
||

948 | %endmacro |
||

949 | |||

950 | ; calculate x0=%1+%3; x1=%1-%3 |
||

951 | ; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4) |
||

952 | ; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3) |
||

953 | ; %5/%6 are temporary registers |
||

954 | ; we assume m6/m7 have constant words 20091/17734 loaded in them |
||

955 | %macro VP8_IDCT_TRANSFORM4x4_1D 6 |
||

956 | SUMSUB_BA m%3, m%1, m%5 ;t0, t1 |
||

957 | VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3 |
||

958 | SUMSUB_BA m%4, m%3, m%5 ;tmp0, tmp3 |
||

959 | SUMSUB_BA m%2, m%1, m%5 ;tmp1, tmp2 |
||

960 | SWAP %4, %1 |
||

961 | SWAP %4, %3 |
||

962 | %endmacro |
||

963 | |||

964 | INIT_MMX |
||

965 | cglobal vp8_idct_add_mmx, 3, 3 |
||

966 | ; load block data |
||

967 | movq m0, [r1] |
||

968 | movq m1, [r1+8] |
||

969 | movq m2, [r1+16] |
||

970 | movq m3, [r1+24] |
||

971 | movq m6, [pw_20091] |
||

972 | movq m7, [pw_17734] |
||

973 | |||

974 | ; actual IDCT |
||

975 | VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 |
||

976 | TRANSPOSE4x4W 0, 1, 2, 3, 4 |
||

977 | paddw m0, [pw_4] |
||

978 | VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 |
||

979 | TRANSPOSE4x4W 0, 1, 2, 3, 4 |
||

980 | |||

981 | ; store |
||

982 | pxor m4, m4 |
||

983 | lea r1, [r0+2*r2] |
||

984 | STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2 |
||

985 | STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2 |
||

986 | |||

987 | RET |
||

988 | |||

989 | ;----------------------------------------------------------------------------- |
||

990 | 004cda8e | Jason Garrett-Glaser | ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) |

991 | ;----------------------------------------------------------------------------- |
||

992 | |||

993 | %macro SCATTER_WHT 1 |
||

994 | pextrw r1d, m0, %1 |
||

995 | pextrw r2d, m1, %1 |
||

996 | mov [r0+2*16*0], r1w |
||

997 | mov [r0+2*16*1], r2w |
||

998 | pextrw r1d, m2, %1 |
||

999 | pextrw r2d, m3, %1 |
||

1000 | mov [r0+2*16*2], r1w |
||

1001 | mov [r0+2*16*3], r2w |
||

1002 | %endmacro |
||

1003 | |||

1004 | %macro HADAMARD4_1D 4 |
||

1005 | SUMSUB_BADC m%2, m%1, m%4, m%3 |
||

1006 | SUMSUB_BADC m%4, m%2, m%3, m%1 |
||

1007 | SWAP %1, %4, %3 |
||

1008 | %endmacro |
||

1009 | |||

1010 | INIT_MMX |
||

1011 | cglobal vp8_luma_dc_wht_mmxext, 2,3 |
||

1012 | movq m0, [r1] |
||

1013 | movq m1, [r1+8] |
||

1014 | movq m2, [r1+16] |
||

1015 | movq m3, [r1+24] |
||

1016 | HADAMARD4_1D 0, 1, 2, 3 |
||

1017 | TRANSPOSE4x4W 0, 1, 2, 3, 4 |
||

1018 | paddw m0, [pw_3] |
||

1019 | HADAMARD4_1D 0, 1, 2, 3 |
||

1020 | psraw m0, 3 |
||

1021 | psraw m1, 3 |
||

1022 | psraw m2, 3 |
||

1023 | psraw m3, 3 |
||

1024 | SCATTER_WHT 0 |
||

1025 | add r0, 2*16*4 |
||

1026 | SCATTER_WHT 1 |
||

1027 | add r0, 2*16*4 |
||

1028 | SCATTER_WHT 2 |
||

1029 | add r0, 2*16*4 |
||

1030 | SCATTER_WHT 3 |
||

1031 | RET |