## ffmpeg / libavcodec / arm / vp8dsp_neon.S @ ef15d71c

History | View | Annotate | Download (66.4 KB)

1 |
/** |
---|---|

2 |
* VP8 NEON optimisations |

3 |
* |

4 |
* Copyright (c) 2010 Rob Clark <rob@ti.com> |

5 |
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com> |

6 |
* |

7 |
* This file is part of FFmpeg. |

8 |
* |

9 |
* FFmpeg is free software; you can redistribute it and/or |

10 |
* modify it under the terms of the GNU Lesser General Public |

11 |
* License as published by the Free Software Foundation; either |

12 |
* version 2.1 of the License, or (at your option) any later version. |

13 |
* |

14 |
* FFmpeg is distributed in the hope that it will be useful, |

15 |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |

16 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |

17 |
* Lesser General Public License for more details. |

18 |
* |

19 |
* You should have received a copy of the GNU Lesser General Public |

20 |
* License along with FFmpeg; if not, write to the Free Software |

21 |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |

22 |
*/ |

23 | |

24 |
#include "asm.S" |

25 | |

26 |
@ void ff_vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
@ Inverse 4x4 Walsh-Hadamard transform of the luma DC coefficients.
@ r0 = dst: DC slot of each of the 16 sub-blocks (stored 32 bytes apart)
@ r1 = src: 4x4 DC block, zeroed here as a side effect
function ff_vp8_luma_dc_wht_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]
        vmov.i16        q15, #0

        @ first (vertical) butterfly pass, interleaved with clearing src
        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vst1.16         {q15},    [r1,:128]!
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vst1.16         {q15},    [r1,:128]
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vmov.i16        q8,  #3                 @ rounding bias for final >>3

        vtrn.32         d0,  d2
        vtrn.32         d1,  d3
        vtrn.16         d0,  d1
        vtrn.16         d2,  d3

        vadd.i16        d0,  d0,  d16           @ += 3 (applied once, to row 0)

        @ second (horizontal) butterfly pass
        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vshr.s16        q0,  q0,  #3
        vshr.s16        q1,  q1,  #3

        @ scatter the 16 results, one per sub-block, 32 bytes apart
        mov             r3,  #32
        vst1.16         {d0[0]},  [r0,:16], r3
        vst1.16         {d1[0]},  [r0,:16], r3
        vst1.16         {d2[0]},  [r0,:16], r3
        vst1.16         {d3[0]},  [r0,:16], r3
        vst1.16         {d0[1]},  [r0,:16], r3
        vst1.16         {d1[1]},  [r0,:16], r3
        vst1.16         {d2[1]},  [r0,:16], r3
        vst1.16         {d3[1]},  [r0,:16], r3
        vst1.16         {d0[2]},  [r0,:16], r3
        vst1.16         {d1[2]},  [r0,:16], r3
        vst1.16         {d2[2]},  [r0,:16], r3
        vst1.16         {d3[2]},  [r0,:16], r3
        vst1.16         {d0[3]},  [r0,:16], r3
        vst1.16         {d1[3]},  [r0,:16], r3
        vst1.16         {d2[3]},  [r0,:16], r3
        vst1.16         {d3[3]},  [r0,:16], r3

        bx              lr
endfunc

78 | |

79 |
@ DC-only variant of the luma DC WHT: when only dc[0] is non-zero, all 16
@ outputs equal (dc[0] + 3) >> 3. Pure ARM code, no NEON needed.
@ r0 = dst (16 halfwords, 32 bytes apart), r1 = src (dc[0] is zeroed)
function ff_vp8_luma_dc_wht_dc_neon, export=1
        ldrsh           r2,  [r1]
        mov             r3,  #0
        add             r2,  r2,  #3
        strh            r3,  [r1]               @ clear the consumed coefficient
        asr             r2,  r2,  #3
.rept 16
        strh            r2,  [r0], #32
.endr
        bx              lr
endfunc

90 | |

91 |
@ void ff_vp8_idct_add(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
@ 4x4 inverse DCT approximation (VP8 fixed-point transform) plus add to dst.
@ Multiplier constants packed into d4: 20091 in d4[0], 35468/2 in d4[1]
@ (the halved value compensates for the implicit doubling in vqdmulh).
@ The coefficient block is cleared as a side effect.
function ff_vp8_idct_add_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]
        movw            r3,  #20091
        movt            r3,  #35468/2
        vdup.32         d4,  r3

        @ first pass (columns)
        vmull.s16       q12, d1,  d4[0]
        vmull.s16       q13, d3,  d4[0]
        vqdmulh.s16     d20, d1,  d4[1]
        vqdmulh.s16     d23, d3,  d4[1]
        vshrn.s32       d21, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.s16        d21, d21, d1
        vadd.s16        d22, d22, d3

        vadd.s16        d16, d0,  d2
        vsub.s16        d17, d0,  d2
        vadd.s16        d18, d21, d23
        vsub.s16        d19, d20, d22
        vadd.s16        q0,  q8,  q9
        vsub.s16        q1,  q8,  q9

        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        @ second pass (rows), interleaved with clearing the block
        vmov.i16        q15, #0
        vmull.s16       q12, d1,  d4[0]
        vst1.16         {q15},    [r1,:128]!
        vmull.s16       q13, d2,  d4[0]
        vst1.16         {q15},    [r1,:128]
        vqdmulh.s16     d21, d1,  d4[1]
        vqdmulh.s16     d23, d2,  d4[1]
        vshrn.s32       d20, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.i16        d20, d20, d1
        vadd.i16        d22, d22, d2

        vadd.i16        d16, d0,  d3
        vsub.i16        d17, d0,  d3
        vadd.i16        d18, d20, d23
        vld1.32         {d20[]},  [r0,:32], r2
        vsub.i16        d19, d21, d22
        vld1.32         {d22[]},  [r0,:32], r2
        vadd.s16        q0,  q8,  q9
        vld1.32         {d23[]},  [r0,:32], r2
        vsub.s16        q1,  q8,  q9
        vld1.32         {d21[]},  [r0,:32], r2
        vrshr.s16       q0,  q0,  #3
        vtrn.32         q10, q11
        vrshr.s16       q1,  q1,  #3

        sub             r0,  r0,  r2,  lsl #2

        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        @ add residual to the loaded destination rows and saturate to u8
        vaddw.u8        q0,  q0,  d20
        vaddw.u8        q1,  q1,  d21
        vqmovun.s16     d0,  q0
        vqmovun.s16     d1,  q1

        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2

        bx              lr
endfunc

163 | |

164 |
@ void ff_vp8_idct_dc_add(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
@ DC-only IDCT-and-add: dst[4x4] += (dc + 4) >> 3, saturated to u8.
@ The consumed DC coefficient is cleared.
function ff_vp8_idct_dc_add_neon, export=1
        mov             r3,  #0
        ldrsh           r12, [r1]
        strh            r3,  [r1]
        vdup.16         q1,  r12
        vrshr.s16       q1,  q1,  #3            @ (dc + 4) >> 3, rounded
        vld1.32         {d0[]},   [r0,:32], r2
        vld1.32         {d1[]},   [r0,:32], r2
        vld1.32         {d0[1]},  [r0,:32], r2
        vld1.32         {d1[1]},  [r0,:32], r2
        vaddw.u8        q2,  q1,  d0
        vaddw.u8        q3,  q1,  d1
        sub             r0,  r0,  r2,  lsl #2
        vqmovun.s16     d0,  q2
        vqmovun.s16     d1,  q3
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        bx              lr
endfunc

185 | |

186 |
@ DC-only IDCT-and-add for four 4x4 chroma blocks laid out 2x2 in an
@ 8x8 area. r0 = dst, r1 = coefficient blocks (32 bytes apart, DCs
@ cleared), r2 = stride.
function ff_vp8_idct_dc_add4uv_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32
        @ gather the four DC values, clearing each as it is read
        vld1.16         {d16[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        mov             r3,  r0                 @ r3 = store pointer, r0 keeps loading
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {d0},     [r0,:64], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {d1},     [r0,:64], r2
        vaddw.u8        q10, q8,  d0
        vld1.8          {d2},     [r0,:64], r2
        vaddw.u8        q0,  q8,  d1
        vld1.8          {d3},     [r0,:64], r2
        vaddw.u8        q11, q8,  d2
        vld1.8          {d4},     [r0,:64], r2
        vaddw.u8        q1,  q8,  d3
        vld1.8          {d5},     [r0,:64], r2
        vaddw.u8        q12, q9,  d4
        vld1.8          {d6},     [r0,:64], r2
        vaddw.u8        q2,  q9,  d5
        vld1.8          {d7},     [r0,:64], r2
        vaddw.u8        q13, q9,  d6
        vqmovun.s16     d20, q10
        vaddw.u8        q3,  q9,  d7
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vst1.8          {d20},    [r3,:64], r2
        vqmovun.s16     d23, q1
        vst1.8          {d21},    [r3,:64], r2
        vqmovun.s16     d24, q12
        vst1.8          {d22},    [r3,:64], r2
        vqmovun.s16     d25, q2
        vst1.8          {d23},    [r3,:64], r2
        vqmovun.s16     d26, q13
        vst1.8          {d24},    [r3,:64], r2
        vqmovun.s16     d27, q3
        vst1.8          {d25},    [r3,:64], r2
        vst1.8          {d26},    [r3,:64], r2
        vst1.8          {d27},    [r3,:64], r2

        bx              lr
endfunc

235 | |

236 |
@ DC-only IDCT-and-add for four horizontally adjacent 4x4 luma blocks
@ (one 16x4 strip). r0 = dst, r1 = coefficient blocks (32 bytes apart,
@ DCs cleared), r2 = stride.
function ff_vp8_idct_dc_add4y_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32
        @ gather the four DC values, clearing each as it is read
        vld1.16         {d16[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {q0},     [r0,:128], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {q1},     [r0,:128], r2
        vaddw.u8        q10, q8,  d0
        vld1.8          {q2},     [r0,:128], r2
        vaddw.u8        q0,  q9,  d1
        vld1.8          {q3},     [r0,:128], r2
        vaddw.u8        q11, q8,  d2
        vaddw.u8        q1,  q9,  d3
        vaddw.u8        q12, q8,  d4
        vaddw.u8        q2,  q9,  d5
        vaddw.u8        q13, q8,  d6
        vaddw.u8        q3,  q9,  d7
        sub             r0,  r0,  r2,  lsl #2
        vqmovun.s16     d20, q10
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vqmovun.s16     d23, q1
        vqmovun.s16     d24, q12
        vst1.8          {q10},    [r0,:128], r2
        vqmovun.s16     d25, q2
        vst1.8          {q11},    [r0,:128], r2
        vqmovun.s16     d26, q13
        vst1.8          {q12},    [r0,:128], r2
        vqmovun.s16     d27, q3
        vst1.8          {q13},    [r0,:128], r2

        bx              lr
endfunc

277 | |

278 |
@ Core VP8 loop-filter kernel, shared by all edge/inner/simple variants.
@ Filters 16 pixels across an edge held in registers.
@
@ Register layout:
@   P3..Q3     -> q0..q7
@   flim_E     -> q14
@   flim_I     -> q15
@   hev_thresh -> r12
@
.macro  vp8_loop_filter, inner=0, simple=0
  .if \simple
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vmov.i8         q13, #0x80
        vcle.u8         q8,  q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
  .else
        @ calculate hev and normal_limit:
        vabd.u8         q12, q2,  q3            @ abs(P1-P0)
        vabd.u8         q13, q5,  q4            @ abs(Q1-Q0)
        vabd.u8         q10, q0,  q1            @ abs(P3-P2)
        vabd.u8         q11, q1,  q2            @ abs(P2-P1)
        vcle.u8         q8,  q12, q15           @ abs(P1-P0) <= flim_I
        vcle.u8         q9,  q13, q15           @ abs(Q1-Q0) <= flim_I
        vcle.u8         q10, q10, q15           @ abs(P3-P2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(P2-P1) <= flim_I
        vand            q8,  q8,  q9
        vabd.u8         q9,  q7,  q6            @ abs(Q3-Q2)
        vand            q8,  q8,  q11
        vabd.u8         q11, q6,  q5            @ abs(Q2-Q1)
        vand            q8,  q8,  q10
        vcle.u8         q10, q9,  q15           @ abs(Q3-Q2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(Q2-Q1) <= flim_I
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vand            q8,  q8,  q10
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vand            q8,  q8,  q11
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vdup.8          q15, r12                @ hev_thresh
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vcgt.u8         q12, q12, q15           @ abs(P1-P0) > hev_thresh
        vcle.u8         q11, q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        vcgt.u8         q14, q13, q15           @ abs(Q1-Q0) > hev_thresh
        vand            q8,  q8,  q11
        vmov.i8         q13, #0x80
        vorr            q9,  q12, q14
  .endif

        @ at this point:
        @   q8: normal_limit
        @   q9: hev

        @ convert to signed value:
        veor            q3,  q3,  q13           @ PS0 = P0 ^ 0x80
        veor            q4,  q4,  q13           @ QS0 = Q0 ^ 0x80

        vmov.i16        q12, #3
        vsubl.s8        q10, d8,  d6            @ QS0 - PS0
        vsubl.s8        q11, d9,  d7            @   (widened to 16bit)
        veor            q2,  q2,  q13           @ PS1 = P1 ^ 0x80
        veor            q5,  q5,  q13           @ QS1 = Q1 ^ 0x80
        vmul.i16        q10, q10, q12           @ w = 3 * (QS0 - PS0)
        vmul.i16        q11, q11, q12

        vqsub.s8        q12, q2,  q5            @ clamp(PS1-QS1)
        vmov.i8         q14, #4
        vmov.i8         q15, #3
  .if \inner
        vand            q12, q12, q9            @ if(hev) w += clamp(PS1-QS1)
  .endif
        vaddw.s8        q10, q10, d24           @ w += clamp(PS1-QS1)
        vaddw.s8        q11, q11, d25
        vqmovn.s16      d20, q10                @ narrow result back into q10
        vqmovn.s16      d21, q11
  .if !\inner && !\simple
        veor            q1,  q1,  q13           @ PS2 = P2 ^ 0x80
        veor            q6,  q6,  q13           @ QS2 = Q2 ^ 0x80
  .endif
        vand            q10, q10, q8            @ w &= normal_limit

        @ registers used at this point..
        @   q0 -> P3  (don't corrupt)
        @   q1-q6 -> PS2-QS2
        @   q7 -> Q3  (don't corrupt)
        @   q9 -> hev
        @   q10 -> w
        @   q13 -> #0x80
        @   q14 -> #4
        @   q15 -> #3
        @   q8, q11, q12 -> unused

        @ filter_common:   is4tap==1
        @   c1 = clamp(w + 4) >> 3;
        @   c2 = clamp(w + 3) >> 3;
        @   Q0 = s2u(QS0 - c1);
        @   P0 = s2u(PS0 + c2);

  .if \simple
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
  .elseif \inner
        @ the !is4tap case of filter_common, only used for inner blocks
        @   c3 = ((c1&~hev) + 1) >> 1;
        @   Q1 = s2u(QS1 - c3);
        @   P1 = s2u(PS1 + c3);
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        vbic            q11, q11, q9            @ c1 & ~hev
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        vrshr.s8        q11, q11, #1            @ c3 >>= 1
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-c3)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+c3)
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
  .else
        vand            q12, q10, q9            @ w & hev
        vqadd.s8        q11, q12, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q12, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vbic            q10, q10, q9            @ w &= ~hev
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)

        @ filter_mbedge:
        @   a = clamp((27*w + 63) >> 7);
        @   Q0 = s2u(QS0 - a);
        @   P0 = s2u(PS0 + a);
        @   a = clamp((18*w + 63) >> 7);
        @   Q1 = s2u(QS1 - a);
        @   P1 = s2u(PS1 + a);
        @   a = clamp((9*w + 63) >> 7);
        @   Q2 = s2u(QS2 - a);
        @   P2 = s2u(PS2 + a);
        vmov.i16        q9,  #63
        vshll.s8        q14, d20, #3
        vshll.s8        q15, d21, #3
        vaddw.s8        q14, q14, d20
        vaddw.s8        q15, q15, d21
        vadd.s16        q8,  q9,  q14
        vadd.s16        q9,  q9,  q15           @  9*w + 63
        vadd.s16        q11, q8,  q14
        vadd.s16        q12, q9,  q15           @ 18*w + 63
        vadd.s16        q14, q11, q14
        vadd.s16        q15, q12, q15           @ 27*w + 63
        vqshrn.s16      d16, q8,  #7
        vqshrn.s16      d17, q9,  #7            @ clamp(( 9*w + 63)>>7)
        vqshrn.s16      d22, q11, #7
        vqshrn.s16      d23, q12, #7            @ clamp((18*w + 63)>>7)
        vqshrn.s16      d28, q14, #7
        vqshrn.s16      d29, q15, #7            @ clamp((27*w + 63)>>7)
        vqadd.s8        q1,  q1,  q8            @ PS2 = clamp(PS2+a)
        vqsub.s8        q6,  q6,  q8            @ QS2 = clamp(QS2-a)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+a)
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-a)
        vqadd.s8        q3,  q3,  q14           @ PS0 = clamp(PS0+a)
        vqsub.s8        q4,  q4,  q14           @ QS0 = clamp(QS0-a)
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q1,  q1,  q13           @ P2 = PS2 ^ 0x80
        veor            q6,  q6,  q13           @ Q2 = QS2 ^ 0x80
  .endif
.endm

456 | |

457 |
@ In-register transpose of a 16x8 byte matrix held in q0-q7
@ (used to convert between row and column order for the horizontal
@ loop filters). Self-inverse: applying it twice restores the input.
.macro  transpose8x16matrix
        vtrn.32         q0,  q4
        vtrn.32         q1,  q5
        vtrn.32         q2,  q6
        vtrn.32         q3,  q7

        vtrn.16         q0,  q2
        vtrn.16         q1,  q3
        vtrn.16         q4,  q6
        vtrn.16         q5,  q7

        vtrn.8          q0,  q1
        vtrn.8          q2,  q3
        vtrn.8          q4,  q5
        vtrn.8          q6,  q7
.endm

473 | |

474 |
@ Vertical (horizontal-edge) loop filter over a 16-pixel-wide edge.
@ r0 = dst (points at the edge row), r1 = stride, r2 = flim_E,
@ r3 = flim_I (unused for simple), [sp+64] = hev_thresh (after vpush).
.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  r1,  lsl #1+!\simple

        @ Load pixels:
  .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
        vld1.8          {q0},     [r0,:128], r1 @ P3
        vld1.8          {q1},     [r0,:128], r1 @ P2
  .endif
        vld1.8          {q2},     [r0,:128], r1 @ P1
        vld1.8          {q3},     [r0,:128], r1 @ P0
        vld1.8          {q4},     [r0,:128], r1 @ Q0
        vld1.8          {q5},     [r0,:128], r1 @ Q1
  .if !\simple
        vld1.8          {q6},     [r0,:128], r1 @ Q2
        vld1.8          {q7},     [r0,:128]     @ Q3
        vdup.8          q15, r3                 @ flim_I
  .endif
        vdup.8          q14, r2                 @ flim_E

        vp8_loop_filter inner=\inner, simple=\simple

        @ back up to P2:  dst -= stride * 6
        sub             r0,  r0,  r1,  lsl #2
  .if !\simple
        sub             r0,  r0,  r1,  lsl #1

        @ Store pixels:
        vst1.8          {q1},     [r0,:128], r1 @ P2
  .endif
        vst1.8          {q2},     [r0,:128], r1 @ P1
        vst1.8          {q3},     [r0,:128], r1 @ P0
        vst1.8          {q4},     [r0,:128], r1 @ Q0
        vst1.8          {q5},     [r0,:128], r1 @ Q1
  .if !\simple
        vst1.8          {q6},     [r0,:128]     @ Q2
  .endif

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

518 | |

519 |
@ Instantiate the normal, inner and simple vertical 16-wide filters.
vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1

522 | |

523 |
@ Vertical loop filter for an 8-wide U plane and 8-wide V plane edge,
@ processed together as one 16-wide edge (U in the low halves, V in the
@ high halves of each q register).
@ r0 = u, r1 = v, r2 = stride, r3 = flim_E,
@ [sp+64] = flim_I, [sp+68] = hev_thresh (offsets after vpush).
.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0},     [r0,:64], r2  @ P3
        vld1.8          {d1},     [r1,:64], r2  @ P3
        vld1.8          {d2},     [r0,:64], r2  @ P2
        vld1.8          {d3},     [r1,:64], r2  @ P2
        vld1.8          {d4},     [r0,:64], r2  @ P1
        vld1.8          {d5},     [r1,:64], r2  @ P1
        vld1.8          {d6},     [r0,:64], r2  @ P0
        vld1.8          {d7},     [r1,:64], r2  @ P0
        vld1.8          {d8},     [r0,:64], r2  @ Q0
        vld1.8          {d9},     [r1,:64], r2  @ Q0
        vld1.8          {d10},    [r0,:64], r2  @ Q1
        vld1.8          {d11},    [r1,:64], r2  @ Q1
        vld1.8          {d12},    [r0,:64], r2  @ Q2
        vld1.8          {d13},    [r1,:64], r2  @ Q2
        vld1.8          {d14},    [r0,:64]      @ Q3
        vld1.8          {d15},    [r1,:64]      @ Q3

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        @ back up to P2:  u,v -= stride * 6
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        sub             r0,  r0,  r2,  lsl #1
        sub             r1,  r1,  r2,  lsl #1

        @ Store pixels:
        vst1.8          {d2},     [r0,:64], r2  @ P2
        vst1.8          {d3},     [r1,:64], r2  @ P2
        vst1.8          {d4},     [r0,:64], r2  @ P1
        vst1.8          {d5},     [r1,:64], r2  @ P1
        vst1.8          {d6},     [r0,:64], r2  @ P0
        vst1.8          {d7},     [r1,:64], r2  @ P0
        vst1.8          {d8},     [r0,:64], r2  @ Q0
        vst1.8          {d9},     [r1,:64], r2  @ Q0
        vst1.8          {d10},    [r0,:64], r2  @ Q1
        vst1.8          {d11},    [r1,:64], r2  @ Q1
        vst1.8          {d12},    [r0,:64]      @ Q2
        vst1.8          {d13},    [r1,:64]      @ Q2

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

578 | |

579 |
@ Instantiate the normal and inner vertical chroma filters.
vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1

581 | |

582 |
@ Horizontal (vertical-edge) loop filter over 16 rows. Loads a 16x8
@ pixel block around the edge, transposes it so the edge lies along
@ register lanes, filters, transposes back and stores.
@ r0 = dst, r1 = stride, r2 = flim_E, r3 = flim_I,
@ [sp+64] = hev_thresh (offset after vpush).
.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4            @ back up 4 columns to P3
  .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
  .endif

        @ Load pixels:
        vld1.8          {d0},     [r0], r1      @ load first 8-line src data
        vld1.8          {d2},     [r0], r1
        vld1.8          {d4},     [r0], r1
        vld1.8          {d6},     [r0], r1
        vld1.8          {d8},     [r0], r1
        vld1.8          {d10},    [r0], r1
        vld1.8          {d12},    [r0], r1
        vld1.8          {d14},    [r0], r1
        vld1.8          {d1},     [r0], r1      @ load second 8-line src data
        vld1.8          {d3},     [r0], r1
        vld1.8          {d5},     [r0], r1
        vld1.8          {d7},     [r0], r1
        vld1.8          {d9},     [r0], r1
        vld1.8          {d11},    [r0], r1
        vld1.8          {d13},    [r0], r1
        vld1.8          {d15},    [r0], r1

        transpose8x16matrix

        vdup.8          q14, r2                 @ flim_E
  .if !\simple
        vdup.8          q15, r3                 @ flim_I
  .endif

        vp8_loop_filter inner=\inner, simple=\simple

        sub             r0,  r0,  r1,  lsl #4   @ backup 16 rows

        transpose8x16matrix

        @ Store pixels:
        vst1.8          {d0},     [r0], r1
        vst1.8          {d2},     [r0], r1
        vst1.8          {d4},     [r0], r1
        vst1.8          {d6},     [r0], r1
        vst1.8          {d8},     [r0], r1
        vst1.8          {d10},    [r0], r1
        vst1.8          {d12},    [r0], r1
        vst1.8          {d14},    [r0], r1
        vst1.8          {d1},     [r0], r1
        vst1.8          {d3},     [r0], r1
        vst1.8          {d5},     [r0], r1
        vst1.8          {d7},     [r0], r1
        vst1.8          {d9},     [r0], r1
        vst1.8          {d11},    [r0], r1
        vst1.8          {d13},    [r0], r1
        vst1.8          {d15},    [r0]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

643 | |

644 |
@ Instantiate the normal, inner and simple horizontal 16-row filters.
vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1

647 | |

648 |
@ Horizontal loop filter for 8 rows each of U and V, processed together
@ (U rows in even d registers' block, V rows in odd) via transpose.
@ r0 = u, r1 = v, r2 = stride, r3 = flim_E,
@ [sp+64] = flim_I, [sp+68] = hev_thresh (offsets after vpush).
.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4            @ back up 4 columns to P3
        sub             r1,  r1,  #4
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0},     [r0], r2      @ load u
        vld1.8          {d1},     [r1], r2      @ load v
        vld1.8          {d2},     [r0], r2
        vld1.8          {d3},     [r1], r2
        vld1.8          {d4},     [r0], r2
        vld1.8          {d5},     [r1], r2
        vld1.8          {d6},     [r0], r2
        vld1.8          {d7},     [r1], r2
        vld1.8          {d8},     [r0], r2
        vld1.8          {d9},     [r1], r2
        vld1.8          {d10},    [r0], r2
        vld1.8          {d11},    [r1], r2
        vld1.8          {d12},    [r0], r2
        vld1.8          {d13},    [r1], r2
        vld1.8          {d14},    [r0], r2
        vld1.8          {d15},    [r1], r2

        transpose8x16matrix

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        sub             r0,  r0,  r2,  lsl #3   @ backup u 8 rows
        sub             r1,  r1,  r2,  lsl #3   @ backup v 8 rows

        transpose8x16matrix

        @ Store pixels:
        vst1.8          {d0},     [r0], r2
        vst1.8          {d1},     [r1], r2
        vst1.8          {d2},     [r0], r2
        vst1.8          {d3},     [r1], r2
        vst1.8          {d4},     [r0], r2
        vst1.8          {d5},     [r1], r2
        vst1.8          {d6},     [r0], r2
        vst1.8          {d7},     [r1], r2
        vst1.8          {d8},     [r0], r2
        vst1.8          {d9},     [r1], r2
        vst1.8          {d10},    [r0], r2
        vst1.8          {d11},    [r1], r2
        vst1.8          {d12},    [r0], r2
        vst1.8          {d13},    [r1], r2
        vst1.8          {d14},    [r0]
        vst1.8          {d15},    [r1]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

708 | |

709 |
@ Instantiate the normal and inner horizontal chroma filters.
vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1

711 | |

712 |
@ Straight 16xH copy (no filtering). r0 = dst, r1 = dst stride,
@ r2 = src, r3 = src stride, [sp] = h. h is assumed to be a multiple
@ of 4 (the loop copies 4 rows per iteration).
function ff_put_vp8_pixels16_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {q0},     [r2],      r3
        vld1.8          {q1},     [r2],      r3
        vld1.8          {q2},     [r2],      r3
        vld1.8          {q3},     [r2],      r3
        vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q1},     [r0,:128], r1
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b
        bx              lr
endfunc

727 | |

728 |
@ Straight 8xH copy. Same calling convention as the 16-wide variant;
@ 4 rows per loop iteration.
function ff_put_vp8_pixels8_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {d0},     [r2],     r3
        vld1.8          {d1},     [r2],     r3
        vld1.8          {d2},     [r2],     r3
        vld1.8          {d3},     [r2],     r3
        vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d1},     [r0,:64], r1
        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        bgt             1b
        bx              lr
endfunc

743 | |

744 |
@ Straight 4xH copy using plain word loads/stores (NEON gives no win at
@ this width). Word accesses may be unaligned; assumes the target
@ supports unaligned ldr/str — matches the original implementation.
function ff_put_vp8_pixels4_neon, export=1
        ldr             r12, [sp, #0]           @ h
        push            {r4-r6,lr}
1:
        subs            r12, r12, #4
        ldr             r4,  [r2], r3
        ldr             r5,  [r2], r3
        ldr             r6,  [r2], r3
        ldr             lr,  [r2], r3
        str             r4,  [r0], r1
        str             r5,  [r0], r1
        str             r6,  [r0], r1
        str             lr,  [r0], r1
        bgt             1b
        pop             {r4-r6,pc}
endfunc

760 | |

761 |
/* 4/6-tap 8th-pel MC */

762 | |

763 |
@ 6-tap horizontal filter producing 8 output pixels.
@ \d = output d reg, \a/\b = 16 source bytes; filter coeffs in d0-d1.
@ Clobbers q8-q13, d27-d31.
.macro  vp8_epel8_h6    d,   a,   b
        vext.8          d27, \a,  \b,  #1
        vmovl.u8        q8,  \a
        vext.8          d28, \a,  \b,  #2
        vmovl.u8        q9,  d27
        vext.8          d29, \a,  \b,  #3
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #4
        vmovl.u8        q11, d29
        vext.8          d31, \a,  \b,  #5
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmovl.u8        q13, d31
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8,  d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11           @ saturating combine of both halves
        vqrshrun.s16    \d,  q11, #7            @ round, >>7, saturate to u8
.endm

784 | |

785 |
@ 6-tap horizontal filter producing 16 output pixels in one pass.
@ \d0/\d1 = output d regs, \s0-\s2 = leftmost source bytes,
@ \q0/\q1 = 24 source bytes; filter coeffs in d0-d1.
@ Clobbers q1-q3, q8-q15.
.macro  vp8_epel16_h6   d0,  d1,  s0,  s1,  s2,  q0,  q1
        vext.8          q14, \q0, \q1, #3
        vext.8          q15, \q0, \q1, #4
        vmovl.u8        q11, d28
        vmovl.u8        q14, d29
        vext.8          q3,  \q0, \q1, #2
        vmovl.u8        q12, d30
        vmovl.u8        q15, d31
        vext.8          q8,  \q0, \q1, #1
        vmovl.u8        q10, d6
        vmovl.u8        q3,  d7
        vext.8          q2,  \q0, \q1, #5
        vmovl.u8        q13, d4
        vmovl.u8        q2,  d5
        vmovl.u8        q9,  d16
        vmovl.u8        q8,  d17
        vmul.u16        q11, q11, d0[3]
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q3,  q3,  d0[2]
        vmul.u16        q14, q14, d0[3]
        vmls.u16        q11, q12, d1[0]
        vmovl.u8        q12, \s0
        vmovl.u8        q1,  \s1
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q3,  q8,  d0[1]
        vmls.u16        q14, q15, d1[0]
        vmla.u16        q10, q12, d0[0]
        vmla.u16        q11, q13, d1[1]
        vmla.u16        q3,  q1,  d0[0]
        vmla.u16        q14, q2,  d1[1]
        vqadd.s16       q11, q10, q11
        vqadd.s16       q14, q3,  q14
        vqrshrun.s16    \d0, q11, #7
        vqrshrun.s16    \d1, q14, #7
.endm

820 | |

821 |
@ 6-tap vertical filter producing one row of 8 output pixels.
@ \s0-\s5 = six consecutive source rows; filter coeffs in d0-d1.
@ Clobbers q8-q13.
.macro  vp8_epel8_v6    d0,  s0,  s1,  s2,  s3,  s4,  s5
        vmovl.u8        q10, \s2
        vmovl.u8        q11, \s3
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s0
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8,  d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d0, q11, #7
.endm

837 | |

838 |
@ 6-tap vertical filter producing two consecutive output rows at once
@ from seven source rows \s0-\s6; filter coeffs in d0-d1.
@ Clobbers q8-q15.
.macro  vp8_epel8_v6_y2 d0,  d1,  s0,  s1,  s2,  s3,  s4,  s5,  s6
        vmovl.u8        q10, \s0
        vmovl.u8        q11, \s3
        vmovl.u8        q14, \s6
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s2
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[0]
        vmul.u16        q15, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q14, q14, d1[1]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q15, q12, d1[0]
        vmls.u16        q11, q8,  d0[1]
        vmls.u16        q14, q13, d1[0]
        vmla.u16        q10, q8,  d0[2]
        vmla.u16        q15, q13, d1[1]
        vmla.u16        q11, q9,  d0[0]
        vmla.u16        q14, q12, d0[3]
        vqadd.s16       q15, q10, q15
        vqadd.s16       q14, q11, q14
        vqrshrun.s16    \d0, q15, #7
        vqrshrun.s16    \d1, q14, #7
.endm

863 | |

864 |
@ 4-tap horizontal filter producing 8 output pixels (outer taps of the
@ 6-tap filter are zero for these subpel positions).
@ \d = output d reg, \a/\b = source bytes; coeffs in d0-d1.
@ Clobbers q9-q12, d28-d30.
.macro  vp8_epel8_h4    d,   a,   b
        vext.8          d28, \a,  \b,  #1
        vmovl.u8        q9,  \a
        vext.8          d29, \a,  \b,  #2
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #3
        vmovl.u8        q11, d29
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d,  q11, #7
.endm

879 | |

880 |
@ 4-tap vertical filter producing two consecutive output rows at once
@ from five source rows \s0-\s4; coeffs in d0-d1.
@ Clobbers q8-q15.
.macro  vp8_epel8_v4_y2 d0,  d1,  s0,  s1,  s2,  s3,  s4
        vmovl.u8        q9,  \s0
        vmovl.u8        q10, \s1
        vmovl.u8        q11, \s2
        vmovl.u8        q12, \s3
        vmovl.u8        q13, \s4
        vmul.u16        q8,  q10, d0[2]
        vmul.u16        q14, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q15, q12, d0[3]
        vmls.u16        q8,  q9,  d0[1]
        vmls.u16        q14, q12, d1[0]
        vmls.u16        q11, q10, d0[1]
        vmls.u16        q15, q13, d1[0]
        vqadd.s16       q8,  q8,  q14
        vqadd.s16       q11, q11, q15
        vqrshrun.s16    \d0, q8,  #7
        vqrshrun.s16    \d1, q11, #7
.endm

899 | |

900 |
@ 16-wide, 6-tap vertical-only subpel MC.
@ r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
@ [sp+72] = h, [sp+80] = my (offsets after push+vpush).
@ Filter row selected from subpel_filters by my (my != 0 here).
function ff_put_vp8_epel16_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ start 2 rows above
        push            {r4,lr}
        vpush           {d8-d15}

        ldr             r4,  [sp, #80]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #72]          @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]     @ filter coefficients
1:
        @ seven source rows feed two output rows per iteration
        vld1.8          {d2-d3},  [r2], r3
        vld1.8          {d4-d5},  [r2], r3
        vld1.8          {d6-d7},  [r2], r3
        vld1.8          {d8-d9},  [r2], r3
        vld1.8          {d10-d11},[r2], r3
        vld1.8          {d12-d13},[r2], r3
        vld1.8          {d14-d15},[r2]
        sub             r2,  r2,  r3,  lsl #2   @ rewind: 5-row overlap

        vp8_epel8_v6_y2 d2,  d4,  d2,  d4,  d6,  d8,  d10, d12, d14
        vp8_epel8_v6_y2 d3,  d5,  d3,  d5,  d7,  d9,  d11, d13, d15

        vst1.8          {d2-d3},  [r0,:128], r1
        vst1.8          {d4-d5},  [r0,:128], r1
        subs            r12, r12, #2
        bne             1b

        vpop            {d8-d15}
        pop             {r4,pc}
endfunc

931 | |

932 |
@ 16-wide, 6-tap horizontal-only subpel MC.
@ r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
@ [sp+8] = h, [sp+12] = mx (offsets after push).
function ff_put_vp8_epel16_h6_neon, export=1
        sub             r2,  r2,  #2            @ start 2 columns left
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]     @ filter coefficients
1:
        vld1.8          {d2-d4},  [r2], r3      @ 24 source bytes

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2

        vst1.8          {d2-d3},  [r0,:128], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

952 | |

953 |
@ 16-wide, 6-tap horizontal+vertical subpel MC, two-pass through a
@ 16-byte-aligned temporary on the stack (336 bytes = (h+5 max) rows
@ of 16, +16 for alignment).
@ r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
@ [sp+24] = h, [sp+28] = mx, [sp+32] = my (offsets after push+vpush).
function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ start 2 rows above
        sub             r2,  r2,  #2            @ and 2 columns left
        push            {r4,lr}
        vpush           {d8-d9}

        @ first pass (horizontal):
        ldr             r4,  [sp, #28]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #24]          @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #336+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ filter h+5 rows for pass 2
        bic             lr,  lr,  #15           @ 16-align the temp buffer
1:
        vld1.8          {d2,d3,d4}, [r2], r3

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2

        vst1.8          {d2-d3},  [lr,:128]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #336+16+32]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #336+16+24]   @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!    @ 7 temp rows per 2 outputs
        vld1.8          {d6-d9},  [lr,:128]!
        vld1.8          {d28-d31},[lr,:128]
        sub             lr,  lr,  #48           @ rewind: 5-row overlap

        vp8_epel8_v6    d2,  d2,  d4,  d6,  d8,  d28, d30
        vp8_epel8_v6    d3,  d3,  d5,  d7,  d9,  d29, d31

        vst1.8          {d2-d3},  [r0,:128], r1
        subs            r12, r12, #1
        bne             2b

        add             sp,  sp,  #336+16
        vpop            {d8-d9}
        pop             {r4,pc}
endfunc

1003 | |

1004 |
@ 8-wide, 6-tap vertical-only subpel MC.
@ r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
@ [sp+8] = h, [sp+16] = my (offsets after push).
function ff_put_vp8_epel8_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ start 2 rows above
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        @ seven source rows feed two output rows per iteration
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vld1.8          {d4},     [r2], r3
        vld1.8          {d5},     [r2], r3
        vld1.8          {d6},     [r2], r3
        vld1.8          {d7},     [r2], r3
        vld1.8          {d28},    [r2]

        sub             r2,  r2,  r3,  lsl #2   @ rewind: 5-row overlap

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc

1033 | |

1034 |
@ 8-pixel-wide, 6-tap horizontal subpel interpolation.
@ In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
@      after push {r4,lr}: [sp,#8] = h, [sp,#12] = mx
@ dst must be 8-byte aligned (:64 store hint).  One row per iteration.
function ff_put_vp8_epel8_h6_neon, export=1
        sub             r2,  r2,  #2            @ back src up 2 pixels for the left taps
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16  @ mx is 1..7; -16 biases out row 0
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]     @ q0 = 6-tap filter coefficients
1:
        vld1.8          {d2,d3},  [r2], r3      @ 13 pixels needed; load 16

        vp8_epel8_h6    d2,  d2,  d3            @ macro defined earlier in this file

        vst1.8          {d2},     [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

1054 | |

1055 |
@ 8-pixel-wide, 6-tap horizontal + 6-tap vertical subpel interpolation.
@ In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
@      after push {r4,lr}: [sp,#8] = h, [sp,#12] = mx, [sp,#16] = my
@ Pass 1 filters h+5 rows horizontally into a 16-byte-aligned scratch
@ buffer on the stack (168 bytes = 21 rows of 8, enough for h <= 16);
@ pass 2 filters that buffer vertically into dst, two rows per
@ iteration (h must be even).
function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ 2 rows above for the vertical taps
        sub             r2,  r2,  #2            @ 2 pixels left for the horizontal taps
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16       @ scratch + room to align to 16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ 5 extra rows for the 6-tap pass 2
        bic             lr,  lr,  #15           @ lr = aligned scratch base
1:
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2},     [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!    @ 7 scratch rows for 2 output rows
        vld1.8          {d6-d7},  [lr,:128]!
        vld1.8          {d30},    [lr,:64]
        sub             lr,  lr,  #48           @ step back: advance by 2 rows net

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

1103 | |

1104 |
@ 8-pixel-wide, 4-tap vertical subpel interpolation.
@ In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
@      after push {r4,lr}: [sp,#8] = h, [sp,#16] = my
@ Two output rows per iteration; h must be even.
function ff_put_vp8_epel8_v4_neon, export=1
        sub             r2,  r2,  r3            @ back src up 1 row for the tap above
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]     @ q0 = filter coefficients
1:
        @ 5 consecutive source rows feed two 4-tap output rows
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vld1.8          {d4},     [r2], r3
        vld1.8          {d5},     [r2], r3
        vld1.8          {d6},     [r2]
        sub             r2,  r2,  r3,  lsl #1   @ rewind: next pair reuses 3 rows

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc

1130 | |

1131 |
@ 8-pixel-wide, 4-tap horizontal subpel interpolation.
@ In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
@      after push {r4,lr}: [sp,#8] = h, [sp,#12] = mx
@ One row per iteration; dst 8-byte aligned (:64 store hint).
function ff_put_vp8_epel8_h4_neon, export=1
        sub             r2,  r2,  #1            @ back src up 1 pixel for the left tap
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h4    d2,  d2,  d3            @ macro defined earlier in this file

        vst1.8          {d2},     [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

1151 | |

1152 |
@ 8-pixel-wide, 4-tap horizontal + 4-tap vertical subpel interpolation.
@ In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
@      after push {r4,lr}: [sp,#8] = h, [sp,#12] = mx, [sp,#16] = my
@ Pass 1 filters h+3 rows into a 16-byte-aligned stack scratch buffer;
@ pass 2 filters it vertically, two output rows per iteration (h even).
function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             r2,  r2,  r3            @ 1 row above for the vertical taps
        sub             r2,  r2,  #1            @ 1 pixel left for the horizontal taps
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16       @ scratch + alignment slack
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3            @ 3 extra rows for the 4-tap pass 2
        bic             lr,  lr,  #15           @ lr = aligned scratch base
1:
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2},     [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!    @ 5 scratch rows for 2 output rows
        vld1.8          {d6},     [lr,:64]
        sub             lr,  lr,  #16           @ step back: advance by 2 rows net

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

1199 | |

1200 |
@ 8-pixel-wide, 6-tap horizontal + 4-tap vertical subpel interpolation.
@ In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
@      after push {r4,lr}: [sp,#8] = h, [sp,#12] = mx, [sp,#16] = my
@ Pass 1: 6-tap horizontal into aligned stack scratch (h+3 rows);
@ pass 2: 4-tap vertical from scratch, two rows per iteration (h even).
function ff_put_vp8_epel8_h6v4_neon, export=1
        sub             r2,  r2,  r3            @ 1 row above for the 4-tap vertical
        sub             r2,  r2,  #2            @ 2 pixels left for the 6-tap horizontal
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3            @ 3 extra rows for pass 2
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2},     [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!    @ 5 scratch rows for 2 output rows
        vld1.8          {d6},     [lr,:64]
        sub             lr,  lr,  #16           @ step back: advance by 2 rows net

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

1247 | |

1248 |
@ 8-pixel-wide, 4-tap horizontal + 6-tap vertical subpel interpolation.
@ In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
@      after push {r4,lr}: [sp,#8] = h, [sp,#12] = mx, [sp,#16] = my
@ Pass 1: 4-tap horizontal into aligned stack scratch (h+5 rows);
@ pass 2: 6-tap vertical from scratch, two rows per iteration (h even).
function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ 2 rows above for the 6-tap vertical
        sub             r2,  r2,  #1            @ 1 pixel left for the 4-tap horizontal
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ 5 extra rows for the 6-tap pass 2
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2},     [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!    @ 7 scratch rows for 2 output rows
        vld1.8          {d6-d7},  [lr,:128]!
        vld1.8          {d30},    [lr,:64]
        sub             lr,  lr,  #32           @ step back: advance by 2 rows net

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

1296 | |

1297 |
@ 4-pixel-wide, 6-tap vertical subpel interpolation.
@ In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
@      after push {r4,lr}: [sp,#8] = h, [sp,#16] = my
@ Two 4-pixel blocks are packed per d-register (lane 0 = rows n..,
@ lane 1 = rows n+2..) so the 8-wide filter macro produces four output
@ rows per iteration; h must be a multiple of 4.
function ff_put_vp8_epel4_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ back src up 2 rows for the taps above
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        @ rows 0..6 into lane 0 of d2..d7,d28
        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2], r3
        vld1.32         {d7[]},   [r2], r3
        vld1.32         {d28[]},  [r2]
        sub             r2,  r2,  r3,  lsl #2   @ back to row 2
        @ rows 2..8 into lane 1 (offset by two output rows)
        vld1.32         {d2[1]},  [r2], r3
        vld1.32         {d3[1]},  [r2], r3
        vld1.32         {d4[1]},  [r2], r3
        vld1.32         {d5[1]},  [r2], r3
        vld1.32         {d6[1]},  [r2], r3
        vld1.32         {d7[1]},  [r2], r3
        vld1.32         {d28[1]}, [r2]
        sub             r2,  r2,  r3,  lsl #2   @ net advance: 4 rows per iteration

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28

        @ lanes interleave as rows 0,1 (lane 0) and 2,3 (lane 1)
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc

1335 | |

1336 |
@ 4-pixel-wide, 6-tap horizontal subpel interpolation.
@ In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
@      after push {r4,lr}: [sp,#8] = h, [sp,#12] = mx
@ Uses the 8-wide filter macro and stores only the low 4 pixels.
function ff_put_vp8_epel4_h6_neon, export=1
        sub             r2,  r2,  #2            @ back src up 2 pixels for the left taps
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

1354 | |

1355 |
@ 4-pixel-wide, 6-tap horizontal + 6-tap vertical subpel interpolation.
@ In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
@      after push {r4,lr}: [sp,#8] = h, [sp,#12] = mx, [sp,#16] = my
@ Pass 1 writes h+5 4-byte rows to aligned stack scratch.  Pass 2
@ loads two overlapping windows (rows n.. and n+2..), packs them into
@ d-register lanes with vtrn, and emits four rows per iteration
@ (h must be a multiple of 4).
function ff_put_vp8_epel4_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ 2 rows above for the vertical taps
        sub             r2,  r2,  #2            @ 2 pixels left for the horizontal taps
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #52+16        @ scratch: 13 rows of 4 + align slack
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ 5 extra rows for the 6-tap pass 2
        bic             lr,  lr,  #15
1:
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        @ window A: scratch rows n..n+6 (lane 0 inputs)
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]!
        vld1.32         {d28[]},  [lr,:32]
        sub             lr,  lr,  #16           @ back to row n+2
        @ window B: scratch rows n+2..n+8 (lane 1 inputs; unaligned)
        vld1.8          {d4-d5},  [lr]!
        vld1.8          {d7},     [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]
        sub             lr,  lr,  #16           @ net advance: 4 rows per iteration
        vtrn.32         q1,  q2                 @ interleave windows into lanes
        vtrn.32         d6,  d7
        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #52+16
        pop             {r4,pc}
endfunc

1405 | |

1406 |
@ 4-pixel-wide, 4-tap horizontal + 6-tap vertical subpel interpolation.
@ In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
@      after push {r4,lr}: [sp,#8] = h, [sp,#12] = mx, [sp,#16] = my
@ Same two-pass scheme as epel4_h6v6 but with a 4-tap first pass
@ (only 8 source pixels needed, so one d-register load suffices).
@ h must be a multiple of 4.
function ff_put_vp8_epel4_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ 2 rows above for the vertical taps
        sub             r2,  r2,  #1            @ 1 pixel left for the horizontal taps
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #52+16        @ scratch: 13 rows of 4 + align slack
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ 5 extra rows for the 6-tap pass 2
        bic             lr,  lr,  #15
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        @ window A: scratch rows n..n+6 (lane 0 inputs)
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]!
        vld1.32         {d28[]},  [lr,:32]
        sub             lr,  lr,  #16           @ back to row n+2
        @ window B: scratch rows n+2..n+8 (lane 1 inputs; unaligned)
        vld1.8          {d4-d5},  [lr]!
        vld1.8          {d7},     [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]
        sub             lr,  lr,  #16           @ net advance: 4 rows per iteration
        vtrn.32         q1,  q2                 @ interleave windows into lanes
        vtrn.32         d6,  d7
        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #52+16
        pop             {r4,pc}
endfunc

1456 | |

1457 |
@ 4-pixel-wide, 6-tap horizontal + 4-tap vertical subpel interpolation.
@ In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
@      after push {r4,lr}: [sp,#8] = h, [sp,#12] = mx, [sp,#16] = my
@ Pass 1: 6-tap horizontal into aligned stack scratch (h+3 4-byte
@ rows).  Pass 2: 4-tap vertical over two lane-packed windows, four
@ output rows per iteration (h must be a multiple of 4).
function ff_put_vp8_epel4_h6v4_neon, export=1
        sub             r2,  r2,  r3            @ 1 row above for the 4-tap vertical
        sub             r2,  r2,  #2            @ 2 pixels left for the 6-tap horizontal
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #44+16        @ scratch: 11 rows of 4 + align slack
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3            @ 3 extra rows for the 4-tap pass 2
        bic             lr,  lr,  #15
1:
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        @ window A: scratch rows n..n+4 (lane 0 inputs)
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.32         {d6[]},   [lr,:32]
        sub             lr,  lr,  #8            @ back to row n+2
        @ window B: scratch rows n+2..n+6 (lane 1 inputs; unaligned)
        vld1.8          {d4-d5},  [lr]!
        vld1.32         {d6[1]},  [lr,:32]
        sub             lr,  lr,  #8            @ net advance: 4 rows per iteration
        vtrn.32         q1,  q2                 @ interleave windows into lanes
        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16
        pop             {r4,pc}
endfunc

1504 | |

1505 |
@ 4-pixel-wide, 4-tap horizontal subpel interpolation.
@ In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
@      after push {r4,lr}: [sp,#8] = h, [sp,#12] = mx
@ 8 source pixels (one d-register) cover the 4+3 taps needed.
function ff_put_vp8_epel4_h4_neon, export=1
        sub             r2,  r2,  #1            @ back src up 1 pixel for the left tap
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]},  [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

1523 | |

1524 |
@ 4-pixel-wide, 4-tap vertical subpel interpolation.
@ In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
@      after push {r4,lr}: [sp,#8] = h, [sp,#16] = my
@ Two 4-pixel row groups are lane-packed so four output rows come out
@ of each iteration; h must be a multiple of 4.
function ff_put_vp8_epel4_v4_neon, export=1
        sub             r2,  r2,  r3            @ back src up 1 row for the tap above
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        @ rows 0..4 into lane 0
        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2]
        sub             r2,  r2,  r3,  lsl #1   @ back to row 2
        @ rows 2..6 into lane 1
        vld1.32         {d2[1]},  [r2], r3
        vld1.32         {d3[1]},  [r2], r3
        vld1.32         {d4[1]},  [r2], r3
        vld1.32         {d5[1]},  [r2], r3
        vld1.32         {d6[1]},  [r2]
        sub             r2,  r2,  r3,  lsl #1   @ net advance: 4 rows per iteration

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc

1558 | |

1559 |
@ 4-pixel-wide, 4-tap horizontal + 4-tap vertical subpel interpolation.
@ In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
@      after push {r4,lr}: [sp,#8] = h, [sp,#12] = mx, [sp,#16] = my
@ Pass 1: 4-tap horizontal into aligned stack scratch (h+3 4-byte
@ rows).  Pass 2: 4-tap vertical over two lane-packed windows, four
@ output rows per iteration (h must be a multiple of 4).
function ff_put_vp8_epel4_h4v4_neon, export=1
        sub             r2,  r2,  r3            @ 1 row above for the vertical taps
        sub             r2,  r2,  #1            @ 1 pixel left for the horizontal taps
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #44+16        @ scratch: 11 rows of 4 + align slack
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3            @ 3 extra rows for pass 2
        bic             lr,  lr,  #15
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        @ window A: scratch rows n..n+4 (lane 0 inputs)
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.32         {d6[]},   [lr,:32]
        sub             lr,  lr,  #8            @ back to row n+2
        @ window B: scratch rows n+2..n+6 (lane 1 inputs; unaligned)
        vld1.8          {d4-d5},  [lr]!
        vld1.32         {d6[1]},  [lr,:32]
        sub             lr,  lr,  #8            @ net advance: 4 rows per iteration
        vtrn.32         q1,  q2                 @ interleave windows into lanes
        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16
        pop             {r4,pc}
endfunc

1606 | |

1607 |
@ VP8 six-tap subpel filter coefficients, one 16-byte row per subpel
@ position 1..7 (position 0 = full-pel has no row; callers index via
@ subpel_filters-16 + pos*16).  Stored as .short so the vmull/vmlal
@ filter macros can use 16-bit arithmetic:
@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
@ arithmetic can be used to apply filters
const   subpel_filters, align=4
        .short     0,   6, 123,  12,   1,   0,   0,   0
        .short     2,  11, 108,  36,   8,   1,   0,   0
        .short     0,   9,  93,  50,   6,   0,   0,   0
        .short     3,  16,  77,  77,  16,   3,   0,   0
        .short     0,   6,  50,  93,   9,   0,   0,   0
        .short     1,   8,  36, 108,  11,   2,   0,   0
        .short     0,   1,  12, 123,   6,   0,   0,   0
endconst

1618 | |

1619 |
/* Bilinear MC */ |

1620 | |

1621 |
@ 16-pixel-wide horizontal bilinear MC:
@   dst[i] = ((8-mx)*src[i] + mx*src[i+1] + 4) >> 3   (vrshrn #3)
@ In:  r0 = dst (16-byte aligned), r1 = stride, r2 = src
@      stack: [sp] = h, [sp,#4] = mx
@ NOTE(review): src is advanced by r1 (the dst stride) — assumes the
@ caller passes equal src/dst strides; confirm against callers.
@ Two rows per iteration; h must be even.
function ff_put_vp8_bilin16_h_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8            @ r12 = 8 - mx
        vdup.8          d0,  r3                 @ d0 = mx
        vdup.8          d1,  r12                @ d1 = 8 - mx
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {d2-d4},  [r2], r1      @ 17 pixels needed; load 24
        vext.8          q2,  q1,  q2,  #1       @ q2 = src+1
        vmull.u8        q8,  d2,  d1
        vmlal.u8        q8,  d4,  d0
        vld1.8          {d18-d20},[r2], r1      @ second row
        vmull.u8        q3,  d3,  d1
        vmlal.u8        q3,  d5,  d0
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vrshrn.u16      d4,  q8,  #3            @ rounding narrow: (x+4)>>3
        vrshrn.u16      d5,  q3,  #3
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

1651 | |

1652 |
@ 16-pixel-wide vertical bilinear MC:
@   dst[y] = ((8-my)*src[y] + my*src[y+1] + 4) >> 3
@ In:  r0 = dst (16-byte aligned), r1 = stride, r2 = src
@      stack: [sp] = h, [sp,#8] = my
@ NOTE(review): src advances by r1 (dst stride) — assumes equal strides.
@ Keeps the previous row live in q1 across iterations; two rows per
@ iteration, h must be even.
function ff_put_vp8_bilin16_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8            @ r12 = 8 - my
        vdup.8          d0,  r3                 @ d0 = my
        vdup.8          d1,  r12                @ d1 = 8 - my
        ldr             r12, [sp]               @ h
        vld1.8          {q1},     [r2], r1      @ prime q1 with row 0
1:
        subs            r12, r12, #2
        vld1.8          {q2},     [r2], r1
        vmull.u8        q3,  d2,  d1            @ prev row * (8-my)
        vmlal.u8        q3,  d4,  d0            @ + cur row * my
        vmull.u8        q8,  d3,  d1
        vmlal.u8        q8,  d5,  d0
        vld1.8          {q1},     [r2], r1      @ next row (also next iter's prev)
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d2,  d0
        vmull.u8        q10, d5,  d1
        vmlal.u8        q10, d3,  d0
        vrshrn.u16      d4,  q3,  #3
        vrshrn.u16      d5,  q8,  #3
        vrshrn.u16      d6,  q9,  #3
        vrshrn.u16      d7,  q10, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

1681 | |

1682 |
@ 16-pixel-wide 2-D bilinear MC: horizontal blend with mx/(8-mx),
@ then vertical blend of the intermediate rows with my/(8-my); each
@ stage uses a rounding narrow shift (vrshrn #3).
@ In:  r0 = dst (16-byte aligned), r1 = stride, r2 = src
@      stack: [sp] = h, [sp,#4] = mx, [sp,#8] = my
@ NOTE(review): src advances by r1 (dst stride) — assumes equal strides.
@ q2 carries the previous horizontally-filtered row across iterations;
@ two output rows per iteration, h must be even.
function ff_put_vp8_bilin16_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3                 @ d0 = mx
        vdup.8          d1,  r12                @ d1 = 8 - mx
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d2,  r3                 @ d2 = my
        vdup.8          d3,  r12                @ d3 = 8 - my
        ldr             r12, [sp]               @ h

        @ prime q2 with the horizontally filtered row 0
        vld1.8          {d4-d6},  [r2], r1
        vext.8          q3,  q2,  q3,  #1       @ q3 = src+1
        vmull.u8        q8,  d4,  d1
        vmlal.u8        q8,  d6,  d0
        vmull.u8        q9,  d5,  d1
        vmlal.u8        q9,  d7,  d0
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q9,  #3
1:
        subs            r12, r12, #2
        @ horizontal filter, next two rows -> q3 and q2
        vld1.8          {d18-d20},[r2], r1
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vld1.8          {d26-d28},[r2], r1
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vext.8          q14, q13, q14, #1
        vmull.u8        q8,  d26, d1
        vmlal.u8        q8,  d28, d0
        vmull.u8        q9,  d27, d1
        vmlal.u8        q9,  d29, d0
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        @ vertical blend: prev(q2) with q3
        vmull.u8        q12, d4,  d3
        vmlal.u8        q12, d6,  d2
        vmull.u8        q15, d5,  d3
        vmlal.u8        q15, d7,  d2
        vrshrn.u16      d4,  q8,  #3            @ q2 = filtered second row
        vrshrn.u16      d5,  q9,  #3
        @ vertical blend: q3 with q2
        vmull.u8        q10, d6,  d3
        vmlal.u8        q10, d4,  d2
        vmull.u8        q11, d7,  d3
        vmlal.u8        q11, d5,  d2
        vrshrn.u16      d24, q12, #3
        vrshrn.u16      d25, q15, #3
        vst1.8          {q12},    [r0,:128], r1
        vrshrn.u16      d20, q10, #3
        vrshrn.u16      d21, q11, #3
        vst1.8          {q10},    [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

1737 | |

1738 |
@ 8-pixel-wide horizontal bilinear MC:
@   dst[i] = ((8-mx)*src[i] + mx*src[i+1] + 4) >> 3
@ In:  r0 = dst (8-byte aligned), r1 = stride, r2 = src
@      stack: [sp] = h, [sp,#4] = mx
@ NOTE(review): src advances by r1 (dst stride) — assumes equal strides.
@ Two rows per iteration; h must be even.
function ff_put_vp8_bilin8_h_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3                 @ d0 = mx
        vdup.8          d1,  r12                @ d1 = 8 - mx
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {q1},     [r2], r1      @ 9 pixels needed; load 16
        vext.8          d3,  d2,  d3,  #1       @ d3 = src+1
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vld1.8          {q3},     [r2], r1
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d4,  q2,  #3
        vrshrn.u16      d16, q8,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d16},    [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

1762 | |

1763 |
@ 8-pixel-wide vertical bilinear MC:
@   dst[y] = ((8-my)*src[y] + my*src[y+1] + 4) >> 3
@ In:  r0 = dst (8-byte aligned), r1 = stride, r2 = src
@      stack: [sp] = h, [sp,#8] = my
@ NOTE(review): src advances by r1 (dst stride) — assumes equal strides.
@ d2 carries the previous row; two rows per iteration, h must be even.
function ff_put_vp8_bilin8_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d0,  r3                 @ d0 = my
        vdup.8          d1,  r12                @ d1 = 8 - my
        ldr             r12, [sp]               @ h
        vld1.8          {d2},     [r2], r1      @ prime d2 with row 0
1:
        subs            r12, r12, #2
        vld1.8          {d3},     [r2], r1
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vld1.8          {d2},     [r2], r1      @ next row; next iteration's prev
        vmull.u8        q3,  d3,  d1
        vmlal.u8        q3,  d2,  d0
        vrshrn.u16      d4,  q2,  #3
        vrshrn.u16      d6,  q3,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d6},     [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

1786 | |

1787 |
@ 8-pixel-wide 2-D bilinear MC: horizontal mx blend, then vertical my
@ blend of the intermediate rows; rounding narrow shift at each stage.
@ In:  r0 = dst (8-byte aligned), r1 = stride, r2 = src
@      stack: [sp] = h, [sp,#4] = mx, [sp,#8] = my
@ NOTE(review): src advances by r1 (dst stride) — assumes equal strides.
@ d22 carries the previous horizontally-filtered row; two output rows
@ per iteration, h must be even.
function ff_put_vp8_bilin8_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3                 @ d0 = mx
        vdup.8          d1,  r12                @ d1 = 8 - mx
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d2,  r3                 @ d2 = my
        vdup.8          d3,  r12                @ d3 = 8 - my
        ldr             r12, [sp]               @ h

        @ prime d22 with the horizontally filtered row 0
        vld1.8          {q2},     [r2], r1
        vext.8          d5,  d4,  d5,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3
1:
        subs            r12, r12, #2
        @ horizontal filter, next two rows
        vld1.8          {q3},     [r2], r1
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vld1.8          {q2},     [r2], r1
        vext.8          d5,  d4,  d5,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d16, q8,  #3            @ d16 = filtered row n+1
        @ vertical blend prev (d22) with d16
        vmull.u8        q10, d22, d3
        vmlal.u8        q10, d16, d2
        vrshrn.u16      d22, q9,  #3            @ d22 = filtered row n+2
        vmull.u8        q12, d16, d3
        vmlal.u8        q12, d22, d2
        vrshrn.u16      d20, q10, #3
        vst1.8          {d20},    [r0,:64], r1
        vrshrn.u16      d23, q12, #3
        vst1.8          {d23},    [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

1827 | |

1828 |
@ 4-pixel-wide horizontal bilinear MC.  Two rows are packed into the
@ two 32-bit lanes of one d-register pair (vtrn.32) so a single
@ multiply covers both; h must be even.
@ In:  r0 = dst (4-byte aligned), r1 = stride, r2 = src
@      stack: [sp] = h, [sp,#4] = mx
@ NOTE(review): src advances by r1 (dst stride) — assumes equal strides.
function ff_put_vp8_bilin4_h_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3                 @ d0 = mx
        vdup.8          d1,  r12                @ d1 = 8 - mx
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {d2},     [r2], r1
        vext.8          d3,  d2,  d3,  #1       @ d3 = src+1
        vld1.8          {d6},     [r2], r1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         q1,  q3                 @ pack both rows into d2/d3 lanes
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vrshrn.u16      d4,  q2,  #3
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        bgt             1b

        bx              lr
endfunc

1850 | |

1851 |
@ 4-pixel-wide vertical bilinear MC.  d2 = {row n, row n+1} and
@ d3 = {row n+1, row n+2} so one multiply yields two output rows; the
@ vtrn.32 d3, d2 rotates the last loaded row into d2[0] for the next
@ iteration.  h must be even.
@ In:  r0 = dst (4-byte aligned), r1 = stride, r2 = src
@      stack: [sp] = h, [sp,#8] = my
@ NOTE(review): src advances by r1 (dst stride) — assumes equal strides.
function ff_put_vp8_bilin4_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d0,  r3                 @ d0 = my
        vdup.8          d1,  r12                @ d1 = 8 - my
        ldr             r12, [sp]               @ h
        vld1.32         {d2[]},   [r2], r1      @ prime d2 with row 0
1:
        vld1.32         {d3[]},   [r2]          @ row n+1 -> both lanes (no advance)
        vld1.32         {d2[1]},  [r2], r1      @ row n+1 -> d2 lane 1
        vld1.32         {d3[1]},  [r2], r1      @ row n+2 -> d3 lane 1
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vtrn.32         d3,  d2                 @ d2 = {row n+2, row n+1}
        vrshrn.u16      d4,  q2,  #3
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        subs            r12, r12, #2
        bgt             1b

        bx              lr
endfunc

1873 | |

1874 |
@ 4-pixel-wide 2-D bilinear MC: horizontal mx blend (two rows packed
@ into 32-bit lanes), then vertical my blend against the previous
@ filtered row kept in d22; two output rows per iteration, h even.
@ In:  r0 = dst (4-byte aligned), r1 = stride, r2 = src
@      stack: [sp] = h, [sp,#4] = mx, [sp,#8] = my
@ NOTE(review): src advances by r1 (dst stride) — assumes equal strides.
function ff_put_vp8_bilin4_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3                 @ d0 = mx
        vdup.8          d1,  r12                @ d1 = 8 - mx
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d2,  r3                 @ d2 = my
        vdup.8          d3,  r12                @ d3 = 8 - my
        ldr             r12, [sp]               @ h

        @ prime d22 with the horizontally filtered row 0
        vld1.8          {d4},     [r2], r1
        vext.8          d5,  d4,  d4,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {d6},     [r2], r1
        vext.8          d7,  d6,  d6,  #1
        vld1.8          {d4},     [r2], r1
        vext.8          d5,  d4,  d4,  #1
        vtrn.32         q3,  q2                 @ pack both rows into d6/d7 lanes
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d16, q8,  #3            @ d16 = {filtered n+1, n+2}
        vmull.u8        q10, d16, d2
        vtrn.32         d22, d16                @ align prev rows with current rows
        vmlal.u8        q10, d22, d3
        vrev64.32       d22, d16                @ carry last filtered row to lane 0
        vrshrn.u16      d20, q10, #3
        vst1.32         {d20[0]}, [r0,:32], r1
        vst1.32         {d20[1]}, [r0,:32], r1
        bgt             1b

        bx              lr
endfunc