## ffmpeg / libavcodec / arm / vp8dsp_neon.S @ ef15d71c

History | View | Annotate | Download (66.4 KB)

1 | ef15d71c | Mans Rullgard | /** |
---|---|---|---|

2 | * VP8 NEON optimisations |
||

3 | * |
||

4 | * Copyright (c) 2010 Rob Clark <rob@ti.com> |
||

5 | * Copyright (c) 2011 Mans Rullgard <mans@mansr.com> |
||

6 | * |
||

7 | * This file is part of FFmpeg. |
||

8 | * |
||

9 | * FFmpeg is free software; you can redistribute it and/or |
||

10 | * modify it under the terms of the GNU Lesser General Public |
||

11 | * License as published by the Free Software Foundation; either |
||

12 | * version 2.1 of the License, or (at your option) any later version. |
||

13 | * |
||

14 | * FFmpeg is distributed in the hope that it will be useful, |
||

15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||

16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||

17 | * Lesser General Public License for more details. |
||

18 | * |
||

19 | * You should have received a copy of the GNU Lesser General Public |
||

20 | * License along with FFmpeg; if not, write to the Free Software |
||

21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||

22 | */ |
||

23 | |||

24 | #include "asm.S" |
||

25 | |||

@ void ff_vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
@ Inverse 4x4 Walsh-Hadamard transform of the luma DC coefficients.
@ In:  r0 = block  (one s16 DC slot per 16-coefficient sub-block, 32 bytes apart)
@      r1 = dc     (4x4 s16 input; zeroed on return)
function ff_vp8_luma_dc_wht_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]     @ load all 16 dc coefficients
        vmov.i16        q15, #0

        @ first (horizontal) pass: butterfly on rows
        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vst1.16         {q15},    [r1,:128]!    @ clear dc[] as we go
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vst1.16         {q15},    [r1,:128]
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vmov.i16        q8,  #3                 @ rounding bias for final >>3

        @ transpose so the second pass works on columns
        vtrn.32         d0,  d2
        vtrn.32         d1,  d3
        vtrn.16         d0,  d1
        vtrn.16         d2,  d3

        @ d0 contributes with weight +1 to every output of the second pass,
        @ so adding 3 here biases all 16 results for round-to-nearest >>3
        vadd.i16        d0,  d0,  d16

        @ second (vertical) pass: same butterfly
        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vshr.s16        q0,  q0,  #3            @ (x + 3) >> 3
        vshr.s16        q1,  q1,  #3

        @ scatter the 16 results into the DC slot of each 16-coef sub-block
        mov             r3,  #32                @ 32 bytes between DC slots
        vst1.16         {d0[0]},  [r0,:16], r3
        vst1.16         {d1[0]},  [r0,:16], r3
        vst1.16         {d2[0]},  [r0,:16], r3
        vst1.16         {d3[0]},  [r0,:16], r3
        vst1.16         {d0[1]},  [r0,:16], r3
        vst1.16         {d1[1]},  [r0,:16], r3
        vst1.16         {d2[1]},  [r0,:16], r3
        vst1.16         {d3[1]},  [r0,:16], r3
        vst1.16         {d0[2]},  [r0,:16], r3
        vst1.16         {d1[2]},  [r0,:16], r3
        vst1.16         {d2[2]},  [r0,:16], r3
        vst1.16         {d3[2]},  [r0,:16], r3
        vst1.16         {d0[3]},  [r0,:16], r3
        vst1.16         {d1[3]},  [r0,:16], r3
        vst1.16         {d2[3]},  [r0,:16], r3
        vst1.16         {d3[3]},  [r0,:16], r3

        bx              lr
endfunc
||

78 | |||

@ DC-only variant of the luma DC WHT: when only dc[0] is non-zero, every
@ output equals (dc[0] + 3) >> 3, so just broadcast it to all 16 DC slots.
@ In:  r0 = block (s16 DC slots, 32 bytes apart), r1 = dc (dc[0] zeroed)
function ff_vp8_luma_dc_wht_dc_neon, export=1
        ldrsh           r2,  [r1]               @ r2 = dc[0] (sign-extended)
        mov             r3,  #0
        add             r2,  r2,  #3            @ rounding bias
        strh            r3,  [r1]               @ clear the consumed coefficient
        asr             r2,  r2,  #3            @ (dc + 3) >> 3
.rept 16
        strh            r2,  [r0], #32          @ one DC per 16-coef sub-block
.endr
        bx              lr
endfunc
||

90 | |||

@ 4x4 inverse DCT and add to destination.
@ In:  r0 = dst (4x4 u8 pixels), r1 = block (16 s16 coefs; zeroed on return),
@      r2 = stride
@ Constants 20091 and 35468 are the VP8 fixed-point multipliers; 35468 is
@ halved here because vqdmulh doubles the product.
function ff_vp8_idct_add_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]
        movw            r3,  #20091
        movt            r3,  #35468/2           @ d4[0]=20091, d4[1]=35468/2
        vdup.32         d4,  r3

        @ first (vertical) pass
        vmull.s16       q12, d1,  d4[0]
        vmull.s16       q13, d3,  d4[0]
        vqdmulh.s16     d20, d1,  d4[1]
        vqdmulh.s16     d23, d3,  d4[1]
        vshrn.s32       d21, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.s16        d21, d21, d1            @ row * 20091 >> 16 + row
        vadd.s16        d22, d22, d3

        vadd.s16        d16, d0,  d2            @ even butterfly
        vsub.s16        d17, d0,  d2
        vadd.s16        d18, d21, d23           @ odd butterfly
        vsub.s16        d19, d20, d22
        vadd.s16        q0,  q8,  q9
        vsub.s16        q1,  q8,  q9

        @ transpose for the second pass
        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        @ second (horizontal) pass, interleaved with clearing block[]
        vmov.i16        q15, #0
        vmull.s16       q12, d1,  d4[0]
        vst1.16         {q15},    [r1,:128]!
        vmull.s16       q13, d2,  d4[0]
        vst1.16         {q15},    [r1,:128]
        vqdmulh.s16     d21, d1,  d4[1]
        vqdmulh.s16     d23, d2,  d4[1]
        vshrn.s32       d20, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.i16        d20, d20, d1
        vadd.i16        d22, d22, d2

        @ butterflies, interleaved with loading the destination rows
        vadd.i16        d16, d0,  d3
        vsub.i16        d17, d0,  d3
        vadd.i16        d18, d20, d23
        vld1.32         {d20[]},  [r0,:32], r2
        vsub.i16        d19, d21, d22
        vld1.32         {d22[]},  [r0,:32], r2
        vadd.s16        q0,  q8,  q9
        vld1.32         {d23[]},  [r0,:32], r2
        vsub.s16        q1,  q8,  q9
        vld1.32         {d21[]},  [r0,:32], r2
        vrshr.s16       q0,  q0,  #3            @ round and scale down
        vtrn.32         q10, q11                @ pair up dst rows for vaddw
        vrshr.s16       q1,  q1,  #3

        sub             r0,  r0,  r2,  lsl #2   @ rewind dst 4 rows

        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        @ add residual to predicted pixels, saturate back to u8
        vaddw.u8        q0,  q0,  d20
        vaddw.u8        q1,  q1,  d21
        vqmovun.s16     d0,  q0
        vqmovun.s16     d1,  q1

        @ store order matches the row shuffling done by the transposes
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2

        bx              lr
endfunc
||

163 | |||

@ DC-only 4x4 inverse transform and add: every residual equals
@ round(dc >> 3), so broadcast it and add to the 4x4 destination.
@ In:  r0 = dst, r1 = block (block[0] consumed and zeroed), r2 = stride
function ff_vp8_idct_dc_add_neon, export=1
        mov             r3,  #0
        ldrsh           r12, [r1]               @ dc = block[0]
        strh            r3,  [r1]               @ clear consumed coefficient
        vdup.16         q1,  r12
        vrshr.s16       q1,  q1,  #3            @ rounded dc >> 3
        vld1.32         {d0[]},   [r0,:32], r2  @ load 4 dst rows
        vld1.32         {d1[]},   [r0,:32], r2
        vld1.32         {d0[1]},  [r0,:32], r2
        vld1.32         {d1[1]},  [r0,:32], r2
        vaddw.u8        q2,  q1,  d0            @ dst + dc (widened)
        vaddw.u8        q3,  q1,  d1
        sub             r0,  r0,  r2,  lsl #2   @ rewind dst 4 rows
        vqmovun.s16     d0,  q2                 @ saturate back to u8
        vqmovun.s16     d1,  q3
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        bx              lr
endfunc
||

185 | |||

@ Four DC-only IDCT+add for a 2x2 arrangement of 4x4 chroma blocks
@ (8x8 pixels total, processed as 8-wide rows).
@ In:  r0 = dst (8x8 u8), r1 = block (four s16 DCs, 32 bytes apart; zeroed),
@      r2 = stride
function ff_vp8_idct_dc_add4uv_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32                @ 32 bytes between DC slots
        vld1.16         {d16[]},  [r1,:16]      @ gather the 4 DCs, zeroing each
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        mov             r3,  r0                 @ r3 = store pointer, r0 = load pointer
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {d0},     [r0,:64], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {d1},     [r0,:64], r2
        @ rows 0-3 use q8 (top block pair), rows 4-7 use q9 (bottom pair);
        @ loads, adds and stores are interleaved for pipelining
        vaddw.u8        q10, q8,  d0
        vld1.8          {d2},     [r0,:64], r2
        vaddw.u8        q0,  q8,  d1
        vld1.8          {d3},     [r0,:64], r2
        vaddw.u8        q11, q8,  d2
        vld1.8          {d4},     [r0,:64], r2
        vaddw.u8        q1,  q8,  d3
        vld1.8          {d5},     [r0,:64], r2
        vaddw.u8        q12, q9,  d4
        vld1.8          {d6},     [r0,:64], r2
        vaddw.u8        q2,  q9,  d5
        vld1.8          {d7},     [r0,:64], r2
        vaddw.u8        q13, q9,  d6
        vqmovun.s16     d20, q10                @ saturate back to u8
        vaddw.u8        q3,  q9,  d7
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vst1.8          {d20},    [r3,:64], r2
        vqmovun.s16     d23, q1
        vst1.8          {d21},    [r3,:64], r2
        vqmovun.s16     d24, q12
        vst1.8          {d22},    [r3,:64], r2
        vqmovun.s16     d25, q2
        vst1.8          {d23},    [r3,:64], r2
        vqmovun.s16     d26, q13
        vst1.8          {d24},    [r3,:64], r2
        vqmovun.s16     d27, q3
        vst1.8          {d25},    [r3,:64], r2
        vst1.8          {d26},    [r3,:64], r2
        vst1.8          {d27},    [r3,:64], r2

        bx              lr
endfunc
||

235 | |||

@ Four DC-only IDCT+add for a horizontal row of four 4x4 luma blocks
@ (16x4 pixels, processed as 16-wide rows).
@ In:  r0 = dst (16x4 u8), r1 = block (four s16 DCs, 32 bytes apart; zeroed),
@      r2 = stride
function ff_vp8_idct_dc_add4y_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32                @ 32 bytes between DC slots
        vld1.16         {d16[]},  [r1,:16]      @ gather the 4 DCs, zeroing each
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {q0},     [r0,:128], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {q1},     [r0,:128], r2
        @ q8 covers blocks 0-1 (left 8 pixels), q9 blocks 2-3 (right 8)
        vaddw.u8        q10, q8,  d0
        vld1.8          {q2},     [r0,:128], r2
        vaddw.u8        q0,  q9,  d1
        vld1.8          {q3},     [r0,:128], r2
        vaddw.u8        q11, q8,  d2
        vaddw.u8        q1,  q9,  d3
        vaddw.u8        q12, q8,  d4
        vaddw.u8        q2,  q9,  d5
        vaddw.u8        q13, q8,  d6
        vaddw.u8        q3,  q9,  d7
        sub             r0,  r0,  r2,  lsl #2   @ rewind dst 4 rows
        vqmovun.s16     d20, q10                @ saturate back to u8
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vqmovun.s16     d23, q1
        vqmovun.s16     d24, q12
        vst1.8          {q10},    [r0,:128], r2
        vqmovun.s16     d25, q2
        vst1.8          {q11},    [r0,:128], r2
        vqmovun.s16     d26, q13
        vst1.8          {q12},    [r0,:128], r2
        vqmovun.s16     d27, q3
        vst1.8          {q13},    [r0,:128], r2

        bx              lr
endfunc
||

277 | |||

@ Core VP8 loop filter, shared by all the vertical/horizontal wrappers below.
@ Operates on 16 pixels at a time (one full q register per pixel row/column).
@
@ Register layout on entry:
@ P3..Q3     -> q0..q7   (P3 P2 P1 P0 | Q0 Q1 Q2 Q3 across the edge)
@ flim_E     -> q14      (edge limit, replicated bytes)
@ flim_I     -> q15      (interior limit; unused when simple=1)
@ hev_thresh -> r12      (high-edge-variance threshold; unused when simple=1)
@
@ \simple : 2-tap "simple" filter, filters P0/Q0 only
@ \inner  : inner-edge filter (4-tap, conditional P1/Q1 update)
@ neither : macroblock-edge filter (additionally updates P2/Q2)
.macro  vp8_loop_filter, inner=0, simple=0
.if \simple
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vmov.i8         q13, #0x80
        vcle.u8         q8,  q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
.else
        @ calculate hev and normal_limit:
        vabd.u8         q12, q2,  q3            @ abs(P1-P0)
        vabd.u8         q13, q5,  q4            @ abs(Q1-Q0)
        vabd.u8         q10, q0,  q1            @ abs(P3-P2)
        vabd.u8         q11, q1,  q2            @ abs(P2-P1)
        vcle.u8         q8,  q12, q15           @ abs(P1-P0) <= flim_I
        vcle.u8         q9,  q13, q15           @ abs(Q1-Q0) <= flim_I
        vcle.u8         q10, q10, q15           @ abs(P3-P2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(P2-P1) <= flim_I
        vand            q8,  q8,  q9
        vabd.u8         q9,  q7,  q6            @ abs(Q3-Q2)
        vand            q8,  q8,  q11
        vabd.u8         q11, q6,  q5            @ abs(Q2-Q1)
        vand            q8,  q8,  q10
        vcle.u8         q10, q9,  q15           @ abs(Q3-Q2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(Q2-Q1) <= flim_I
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vand            q8,  q8,  q10
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vand            q8,  q8,  q11
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vdup.8          q15, r12                @ hev_thresh
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vcgt.u8         q12, q12, q15           @ abs(P1-P0) > hev_thresh
        vcle.u8         q11, q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        vcgt.u8         q14, q13, q15           @ abs(Q1-Q0) > hev_thresh
        vand            q8,  q8,  q11
        vmov.i8         q13, #0x80
        vorr            q9,  q12, q14           @ hev = either side exceeds threshold
.endif

        @ at this point:
        @   q8: normal_limit (per-pixel filter mask)
        @   q9: hev          (high edge variance mask; not set for simple)

        @ convert to signed value:
        veor            q3,  q3,  q13           @ PS0 = P0 ^ 0x80
        veor            q4,  q4,  q13           @ QS0 = Q0 ^ 0x80

        vmov.i16        q12, #3
        vsubl.s8        q10, d8,  d6            @ QS0 - PS0
        vsubl.s8        q11, d9,  d7            @   (widened to 16bit)
        veor            q2,  q2,  q13           @ PS1 = P1 ^ 0x80
        veor            q5,  q5,  q13           @ QS1 = Q1 ^ 0x80
        vmul.i16        q10, q10, q12           @ w = 3 * (QS0 - PS0)
        vmul.i16        q11, q11, q12

        vqsub.s8        q12, q2,  q5            @ clamp(PS1-QS1)
        vmov.i8         q14, #4
        vmov.i8         q15, #3
.if \inner
        vand            q12, q12, q9            @ if(hev) w += clamp(PS1-QS1)
.endif
        vaddw.s8        q10, q10, d24           @ w += clamp(PS1-QS1)
        vaddw.s8        q11, q11, d25
        vqmovn.s16      d20, q10                @ narrow result back into q10
        vqmovn.s16      d21, q11
.if !\inner && !\simple
        veor            q1,  q1,  q13           @ PS2 = P2 ^ 0x80
        veor            q6,  q6,  q13           @ QS2 = Q2 ^ 0x80
.endif
        vand            q10, q10, q8            @ w &= normal_limit

        @ registers used at this point..
        @   q0 -> P3  (don't corrupt)
        @   q1-q6 -> PS2-QS2
        @   q7 -> Q3  (don't corrupt)
        @   q9 -> hev
        @   q10 -> w
        @   q13 -> #0x80
        @   q14 -> #4
        @   q15 -> #3
        @   q8, q11, q12 -> unused

        @ filter_common:   is4tap==1
        @   c1 = clamp(w + 4) >> 3;
        @   c2 = clamp(w + 3) >> 3;
        @   Q0 = s2u(QS0 - c1);
        @   P0 = s2u(PS0 + c2);
.if \simple
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
.elseif \inner
        @ the !is4tap case of filter_common, only used for inner blocks
        @   c3 = ((c1&~hev) + 1) >> 1;
        @   Q1 = s2u(QS1 - c3);
        @   P1 = s2u(PS1 + c3);
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        vbic            q11, q11, q9            @ c1 & ~hev
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        vrshr.s8        q11, q11, #1            @ c3 >>= 1
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-c3)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+c3)
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
.else
        vand            q12, q10, q9            @ w & hev
        vqadd.s8        q11, q12, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q12, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vbic            q10, q10, q9            @ w &= ~hev
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)

        @ filter_mbedge:
        @   a = clamp((27*w + 63) >> 7);
        @   Q0 = s2u(QS0 - a);
        @   P0 = s2u(PS0 + a);
        @   a = clamp((18*w + 63) >> 7);
        @   Q1 = s2u(QS1 - a);
        @   P1 = s2u(PS1 + a);
        @   a = clamp((9*w + 63) >> 7);
        @   Q2 = s2u(QS2 - a);
        @   P2 = s2u(PS2 + a);
        vmov.i16        q9,  #63
        vshll.s8        q14, d20, #3            @ 8*w
        vshll.s8        q15, d21, #3
        vaddw.s8        q14, q14, d20           @ 9*w (kept in q14/q15 for reuse)
        vaddw.s8        q15, q15, d21
        vadd.s16        q8,  q9,  q14
        vadd.s16        q9,  q9,  q15           @  9*w + 63
        vadd.s16        q11, q8,  q14
        vadd.s16        q12, q9,  q15           @ 18*w + 63
        vadd.s16        q14, q11, q14
        vadd.s16        q15, q12, q15           @ 27*w + 63
        vqshrn.s16      d16, q8,  #7
        vqshrn.s16      d17, q9,  #7            @ clamp(( 9*w + 63)>>7)
        vqshrn.s16      d22, q11, #7
        vqshrn.s16      d23, q12, #7            @ clamp((18*w + 63)>>7)
        vqshrn.s16      d28, q14, #7
        vqshrn.s16      d29, q15, #7            @ clamp((27*w + 63)>>7)
        vqadd.s8        q1,  q1,  q8            @ PS2 = clamp(PS2+a)
        vqsub.s8        q6,  q6,  q8            @ QS2 = clamp(QS2-a)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+a)
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-a)
        vqadd.s8        q3,  q3,  q14           @ PS0 = clamp(PS0+a)
        vqsub.s8        q4,  q4,  q14           @ QS0 = clamp(QS0-a)
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q1,  q1,  q13           @ P2 = PS2 ^ 0x80
        veor            q6,  q6,  q13           @ Q2 = QS2 ^ 0x80
.endif
.endm
||

456 | |||

@ In-place transpose of a 16x8 byte matrix held in q0-q7 (one q register
@ per original row of 16 bytes), done by successive 32/16/8-bit element
@ swaps. Used to convert between row-major pixel loads and the
@ column-per-register layout expected by vp8_loop_filter.
.macro  transpose8x16matrix
        vtrn.32         q0,  q4
        vtrn.32         q1,  q5
        vtrn.32         q2,  q6
        vtrn.32         q3,  q7

        vtrn.16         q0,  q2
        vtrn.16         q1,  q3
        vtrn.16         q4,  q6
        vtrn.16         q5,  q7

        vtrn.8          q0,  q1
        vtrn.8          q2,  q3
        vtrn.8          q4,  q5
        vtrn.8          q6,  q7
.endm
||

473 | |||

@ Vertical (filtering a horizontal edge) loop filter over a 16-pixel-wide
@ macroblock edge.
@ In:  r0 = dst (points at the edge row), r1 = stride, r2 = flim_E,
@      r3 = flim_I (not simple), [sp+64] = hev_thresh (after vpush of q4-q7)
.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        @ back up 4 rows (full filter) or 2 rows (simple) to reach P3/P1
        sub             r0,  r0,  r1,  lsl #1+!\simple

        @ Load pixels:
.if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
        vld1.8          {q0},     [r0,:128], r1 @ P3
        vld1.8          {q1},     [r0,:128], r1 @ P2
.endif
        vld1.8          {q2},     [r0,:128], r1 @ P1
        vld1.8          {q3},     [r0,:128], r1 @ P0
        vld1.8          {q4},     [r0,:128], r1 @ Q0
        vld1.8          {q5},     [r0,:128], r1 @ Q1
.if !\simple
        vld1.8          {q6},     [r0,:128], r1 @ Q2
        vld1.8          {q7},     [r0,:128]     @ Q3
        vdup.8          q15, r3                 @ flim_I
.endif
        vdup.8          q14, r2                 @ flim_E

        vp8_loop_filter inner=\inner, simple=\simple

        @ back up to P2:  dst -= stride * 6
        sub             r0,  r0,  r1,  lsl #2
.if !\simple
        sub             r0,  r0,  r1,  lsl #1

        @ Store pixels:
        vst1.8          {q1},     [r0,:128], r1 @ P2
.endif
        vst1.8          {q2},     [r0,:128], r1 @ P1
        vst1.8          {q3},     [r0,:128], r1 @ P0
        vst1.8          {q4},     [r0,:128], r1 @ Q0
        vst1.8          {q5},     [r0,:128], r1 @ Q1
.if !\simple
        vst1.8          {q6},     [r0,:128]     @ Q2
.endif

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

@ instantiate the macroblock-edge, inner-edge and simple variants
vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1
||

522 | |||

@ Vertical loop filter over both 8-pixel-wide chroma planes at once:
@ u rows go in the low d register of each q, v rows in the high one.
@ In:  r0 = u, r1 = v, r2 = stride, r3 = flim_E,
@      [sp+64] = flim_I, [sp+68] = hev_thresh (offsets after vpush of q4-q7)
.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  r2,  lsl #2   @ back up 4 rows to P3
        sub             r1,  r1,  r2,  lsl #2
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0},     [r0,:64], r2  @ P3
        vld1.8          {d1},     [r1,:64], r2  @ P3
        vld1.8          {d2},     [r0,:64], r2  @ P2
        vld1.8          {d3},     [r1,:64], r2  @ P2
        vld1.8          {d4},     [r0,:64], r2  @ P1
        vld1.8          {d5},     [r1,:64], r2  @ P1
        vld1.8          {d6},     [r0,:64], r2  @ P0
        vld1.8          {d7},     [r1,:64], r2  @ P0
        vld1.8          {d8},     [r0,:64], r2  @ Q0
        vld1.8          {d9},     [r1,:64], r2  @ Q0
        vld1.8          {d10},    [r0,:64], r2  @ Q1
        vld1.8          {d11},    [r1,:64], r2  @ Q1
        vld1.8          {d12},    [r0,:64], r2  @ Q2
        vld1.8          {d13},    [r1,:64], r2  @ Q2
        vld1.8          {d14},    [r0,:64]      @ Q3
        vld1.8          {d15},    [r1,:64]      @ Q3

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        @ back up to P2:  u,v -= stride * 6
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        sub             r0,  r0,  r2,  lsl #1
        sub             r1,  r1,  r2,  lsl #1

        @ Store pixels:
        vst1.8          {d2},     [r0,:64], r2  @ P2
        vst1.8          {d3},     [r1,:64], r2  @ P2
        vst1.8          {d4},     [r0,:64], r2  @ P1
        vst1.8          {d5},     [r1,:64], r2  @ P1
        vst1.8          {d6},     [r0,:64], r2  @ P0
        vst1.8          {d7},     [r1,:64], r2  @ P0
        vst1.8          {d8},     [r0,:64], r2  @ Q0
        vst1.8          {d9},     [r1,:64], r2  @ Q0
        vst1.8          {d10},    [r0,:64], r2  @ Q1
        vst1.8          {d11},    [r1,:64], r2  @ Q1
        vst1.8          {d12},    [r0,:64]      @ Q2
        vst1.8          {d13},    [r1,:64]      @ Q2

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

@ instantiate macroblock-edge and inner-edge variants (no simple for chroma)
vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1
||

581 | |||

@ Horizontal (filtering a vertical edge) loop filter over 16 rows.
@ Pixels are loaded as 16 8-byte rows straddling the edge, transposed into
@ the P3..Q3 column layout, filtered, transposed back and stored.
@ In:  r0 = dst (points at the edge column), r1 = stride, r2 = flim_E,
@      r3 = flim_I (not simple), [sp+64] = hev_thresh (after vpush of q4-q7)
.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4            @ back up 4 columns to P3
.if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
.endif

        @ Load pixels:
        vld1.8          {d0},     [r0], r1      @ load first 8-line src data
        vld1.8          {d2},     [r0], r1
        vld1.8          {d4},     [r0], r1
        vld1.8          {d6},     [r0], r1
        vld1.8          {d8},     [r0], r1
        vld1.8          {d10},    [r0], r1
        vld1.8          {d12},    [r0], r1
        vld1.8          {d14},    [r0], r1
        vld1.8          {d1},     [r0], r1      @ load second 8-line src data
        vld1.8          {d3},     [r0], r1
        vld1.8          {d5},     [r0], r1
        vld1.8          {d7},     [r0], r1
        vld1.8          {d9},     [r0], r1
        vld1.8          {d11},    [r0], r1
        vld1.8          {d13},    [r0], r1
        vld1.8          {d15},    [r0], r1

        transpose8x16matrix                     @ rows -> P3..Q3 columns

        vdup.8          q14, r2                 @ flim_E
.if !\simple
        vdup.8          q15, r3                 @ flim_I
.endif

        vp8_loop_filter inner=\inner, simple=\simple

        sub             r0,  r0,  r1,  lsl #4   @ backup 16 rows

        transpose8x16matrix                     @ columns -> rows again

        @ Store pixels:
        vst1.8          {d0},     [r0], r1
        vst1.8          {d2},     [r0], r1
        vst1.8          {d4},     [r0], r1
        vst1.8          {d6},     [r0], r1
        vst1.8          {d8},     [r0], r1
        vst1.8          {d10},    [r0], r1
        vst1.8          {d12},    [r0], r1
        vst1.8          {d14},    [r0], r1
        vst1.8          {d1},     [r0], r1
        vst1.8          {d3},     [r0], r1
        vst1.8          {d5},     [r0], r1
        vst1.8          {d7},     [r0], r1
        vst1.8          {d9},     [r0], r1
        vst1.8          {d11},    [r0], r1
        vst1.8          {d13},    [r0], r1
        vst1.8          {d15},    [r0]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

@ instantiate the macroblock-edge, inner-edge and simple variants
vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1
||

647 | |||

@ Horizontal loop filter over both chroma planes: 8 rows of u and 8 rows of
@ v are interleaved into the 16-lane registers, transposed, filtered,
@ transposed back and stored.
@ In:  r0 = u, r1 = v, r2 = stride, r3 = flim_E,
@      [sp+64] = flim_I, [sp+68] = hev_thresh (offsets after vpush of q4-q7)
.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4            @ back up 4 columns to P3
        sub             r1,  r1,  #4
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0},     [r0], r2      @ load u
        vld1.8          {d1},     [r1], r2      @ load v
        vld1.8          {d2},     [r0], r2
        vld1.8          {d3},     [r1], r2
        vld1.8          {d4},     [r0], r2
        vld1.8          {d5},     [r1], r2
        vld1.8          {d6},     [r0], r2
        vld1.8          {d7},     [r1], r2
        vld1.8          {d8},     [r0], r2
        vld1.8          {d9},     [r1], r2
        vld1.8          {d10},    [r0], r2
        vld1.8          {d11},    [r1], r2
        vld1.8          {d12},    [r0], r2
        vld1.8          {d13},    [r1], r2
        vld1.8          {d14},    [r0], r2
        vld1.8          {d15},    [r1], r2

        transpose8x16matrix                     @ rows -> P3..Q3 columns

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        sub             r0,  r0,  r2,  lsl #3   @ backup u 8 rows
        sub             r1,  r1,  r2,  lsl #3   @ backup v 8 rows

        transpose8x16matrix                     @ columns -> rows again

        @ Store pixels:
        vst1.8          {d0},     [r0], r2
        vst1.8          {d1},     [r1], r2
        vst1.8          {d2},     [r0], r2
        vst1.8          {d3},     [r1], r2
        vst1.8          {d4},     [r0], r2
        vst1.8          {d5},     [r1], r2
        vst1.8          {d6},     [r0], r2
        vst1.8          {d7},     [r1], r2
        vst1.8          {d8},     [r0], r2
        vst1.8          {d9},     [r1], r2
        vst1.8          {d10},    [r0], r2
        vst1.8          {d11},    [r1], r2
        vst1.8          {d12},    [r0], r2
        vst1.8          {d13},    [r1], r2
        vst1.8          {d14},    [r0]
        vst1.8          {d15},    [r1]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

@ instantiate macroblock-edge and inner-edge variants (no simple for chroma)
vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1
||

711 | |||

@ Copy a 16-wide block, 4 rows per iteration.
@ In:  r0 = dst (16-aligned), r1 = dst stride, r2 = src, r3 = src stride,
@      [sp] = h (rows; assumed a multiple of 4 by the loop structure)
function ff_put_vp8_pixels16_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {q0},     [r2],      r3
        vld1.8          {q1},     [r2],      r3
        vld1.8          {q2},     [r2],      r3
        vld1.8          {q3},     [r2],      r3
        vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q1},     [r0,:128], r1
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b
        bx              lr
endfunc
||

727 | |||

@ Copy an 8-wide block, 4 rows per iteration.
@ In:  r0 = dst (8-aligned), r1 = dst stride, r2 = src, r3 = src stride,
@      [sp] = h (rows; assumed a multiple of 4 by the loop structure)
function ff_put_vp8_pixels8_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {d0},     [r2],     r3
        vld1.8          {d1},     [r2],     r3
        vld1.8          {d2},     [r2],     r3
        vld1.8          {d3},     [r2],     r3
        vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d1},     [r0,:64], r1
        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        bgt             1b
        bx              lr
endfunc
||

743 | |||

@ Copy a 4-wide block, 4 rows per iteration, using plain ARM word
@ loads/stores (NEON gains nothing at this width).
@ In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, [sp] = h
@ NOTE(review): word accesses assume 4-byte-aligned rows or a core that
@ allows unaligned ldr/str — consistent with the other VP8 asm here.
function ff_put_vp8_pixels4_neon, export=1
        ldr             r12, [sp, #0]           @ h
        push            {r4-r6,lr}
1:
        subs            r12, r12, #4
        ldr             r4,  [r2], r3
        ldr             r5,  [r2], r3
        ldr             r6,  [r2], r3
        ldr             lr,  [r2], r3
        str             r4,  [r0], r1
        str             r5,  [r0], r1
        str             r6,  [r0], r1
        str             lr,  [r0], r1
        bgt             1b
        pop             {r4-r6,pc}
endfunc
||

760 | |||

761 | /* 4/6-tap 8th-pel MC */ |
||

762 | |||

@ 6-tap horizontal filter producing 8 output pixels.
@ \d    : output d register (8 x u8)
@ \a,\b : 16 consecutive source bytes in two d registers
@ q0 (d0/d1) holds the six s16 filter taps; taps 1 and 4 are subtracted
@ (negative coefficients), the rest accumulated, then >>7 with rounding.
@ Clobbers q8-q13, d27-d31.
.macro  vp8_epel8_h6    d,   a,   b
        vext.8          d27, \a,  \b,  #1       @ src+1 .. src+5 windows
        vmovl.u8        q8,  \a
        vext.8          d28, \a,  \b,  #2
        vmovl.u8        q9,  d27
        vext.8          d29, \a,  \b,  #3
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #4
        vmovl.u8        q11, d29
        vext.8          d31, \a,  \b,  #5
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]         @ tap 2 (center-left)
        vmovl.u8        q13, d31
        vmul.u16        q11, q11, d0[3]         @ tap 3 (center-right)
        vmls.u16        q10, q9,  d0[1]         @ - tap 1
        vmls.u16        q11, q12, d1[0]         @ - tap 4
        vmla.u16        q10, q8,  d0[0]         @ + tap 0
        vmla.u16        q11, q13, d1[1]         @ + tap 5
        vqadd.s16       q11, q10, q11           @ combine both halves, saturating
        vqrshrun.s16    \d,  q11, #7            @ round, >>7, clamp to u8
.endm
||

784 | |||

@ 6-tap horizontal filter producing 16 output pixels (two vp8_epel8_h6
@ interleaved for better scheduling).
@ \d0,\d1    : output d registers (low/high 8 pixels)
@ \s0,\s1,\s2: the three source d registers (24 source bytes)
@ \q0,\q1    : the same source viewed as q registers for vext windows
@ q0 (d0/d1) holds the six s16 filter taps. Clobbers q2, q3, q8-q15.
.macro  vp8_epel16_h6   d0,  d1,  s0,  s1,  s2, q0, q1
        vext.8          q14, \q0, \q1, #3       @ build the five shifted windows
        vext.8          q15, \q0, \q1, #4
        vmovl.u8        q11, d28
        vmovl.u8        q14, d29
        vext.8          q3,  \q0, \q1, #2
        vmovl.u8        q12, d30
        vmovl.u8        q15, d31
        vext.8          q8,  \q0, \q1, #1
        vmovl.u8        q10, d6
        vmovl.u8        q3,  d7
        vext.8          q2,  \q0, \q1, #5
        vmovl.u8        q13, d4
        vmovl.u8        q2,  d5
        vmovl.u8        q9,  d16
        vmovl.u8        q8,  d17
        vmul.u16        q11, q11, d0[3]         @ tap 3 terms
        vmul.u16        q10, q10, d0[2]         @ tap 2 terms
        vmul.u16        q3,  q3,  d0[2]
        vmul.u16        q14, q14, d0[3]
        vmls.u16        q11, q12, d1[0]         @ - tap 4
        vmovl.u8        q12, \s0
        vmovl.u8        q1,  \s1
        vmls.u16        q10, q9,  d0[1]         @ - tap 1
        vmls.u16        q3,  q8,  d0[1]
        vmls.u16        q14, q15, d1[0]
        vmla.u16        q10, q12, d0[0]         @ + tap 0
        vmla.u16        q11, q13, d1[1]         @ + tap 5
        vmla.u16        q3,  q1,  d0[0]
        vmla.u16        q14, q2,  d1[1]
        vqadd.s16       q11, q10, q11           @ combine halves, saturating
        vqadd.s16       q14, q3,  q14
        vqrshrun.s16    \d0, q11, #7            @ round, >>7, clamp to u8
        vqrshrun.s16    \d1, q14, #7
.endm
||

820 | |||

@ 6-tap vertical filter producing one 8-pixel output row.
@ \d0        : output d register
@ \s0..\s5   : six consecutive source rows (8 x u8 each)
@ q0 (d0/d1) holds the six s16 filter taps. Clobbers q8-q13.
.macro  vp8_epel8_v6    d0,  s0,  s1,  s2,  s3, s4, s5
        vmovl.u8        q10, \s2
        vmovl.u8        q11, \s3
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s0
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[2]         @ tap 2 (center rows)
        vmul.u16        q11, q11, d0[3]         @ tap 3
        vmls.u16        q10, q9,  d0[1]         @ - tap 1
        vmls.u16        q11, q12, d1[0]         @ - tap 4
        vmla.u16        q10, q8,  d0[0]         @ + tap 0
        vmla.u16        q11, q13, d1[1]         @ + tap 5
        vqadd.s16       q11, q10, q11           @ combine halves, saturating
        vqrshrun.s16    \d0, q11, #7            @ round, >>7, clamp to u8
.endm
||

837 | |||

@ 6-tap vertical filter producing TWO consecutive 8-pixel output rows
@ from seven source rows (rows \s0-\s5 for the first output, \s1-\s6 for
@ the second), sharing the widened source loads between both outputs.
@ \d0,\d1  : output d registers (row y, row y+1)
@ \s0..\s6 : seven consecutive source rows (8 x u8 each)
@ q0 (d0/d1) holds the six s16 filter taps. Clobbers q8-q15.
.macro  vp8_epel8_v6_y2 d0,  d1,  s0,  s1,  s2, s3, s4, s5, s6
        vmovl.u8        q10, \s0
        vmovl.u8        q11, \s3
        vmovl.u8        q14, \s6
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s2
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[0]         @ row y:   + tap 0 * s0
        vmul.u16        q15, q11, d0[3]         @ row y:     tap 3 * s3
        vmul.u16        q11, q11, d0[2]         @ row y+1:   tap 2 * s3
        vmul.u16        q14, q14, d1[1]         @ row y+1: + tap 5 * s6
        vmls.u16        q10, q9,  d0[1]         @ row y:   - tap 1 * s1
        vmls.u16        q15, q12, d1[0]         @ row y:   - tap 4 * s4
        vmls.u16        q11, q8,  d0[1]         @ row y+1: - tap 1 * s2
        vmls.u16        q14, q13, d1[0]         @ row y+1: - tap 4 * s5
        vmla.u16        q10, q8,  d0[2]         @ row y:     tap 2 * s2
        vmla.u16        q15, q13, d1[1]         @ row y:   + tap 5 * s5
        vmla.u16        q11, q9,  d0[0]         @ row y+1: + tap 0 * s1
        vmla.u16        q14, q12, d0[3]         @ row y+1:   tap 3 * s4
        vqadd.s16       q15, q10, q15           @ combine halves, saturating
        vqadd.s16       q14, q11, q14
        vqrshrun.s16    \d0, q15, #7            @ round, >>7, clamp to u8
        vqrshrun.s16    \d1, q14, #7
.endm
||

863 | |||

@ 4-tap horizontal filter producing 8 output pixels.
@ \d    : output d register
@ \a,\b : 16 consecutive source bytes in two d registers
@ d0 holds the four s16 filter taps (d0[1]/d1[0] subtracted).
@ Clobbers q9-q12, d28-d30.
.macro  vp8_epel8_h4    d,   a,   b
        vext.8          d28, \a,  \b,  #1       @ src+1 .. src+3 windows
        vmovl.u8        q9,  \a
        vext.8          d29, \a,  \b,  #2
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #3
        vmovl.u8        q11, d29
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]         @ tap 1 (center-left)
        vmul.u16        q11, q11, d0[3]         @ tap 2 (center-right)
        vmls.u16        q10, q9,  d0[1]         @ - outer tap 0
        vmls.u16        q11, q12, d1[0]         @ - outer tap 3
        vqadd.s16       q11, q10, q11           @ combine halves, saturating
        vqrshrun.s16    \d,  q11, #7            @ round, >>7, clamp to u8
.endm
||

879 | |||

@ 4-tap vertical filter producing TWO consecutive 8-pixel output rows
@ from five source rows (\s0-\s3 for the first, \s1-\s4 for the second),
@ sharing the widened source loads between both outputs.
@ \d0,\d1  : output d registers (row y, row y+1)
@ \s0..\s4 : five consecutive source rows (8 x u8 each)
@ d0 holds the four s16 filter taps. Clobbers q8-q15.
.macro  vp8_epel8_v4_y2 d0,  d1,  s0,  s1,  s2, s3, s4
        vmovl.u8        q9,  \s0
        vmovl.u8        q10, \s1
        vmovl.u8        q11, \s2
        vmovl.u8        q12, \s3
        vmovl.u8        q13, \s4
        vmul.u16        q8,  q10, d0[2]         @ row y:     tap 1 * s1
        vmul.u16        q14, q11, d0[3]         @ row y:     tap 2 * s2
        vmul.u16        q11, q11, d0[2]         @ row y+1:   tap 1 * s2
        vmul.u16        q15, q12, d0[3]         @ row y+1:   tap 2 * s3
        vmls.u16        q8,  q9,  d0[1]         @ row y:   - tap 0 * s0
        vmls.u16        q14, q12, d1[0]         @ row y:   - tap 3 * s3
        vmls.u16        q11, q10, d0[1]         @ row y+1: - tap 0 * s1
        vmls.u16        q15, q13, d1[0]         @ row y+1: - tap 3 * s4
        vqadd.s16       q8,  q8,  q14           @ combine halves, saturating
        vqadd.s16       q11, q11, q15
        vqrshrun.s16    \d0, q8,  #7            @ round, >>7, clamp to u8
        vqrshrun.s16    \d1, q11, #7
.endm
||

899 | |||

@ 16-wide 6-tap vertical-only subpel interpolation, two rows per iteration.
@ In:  r0 = dst (16-aligned), r1 = dst stride, r2 = src, r3 = src stride,
@      [sp+72] = h, [sp+80] = my (offsets after push {r4,lr} + vpush d8-d15)
@ The filter for phase `my` is loaded from subpel_filters (16 bytes per
@ entry; -16 because my is 1-based here).
function ff_put_vp8_epel16_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ src -= 2 rows (filter context)
        push            {r4,lr}
        vpush           {d8-d15}

        ldr             r4,  [sp, #80]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #72]          @ h
        add             r4,  lr,  r4,  lsl #4   @ &subpel_filters[my-1]
        vld1.16         {q0},     [r4,:128]     @ six filter taps -> d0/d1
1:
        @ load 7 source rows: rows 0-5 feed output row 0, rows 1-6 row 1
        vld1.8          {d2-d3},  [r2], r3
        vld1.8          {d4-d5},  [r2], r3
        vld1.8          {d6-d7},  [r2], r3
        vld1.8          {d8-d9},  [r2], r3
        vld1.8          {d10-d11},[r2], r3
        vld1.8          {d12-d13},[r2], r3
        vld1.8          {d14-d15},[r2]
        sub             r2,  r2,  r3,  lsl #2   @ rewind: advance net 2 rows

        @ filter left and right 8-pixel halves, two output rows each
        vp8_epel8_v6_y2 d2,  d4,  d2,  d4,  d6, d8,  d10, d12, d14
        vp8_epel8_v6_y2 d3,  d5,  d3,  d5,  d7, d9,  d11, d13, d15

        vst1.8          {d2-d3},  [r0,:128], r1
        vst1.8          {d4-d5},  [r0,:128], r1
        subs            r12, r12, #2
        bne             1b

        vpop            {d8-d15}
        pop             {r4,pc}
endfunc
||

931 | |||

@ 16-wide 6-tap horizontal-only subpel interpolation, one row per iteration.
@ In:  r0 = dst (16-aligned), r1 = dst stride, r2 = src, r3 = src stride,
@      [sp+8] = h, [sp+12] = mx (offsets after push {r4,lr})
function ff_put_vp8_epel16_h6_neon, export=1
        sub             r2,  r2,  #2            @ src -= 2 columns (filter context)
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4   @ &subpel_filters[mx-1]
        vld1.16         {q0},     [r4,:128]     @ six filter taps -> d0/d1
1:
        vld1.8          {d2-d4},  [r2], r3      @ 24 source bytes (16 + context)

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4, q1, q2

        vst1.8          {d2-d3},  [r0,:128], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
||

952 | |||

@ 16-wide 6-tap horizontal + 6-tap vertical subpel MC.
@ Filters horizontally into a 16-byte-aligned temp buffer on the stack
@ (h+5 rows of 16), then filters that buffer vertically into dst.
function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ src -= 2 rows + 2 pixels
        sub             r2,  r2,  #2
        push            {r4,lr}
        vpush           {d8-d9}

        @ first pass (horizontal):
        ldr             r4,  [sp, #28]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #24]          @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #336+16       @ temp buffer + alignment slack
        vld1.16         {q0}, [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ vertical pass needs h+5 input rows
        bic             lr,  lr,  #15           @ 16-byte align the buffer pointer
1:
        vld1.8          {d2,d3,d4}, [r2], r3

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2

        vst1.8          {d2-d3},  [lr,:128]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #336+16+32]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #336+16+24]   @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15           @ rewind to start of temp buffer
        vld1.16         {q0}, [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d9},  [lr,:128]!
        vld1.8          {d28-d31},[lr,:128]
        sub             lr,  lr,  #48           @ advance net one row (16 bytes)

        vp8_epel8_v6    d2,  d2,  d4,  d6,  d8,  d28, d30
        vp8_epel8_v6    d3,  d3,  d5,  d7,  d9,  d29, d31

        vst1.8          {d2-d3},  [r0,:128], r1
        subs            r12, r12, #1
        bne             2b

        add             sp,  sp,  #336+16
        vpop            {d8-d9}
        pop             {r4,pc}
endfunc
||

1003 | |||

@ 8-wide, 6-tap vertical subpel MC; two output rows per iteration.
function ff_put_vp8_epel8_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ back src up two rows
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2},  [r2], r3         @ seven consecutive source rows
        vld1.8          {d3},  [r2], r3
        vld1.8          {d4},  [r2], r3
        vld1.8          {d5},  [r2], r3
        vld1.8          {d6},  [r2], r3
        vld1.8          {d7},  [r2], r3
        vld1.8          {d28}, [r2]

        sub             r2,  r2,  r3,  lsl #2   @ rewind: next iteration advances 2 rows

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc
||

1033 | |||

@ 8-wide, 6-tap horizontal subpel MC.
function ff_put_vp8_epel8_h6_neon, export=1
        sub             r2,  r2,  #2            @ back src up two pixels
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
||

1054 | |||

@ 8-wide 6-tap horizontal + 6-tap vertical subpel MC via an aligned
@ stack temp buffer (h+5 rows of 8 bytes).
function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ src -= 2 rows + 2 pixels
        sub             r2,  r2,  #2
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16       @ temp buffer + alignment slack
        vld1.16         {q0}, [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ vertical pass needs h+5 rows
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0}, [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d7},  [lr,:128]!
        vld1.8          {d30},    [lr,:64]
        sub             lr,  lr,  #32           @ advance net two rows (16 bytes)

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc
||

1103 | |||

@ 8-wide, 4-tap vertical subpel MC; two output rows per iteration.
function ff_put_vp8_epel8_v4_neon, export=1
        sub             r2,  r2,  r3            @ back src up one row
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2}, [r2], r3          @ five consecutive source rows
        vld1.8          {d3}, [r2], r3
        vld1.8          {d4}, [r2], r3
        vld1.8          {d5}, [r2], r3
        vld1.8          {d6}, [r2]
        sub             r2,  r2,  r3,  lsl #1   @ rewind: next iteration advances 2 rows

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc
||

1130 | |||

@ 8-wide, 4-tap horizontal subpel MC.
function ff_put_vp8_epel8_h4_neon, export=1
        sub             r2,  r2,  #1            @ back src up one pixel
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
||

1151 | |||

@ 8-wide 4-tap horizontal + 4-tap vertical subpel MC via an aligned
@ stack temp buffer (h+3 rows of 8 bytes).
function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             r2,  r2,  r3            @ src -= 1 row + 1 pixel
        sub             r2,  r2,  #1
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0}, [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3            @ vertical pass needs h+3 rows
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0}, [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5}, [lr,:128]!
        vld1.8          {d6},    [lr,:64]
        sub             lr,  lr,  #16           @ advance net two rows

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc
||

1199 | |||

@ 8-wide 6-tap horizontal + 4-tap vertical subpel MC via an aligned
@ stack temp buffer (h+3 rows of 8 bytes).
function ff_put_vp8_epel8_h6v4_neon, export=1
        sub             r2,  r2,  r3            @ src -= 1 row + 2 pixels
        sub             r2,  r2,  #2
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0}, [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3            @ vertical pass needs h+3 rows
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0}, [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5}, [lr,:128]!
        vld1.8          {d6},    [lr,:64]
        sub             lr,  lr,  #16           @ advance net two rows

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc
||

1247 | |||

@ 8-wide 4-tap horizontal + 6-tap vertical subpel MC via an aligned
@ stack temp buffer (h+5 rows of 8 bytes).
function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ src -= 2 rows + 1 pixel
        sub             r2,  r2,  #1
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0}, [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ vertical pass needs h+5 rows
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0}, [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5}, [lr,:128]!
        vld1.8          {d6-d7}, [lr,:128]!
        vld1.8          {d30},   [lr,:64]
        sub             lr,  lr,  #32           @ advance net two rows

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc
||

1296 | |||

@ 4-wide, 6-tap vertical subpel MC.  Processes four rows at once by
@ packing two 4-pixel rows into each d register lane pair.
function ff_put_vp8_epel4_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ back src up two rows
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.32         {d2[]},   [r2], r3      @ rows 0..6 into lane 0
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2], r3
        vld1.32         {d7[]},   [r2], r3
        vld1.32         {d28[]},  [r2]
        sub             r2,  r2,  r3,  lsl #2
        vld1.32         {d2[1]},  [r2], r3      @ rows 2..8 into lane 1
        vld1.32         {d3[1]},  [r2], r3
        vld1.32         {d4[1]},  [r2], r3
        vld1.32         {d5[1]},  [r2], r3
        vld1.32         {d6[1]},  [r2], r3
        vld1.32         {d7[1]},  [r2], r3
        vld1.32         {d28[1]}, [r2]
        sub             r2,  r2,  r3,  lsl #2   @ rewind for next group of 4 rows

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc
||

1335 | |||

@ 4-wide, 6-tap horizontal subpel MC.
function ff_put_vp8_epel4_h6_neon, export=1
        sub             r2,  r2,  #2            @ back src up two pixels
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {q1}, [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]}, [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
||

1354 | |||

@ 4-wide 6-tap horizontal + 6-tap vertical subpel MC via an aligned
@ stack temp buffer (h+5 rows of 4 bytes); vertical pass does 4 rows/loop.
function ff_put_vp8_epel4_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ src -= 2 rows + 2 pixels
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #52+16
        vld1.16         {q0}, [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ vertical pass needs h+5 rows
        bic             lr,  lr,  #15
1:
        vld1.8          {q1}, [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]}, [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0}, [r4,:128]
        bic             lr,  lr,  #15
2:
        @ load rows 0..6 and, offset by 16 bytes, rows 4..10; the vtrn pairs
        @ interleave them so two row pairs are filtered per macro invocation
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]!
        vld1.32         {d28[]},  [lr,:32]
        sub             lr,  lr,  #16
        vld1.8          {d4-d5},  [lr]!
        vld1.8          {d7},     [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]
        sub             lr,  lr,  #16
        vtrn.32         q1,  q2
        vtrn.32         d6,  d7
        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #52+16
        pop             {r4,pc}
endfunc
||

1405 | |||

@ 4-wide 4-tap horizontal + 6-tap vertical subpel MC via an aligned
@ stack temp buffer (h+5 rows of 4 bytes); vertical pass does 4 rows/loop.
function ff_put_vp8_epel4_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ src -= 2 rows + 1 pixel
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #52+16
        vld1.16         {q0}, [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ vertical pass needs h+5 rows
        bic             lr,  lr,  #15
1:
        vld1.8          {d2}, [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]}, [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0}, [r4,:128]
        bic             lr,  lr,  #15
2:
        @ load rows 0..6 and, offset by 16 bytes, rows 4..10; vtrn interleaves
        @ them so two row pairs are filtered per macro invocation
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]!
        vld1.32         {d28[]},  [lr,:32]
        sub             lr,  lr,  #16
        vld1.8          {d4-d5},  [lr]!
        vld1.8          {d7},     [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]
        sub             lr,  lr,  #16
        vtrn.32         q1,  q2
        vtrn.32         d6,  d7
        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #52+16
        pop             {r4,pc}
endfunc
||

1456 | |||

@ 4-wide 6-tap horizontal + 4-tap vertical subpel MC via an aligned
@ stack temp buffer (h+3 rows of 4 bytes); vertical pass does 4 rows/loop.
function ff_put_vp8_epel4_h6v4_neon, export=1
        sub             r2,  r2,  r3            @ src -= 1 row + 2 pixels
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #44+16
        vld1.16         {q0}, [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3            @ vertical pass needs h+3 rows
        bic             lr,  lr,  #15
1:
        vld1.8          {q1}, [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]}, [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0}, [r4,:128]
        bic             lr,  lr,  #15
2:
        @ load rows 0..4 and, offset by 8 bytes, rows 2..6; vtrn interleaves
        @ them so two row pairs are filtered per macro invocation
        vld1.8          {d2-d3}, [lr,:128]!
        vld1.32         {d6[]},  [lr,:32]
        sub             lr,  lr,  #8
        vld1.8          {d4-d5}, [lr]!
        vld1.32         {d6[1]}, [lr,:32]
        sub             lr,  lr,  #8
        vtrn.32         q1,  q2
        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16
        pop             {r4,pc}
endfunc
||

1504 | |||

@ 4-wide, 4-tap horizontal subpel MC.
function ff_put_vp8_epel4_h4_neon, export=1
        sub             r2,  r2,  #1            @ back src up one pixel
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2}, [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]}, [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
||

1523 | |||

@ 4-wide, 4-tap vertical subpel MC; four output rows per iteration,
@ with two 4-pixel rows packed per d register.
function ff_put_vp8_epel4_v4_neon, export=1
        sub             r2,  r2,  r3            @ back src up one row
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.32         {d2[]},  [r2], r3       @ rows 0..4 into lane 0
        vld1.32         {d3[]},  [r2], r3
        vld1.32         {d4[]},  [r2], r3
        vld1.32         {d5[]},  [r2], r3
        vld1.32         {d6[]},  [r2]
        sub             r2,  r2,  r3,  lsl #1
        vld1.32         {d2[1]}, [r2], r3       @ rows 2..6 into lane 1
        vld1.32         {d3[1]}, [r2], r3
        vld1.32         {d4[1]}, [r2], r3
        vld1.32         {d5[1]}, [r2], r3
        vld1.32         {d6[1]}, [r2]
        sub             r2,  r2,  r3,  lsl #1   @ rewind for next group of 4 rows

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc
||

1558 | |||

@ 4-wide 4-tap horizontal + 4-tap vertical subpel MC via an aligned
@ stack temp buffer (h+3 rows of 4 bytes); vertical pass does 4 rows/loop.
function ff_put_vp8_epel4_h4v4_neon, export=1
        sub             r2,  r2,  r3            @ src -= 1 row + 1 pixel
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #44+16
        vld1.16         {q0}, [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3            @ vertical pass needs h+3 rows
        bic             lr,  lr,  #15
1:
        vld1.8          {d2}, [r2], r3
        vp8_epel8_h4    d2,  d2,  d3
        vst1.32         {d2[0]}, [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0}, [r4,:128]
        bic             lr,  lr,  #15
2:
        @ load rows 0..4 and, offset by 8 bytes, rows 2..6; vtrn interleaves
        @ them so two row pairs are filtered per macro invocation
        vld1.8          {d2-d3}, [lr,:128]!
        vld1.32         {d6[]},  [lr,:32]
        sub             lr,  lr,  #8
        vld1.8          {d4-d5}, [lr]!
        vld1.32         {d6[1]}, [lr,:32]
        sub             lr,  lr,  #8
        vtrn.32         q1,  q2
        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16
        pop             {r4,pc}
endfunc
||

1606 | |||

@ Note: the worst-case sum of all 6-tap filter values * 255 is 0x7f80, so
@ 16-bit arithmetic can be used to apply the filters.
const   subpel_filters, align=4
        .short     0,   6, 123,  12,   1,   0,   0,   0
        .short     2,  11, 108,  36,   8,   1,   0,   0
        .short     0,   9,  93,  50,   6,   0,   0,   0
        .short     3,  16,  77,  77,  16,   3,   0,   0
        .short     0,   6,  50,  93,   9,   0,   0,   0
        .short     1,   8,  36, 108,  11,   2,   0,   0
        .short     0,   1,  12, 123,   6,   0,   0,   0
endconst
||

1618 | |||

1619 | /* Bilinear MC */ |
||

1620 | |||

@ 16-wide horizontal bilinear MC; two rows per iteration.
@ r0 = dst, r2 = src; r1 is used as both src and dst stride.
@ Stack args (per the annotations): h, mx.
function ff_put_vp8_bilin16_h_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8            @ weights: mx and 8-mx
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {d2-d4},  [r2], r1
        vext.8          q2,  q1,  q2,  #1       @ pixels shifted left by one
        vmull.u8        q8,  d2,  d1
        vmlal.u8        q8,  d4,  d0
        vld1.8          {d18-d20},[r2], r1
        vmull.u8        q3,  d3,  d1
        vmlal.u8        q3,  d5,  d0
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vrshrn.u16      d4,  q8,  #3            @ (a*(8-mx)+b*mx+4)>>3
        vrshrn.u16      d5,  q3,  #3
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        vst1.8          {q2}, [r0,:128], r1
        vst1.8          {q3}, [r0,:128], r1
        bgt             1b

        bx              lr
endfunc
||

1651 | |||

@ 16-wide vertical bilinear MC; two rows per iteration, carrying the
@ previous row in q1 across iterations.
function ff_put_vp8_bilin16_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8            @ weights: my and 8-my
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.8          {q1}, [r2], r1          @ prime the pipeline with row 0
1:
        subs            r12, r12, #2
        vld1.8          {q2}, [r2], r1
        vmull.u8        q3,  d2,  d1
        vmlal.u8        q3,  d4,  d0
        vmull.u8        q8,  d3,  d1
        vmlal.u8        q8,  d5,  d0
        vld1.8          {q1}, [r2], r1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d2,  d0
        vmull.u8        q10, d5,  d1
        vmlal.u8        q10, d3,  d0
        vrshrn.u16      d4,  q3,  #3
        vrshrn.u16      d5,  q8,  #3
        vrshrn.u16      d6,  q9,  #3
        vrshrn.u16      d7,  q10, #3
        vst1.8          {q2}, [r0,:128], r1
        vst1.8          {q3}, [r0,:128], r1
        bgt             1b

        bx              lr
endfunc
||

1681 | |||

@ 16-wide horizontal+vertical bilinear MC.  The horizontally filtered
@ previous row is carried in q2 across iterations; two output rows per loop.
function ff_put_vp8_bilin16_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3                 @ horizontal weights
        vdup.8          d1,  r12
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d2,  r3                 @ vertical weights
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        @ horizontally filter row 0 to prime the vertical pipeline
        vld1.8          {d4-d6},  [r2], r1
        vext.8          q3,  q2,  q3,  #1
        vmull.u8        q8,  d4,  d1
        vmlal.u8        q8,  d6,  d0
        vmull.u8        q9,  d5,  d1
        vmlal.u8        q9,  d7,  d0
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {d18-d20},[r2], r1
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vld1.8          {d26-d28},[r2], r1
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vext.8          q14, q13, q14, #1
        vmull.u8        q8,  d26, d1
        vmlal.u8        q8,  d28, d0
        vmull.u8        q9,  d27, d1
        vmlal.u8        q9,  d29, d0
        vrshrn.u16      d6,  q11, #3            @ horizontal result, row n
        vrshrn.u16      d7,  q12, #3
        vmull.u8        q12, d4,  d3            @ vertical blend: prev with row n
        vmlal.u8        q12, d6,  d2
        vmull.u8        q15, d5,  d3
        vmlal.u8        q15, d7,  d2
        vrshrn.u16      d4,  q8,  #3            @ horizontal result, row n+1
        vrshrn.u16      d5,  q9,  #3
        vmull.u8        q10, d6,  d3            @ vertical blend: row n with n+1
        vmlal.u8        q10, d4,  d2
        vmull.u8        q11, d7,  d3
        vmlal.u8        q11, d5,  d2
        vrshrn.u16      d24, q12, #3
        vrshrn.u16      d25, q15, #3
        vst1.8          {q12}, [r0,:128], r1
        vrshrn.u16      d20, q10, #3
        vrshrn.u16      d21, q11, #3
        vst1.8          {q10}, [r0,:128], r1
        bgt             1b

        bx              lr
endfunc
||

1737 | |||

@ 8-wide horizontal bilinear MC; two rows per iteration.
function ff_put_vp8_bilin8_h_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {q1}, [r2], r1
        vext.8          d3,  d2,  d3,  #1       @ pixels shifted left by one
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vld1.8          {q3}, [r2], r1
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d4,  q2,  #3
        vrshrn.u16      d16, q8,  #3
        vst1.8          {d4},  [r0,:64], r1
        vst1.8          {d16}, [r0,:64], r1
        bgt             1b

        bx              lr
endfunc
||

1762 | |||

@ 8-wide vertical bilinear MC; two rows per iteration, carrying the
@ previous row in d2 across iterations.
function ff_put_vp8_bilin8_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.8          {d2}, [r2], r1          @ prime with row 0
1:
        subs            r12, r12, #2
        vld1.8          {d3}, [r2], r1
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vld1.8          {d2}, [r2], r1
        vmull.u8        q3,  d3,  d1
        vmlal.u8        q3,  d2,  d0
        vrshrn.u16      d4,  q2,  #3
        vrshrn.u16      d6,  q3,  #3
        vst1.8          {d4}, [r0,:64], r1
        vst1.8          {d6}, [r0,:64], r1
        bgt             1b

        bx              lr
endfunc
||

1786 | |||

@ 8-wide horizontal+vertical bilinear MC.  The horizontally filtered
@ previous row is carried in d22 across iterations; two output rows per loop.
function ff_put_vp8_bilin8_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3                 @ horizontal weights
        vdup.8          d1,  r12
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d2,  r3                 @ vertical weights
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        @ horizontally filter row 0 to prime the vertical pipeline
        vld1.8          {q2}, [r2], r1
        vext.8          d5,  d4,  d5,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {q3}, [r2], r1
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vld1.8          {q2}, [r2], r1
        vext.8          d5,  d4,  d5,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d16, q8,  #3            @ horizontal result, row n
        vmull.u8        q10, d22, d3            @ vertical blend: prev with row n
        vmlal.u8        q10, d16, d2
        vrshrn.u16      d22, q9,  #3            @ horizontal result, row n+1
        vmull.u8        q12, d16, d3            @ vertical blend: row n with n+1
        vmlal.u8        q12, d22, d2
        vrshrn.u16      d20, q10, #3
        vst1.8          {d20}, [r0,:64], r1
        vrshrn.u16      d23, q12, #3
        vst1.8          {d23}, [r0,:64], r1
        bgt             1b

        bx              lr
endfunc
||

1827 | |||

@ 4-wide horizontal bilinear MC; two rows per iteration, packed into one
@ d-register pair by the vtrn.
function ff_put_vp8_bilin4_h_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {d2}, [r2], r1
        vext.8          d3,  d2,  d3,  #1
        vld1.8          {d6}, [r2], r1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         q1,  q3                 @ pack both rows into d2/d3
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vrshrn.u16      d4,  q2,  #3
        vst1.32         {d4[0]}, [r0,:32], r1
        vst1.32         {d4[1]}, [r0,:32], r1
        bgt             1b

        bx              lr
endfunc
||

1850 | |||

@ 4-wide vertical bilinear MC; two rows per iteration.  d2 = {row n, row n+1},
@ d3 = {row n+1, row n+2}; the vtrn carries the last row into the next pass.
function ff_put_vp8_bilin4_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.32         {d2[]},  [r2], r1       @ prime with row 0
1:
        vld1.32         {d3[]},  [r2]
        vld1.32         {d2[1]}, [r2], r1
        vld1.32         {d3[1]}, [r2], r1
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vtrn.32         d3,  d2                 @ carry last loaded row into d2
        vrshrn.u16      d4,  q2,  #3
        vst1.32         {d4[0]}, [r0,:32], r1
        vst1.32         {d4[1]}, [r0,:32], r1
        subs            r12, r12, #2
        bgt             1b

        bx              lr
endfunc
||

1873 | |||

@ 4-wide horizontal+vertical bilinear MC; two output rows per iteration.
@ The horizontally filtered previous row is carried in d22.
function ff_put_vp8_bilin4_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3                 @ horizontal weights
        vdup.8          d1,  r12
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d2,  r3                 @ vertical weights
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        @ horizontally filter row 0 to prime the vertical pipeline
        vld1.8          {d4}, [r2], r1
        vext.8          d5,  d4,  d4,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {d6}, [r2], r1
        vext.8          d7,  d6,  d6,  #1
        vld1.8          {d4}, [r2], r1
        vext.8          d5,  d4,  d4,  #1
        vtrn.32         q3,  q2                 @ pack both rows into d6/d7
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d16, q8,  #3            @ horizontal result, rows n,n+1
        vmull.u8        q10, d16, d2
        vtrn.32         d22, d16                @ pair each row with its predecessor
        vmlal.u8        q10, d22, d3
        vrev64.32       d22, d16                @ carry last row for next iteration
        vrshrn.u16      d20, q10, #3
        vst1.32         {d20[0]}, [r0,:32], r1
        vst1.32         {d20[1]}, [r0,:32], r1
        bgt             1b

        bx              lr
endfunc