## ffmpeg / libavcodec / arm / jrevdct_arm.S @ 0115b3ea

History | View | Annotate | Download (13.6 KB)

1 |
/* |
---|---|

2 |
C-like prototype : |

3 |
void j_rev_dct_arm(DCTBLOCK data) |

4 | |

5 |
With DCTBLOCK being a pointer to an array of 64 'signed shorts' |

6 | |

7 |
Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org) |

8 | |

9 |
Permission is hereby granted, free of charge, to any person obtaining a copy |

10 |
of this software and associated documentation files (the "Software"), to deal |

11 |
in the Software without restriction, including without limitation the rights |

12 |
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |

13 |
copies of the Software, and to permit persons to whom the Software is |

14 |
furnished to do so, subject to the following conditions: |

15 | |

16 |
The above copyright notice and this permission notice shall be included in |

17 |
all copies or substantial portions of the Software. |

18 | |

19 |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |

20 |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |

21 |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |

22 |
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER |

23 |
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |

24 |
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |

25 | |

26 |
*/ |

27 | |

28 |
#include "asm.S" |

29 | |

30 |
/*
 * Fixed-point multipliers.  Each FIX_a_bcd value is the real number a.bcd
 * scaled by 2^13 and rounded (e.g. 0.298631336 * 8192 = 2446); the FIX_M_
 * variants are the negated constants.
 */
#define FIX_0_298631336         2446
#define FIX_0_541196100         4433
#define FIX_0_765366865         6270
#define FIX_1_175875602         9633
#define FIX_1_501321110         12299
#define FIX_2_053119869         16819
#define FIX_3_072711026         25172
#define FIX_M_0_390180644       -3196
#define FIX_M_0_899976223       -7373
#define FIX_M_1_847759065       -15137
#define FIX_M_1_961570560       -16069
#define FIX_M_2_562915447       -20995
/* 16-bit mask used when packing two halfwords into one word. */
#define FIX_0xFFFF              0xFFFF

/*
 * Byte offsets of the constants above inside const_array (one 32-bit word
 * each).  The order here must match the order of the .word entries there.
 */
#define FIX_0_298631336_ID      0
#define FIX_0_541196100_ID      4
#define FIX_0_765366865_ID      8
#define FIX_1_175875602_ID      12
#define FIX_1_501321110_ID      16
#define FIX_2_053119869_ID      20
#define FIX_3_072711026_ID      24
#define FIX_M_0_390180644_ID    28
#define FIX_M_0_899976223_ID    32
#define FIX_M_1_847759065_ID    36
#define FIX_M_1_961570560_ID    40
#define FIX_M_2_562915447_ID    44
#define FIX_0xFFFF_ID           48

57 |
/*
 * Scaling parameters used by the two passes.  The multipliers are scaled by
 * 2^CONST_BITS; the row pass leaves PASS1_BITS extra fraction bits in its
 * output, which the column pass removes together with a final factor of 8.
 */
#define CONST_BITS              13
#define PASS1_BITS              2
#define ROW_DESCALE_BITS        (CONST_BITS - PASS1_BITS)
#define COL_DESCALE_BITS        (CONST_BITS + PASS1_BITS + 3)

        .text
        .align

@-----------------------------------------------------------------------
@ void ff_j_rev_dct_arm(DCTBLOCK data)
@
@ In-place inverse DCT of an 8x8 block of signed 16-bit coefficients.
@
@ In:    r0 = pointer to the 64 coefficients (rows are 16 bytes apart)
@ Out:   the block is overwritten with the result
@ Saves: r4-r12 and lr are pushed on entry and restored on return
@ Stack: 44 bytes of fixed frame, plus 16 bytes of temporary spill
@
@ Register roles inside the loops:
@   lr  = pointer to the current row / column
@   r12 = rows / columns left to process
@   r11 = base address of const_array
@   r0-r10 = scratch
@
@ Within a row the loads below take the coefficients in the order
@ d0 d2 d4 d6 d1 d3 d5 d7 (byte offsets 0, 2, ... 14).
@-----------------------------------------------------------------------
function ff_j_rev_dct_arm, export=1
        stmdb   sp!, { r4 - r12, lr }           @ save r4-r12 and the return address

        sub     sp, sp, #4                      @ one word of local storage
        str     r0, [sp]                        @ keep the block pointer for the column pass

        mov     lr, r0                          @ lr = pointer to the current row
        mov     r12, #8                         @ r12 = row counter
        adr     r11, const_array                @ r11 = base pointer to the constants array

row_loop:
        ldrsh   r0, [lr, #0]                    @ r0 = d0
        ldrsh   r2, [lr, #2]                    @ r2 = d2

        @ Shortcuts for sparse rows: the row is also read as four 32-bit
        @ words so that it can be tested for zero with a few ORs
        @ (this works as the DCTELEMS are always 4-byte aligned).
        ldr     r5, [lr, #0]
        ldr     r6, [lr, #4]
        ldr     r3, [lr, #8]
        ldr     r4, [lr, #12]
        orr     r3, r3, r4
        orr     r3, r3, r6                      @ r3 = OR of words 1..3
        orrs    r5, r3, r5
        beq     end_of_row_loop                 @ nothing to do: the whole row is 0
        orrs    r3, r3, r2
        beq     empty_row                       @ everything except d0 is 0

        ldrsh   r1, [lr, #8]                    @ r1 = d1
        ldrsh   r4, [lr, #4]                    @ r4 = d4
        ldrsh   r6, [lr, #6]                    @ r6 = d6

        @ Even part (constant loads are interleaved with the arithmetic).
        ldr     r3, [r11, #FIX_0_541196100_ID]
        add     r7, r2, r6
        ldr     r5, [r11, #FIX_M_1_847759065_ID]
        mul     r7, r3, r7                      @ r7 = z1
        ldr     r3, [r11, #FIX_0_765366865_ID]
        mla     r6, r5, r6, r7                  @ r6 = tmp2
        add     r5, r0, r4                      @ r5 = tmp0
        mla     r2, r3, r2, r7                  @ r2 = tmp3
        sub     r3, r0, r4                      @ r3 = tmp1

        add     r0, r2, r5, lsl #CONST_BITS     @ r0 = tmp10
        rsb     r2, r2, r5, lsl #CONST_BITS     @ r2 = tmp13
        add     r4, r6, r3, lsl #CONST_BITS     @ r4 = tmp11
        rsb     r3, r6, r3, lsl #CONST_BITS     @ r3 = tmp12

        stmdb   sp!, { r0, r2, r3, r4 }         @ spill tmp10, tmp13, tmp12, tmp11

        @ Odd part.
        ldrsh   r3, [lr, #10]                   @ r3 = d3
        ldrsh   r5, [lr, #12]                   @ r5 = d5
        ldrsh   r7, [lr, #14]                   @ r7 = d7

        add     r0, r3, r5                      @ r0 = z2
        add     r2, r1, r7                      @ r2 = z1
        add     r4, r3, r7                      @ r4 = z3
        add     r6, r1, r5                      @ r6 = z4
        ldr     r9, [r11, #FIX_1_175875602_ID]
        add     r8, r4, r6                      @ r8 = z3 + z4
        ldr     r10, [r11, #FIX_M_0_899976223_ID]
        mul     r8, r9, r8                      @ r8 = z5
        ldr     r9, [r11, #FIX_M_2_562915447_ID]
        mul     r2, r10, r2                     @ r2 = z1
        ldr     r10, [r11, #FIX_M_1_961570560_ID]
        mul     r0, r9, r0                      @ r0 = z2
        ldr     r9, [r11, #FIX_M_0_390180644_ID]
        mla     r4, r10, r4, r8                 @ r4 = z3
        ldr     r10, [r11, #FIX_0_298631336_ID]
        mla     r6, r9, r6, r8                  @ r6 = z4
        ldr     r9, [r11, #FIX_2_053119869_ID]
        mla     r7, r10, r7, r2                 @ r7 = tmp0 + z1
        ldr     r10, [r11, #FIX_3_072711026_ID]
        mla     r5, r9, r5, r0                  @ r5 = tmp1 + z2
        ldr     r9, [r11, #FIX_1_501321110_ID]
        mla     r3, r10, r3, r0                 @ r3 = tmp2 + z2
        add     r7, r7, r4                      @ r7 = tmp0
        mla     r1, r9, r1, r2                  @ r1 = tmp3 + z1
        add     r5, r5, r6                      @ r5 = tmp1
        add     r3, r3, r4                      @ r3 = tmp2
        add     r1, r1, r6                      @ r1 = tmp3

        @ Reload the even part.  The registers differ from the ones spilled:
        @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11
        @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0
        ldmia   sp!, { r0, r2, r4, r6 }

        @ Each output is DESCALE(x, CONST_BITS-PASS1_BITS): add half of the
        @ divisor for rounding, then shift right arithmetically.

        @ tmp10 + tmp3
        add     r8, r0, r1
        add     r8, r8, #(1 << (ROW_DESCALE_BITS - 1))
        mov     r8, r8, asr #ROW_DESCALE_BITS
        strh    r8, [lr, #0]

        @ tmp10 - tmp3
        sub     r8, r0, r1
        add     r8, r8, #(1 << (ROW_DESCALE_BITS - 1))
        mov     r8, r8, asr #ROW_DESCALE_BITS
        strh    r8, [lr, #14]

        @ tmp11 + tmp2
        add     r8, r6, r3
        add     r8, r8, #(1 << (ROW_DESCALE_BITS - 1))
        mov     r8, r8, asr #ROW_DESCALE_BITS
        strh    r8, [lr, #2]

        @ tmp11 - tmp2
        sub     r8, r6, r3
        add     r8, r8, #(1 << (ROW_DESCALE_BITS - 1))
        mov     r8, r8, asr #ROW_DESCALE_BITS
        strh    r8, [lr, #12]

        @ tmp12 + tmp1
        add     r8, r4, r5
        add     r8, r8, #(1 << (ROW_DESCALE_BITS - 1))
        mov     r8, r8, asr #ROW_DESCALE_BITS
        strh    r8, [lr, #4]

        @ tmp12 - tmp1
        sub     r8, r4, r5
        add     r8, r8, #(1 << (ROW_DESCALE_BITS - 1))
        mov     r8, r8, asr #ROW_DESCALE_BITS
        strh    r8, [lr, #10]

        @ tmp13 + tmp0
        add     r8, r2, r7
        add     r8, r8, #(1 << (ROW_DESCALE_BITS - 1))
        mov     r8, r8, asr #ROW_DESCALE_BITS
        strh    r8, [lr, #6]

        @ tmp13 - tmp0
        sub     r8, r2, r7
        add     r8, r8, #(1 << (ROW_DESCALE_BITS - 1))
        mov     r8, r8, asr #ROW_DESCALE_BITS
        strh    r8, [lr, #8]

        @ End of row loop (full-row path)
        add     lr, lr, #16                     @ next row
        subs    r12, r12, #1
        bne     row_loop
        b       start_column_loop               @ last row done: skip the sparse-row code

empty_row:
        @ Only d0 is non-zero: store d0 << PASS1_BITS in all eight slots,
        @ two halfwords per word store.
        ldr     r1, [r11, #FIX_0xFFFF_ID]
        mov     r0, r0, lsl #PASS1_BITS
        and     r0, r0, r1                      @ keep the low 16 bits only
        add     r0, r0, r0, lsl #16             @ same value in both halfwords
        str     r0, [lr, #0]
        str     r0, [lr, #4]
        str     r0, [lr, #8]
        str     r0, [lr, #12]

end_of_row_loop:
        @ End of row loop (sparse-row paths)
        add     lr, lr, #16                     @ next row
        subs    r12, r12, #1
        bne     row_loop

start_column_loop:
        @ Start of column loop
        ldr     lr, [sp]                        @ lr = start of the block again
        mov     r12, #8                         @ r12 = column counter

        @ Rows are 16 bytes apart, so byte offset (2k*8) addresses row k of
        @ the current column.
column_loop:
        ldrsh   r0, [lr, #(0*8)]                @ r0 = d0
        ldrsh   r2, [lr, #(4*8)]                @ r2 = d2
        ldrsh   r4, [lr, #(8*8)]                @ r4 = d4
        ldrsh   r6, [lr, #(12*8)]               @ r6 = d6

        @ Even part.
        ldr     r3, [r11, #FIX_0_541196100_ID]
        add     r1, r2, r6
        ldr     r5, [r11, #FIX_M_1_847759065_ID]
        mul     r1, r3, r1                      @ r1 = z1
        ldr     r3, [r11, #FIX_0_765366865_ID]
        mla     r6, r5, r6, r1                  @ r6 = tmp2
        add     r5, r0, r4                      @ r5 = tmp0
        mla     r2, r3, r2, r1                  @ r2 = tmp3
        sub     r3, r0, r4                      @ r3 = tmp1

        add     r0, r2, r5, lsl #CONST_BITS     @ r0 = tmp10
        rsb     r2, r2, r5, lsl #CONST_BITS     @ r2 = tmp13
        add     r4, r6, r3, lsl #CONST_BITS     @ r4 = tmp11
        rsb     r6, r6, r3, lsl #CONST_BITS     @ r6 = tmp12

        ldrsh   r1, [lr, #(2*8)]                @ r1 = d1
        ldrsh   r3, [lr, #(6*8)]                @ r3 = d3
        ldrsh   r5, [lr, #(10*8)]               @ r5 = d5
        ldrsh   r7, [lr, #(14*8)]               @ r7 = d7

        @ Check for empty odd column (happens about 20 to 25 % of the time
        @ according to the original author's statistics).
        orr     r9, r1, r3
        orr     r10, r5, r7
        orrs    r10, r9, r10
        beq     empty_odd_column

        stmdb   sp!, { r0, r2, r4, r6 }         @ spill tmp10, tmp13, tmp11, tmp12

        @ Odd part.
        add     r0, r3, r5                      @ r0 = z2
        add     r2, r1, r7                      @ r2 = z1
        add     r4, r3, r7                      @ r4 = z3
        add     r6, r1, r5                      @ r6 = z4
        ldr     r9, [r11, #FIX_1_175875602_ID]
        add     r8, r4, r6                      @ r8 = z3 + z4
        ldr     r10, [r11, #FIX_M_0_899976223_ID]
        mul     r8, r9, r8                      @ r8 = z5
        ldr     r9, [r11, #FIX_M_2_562915447_ID]
        mul     r2, r10, r2                     @ r2 = z1
        ldr     r10, [r11, #FIX_M_1_961570560_ID]
        mul     r0, r9, r0                      @ r0 = z2
        ldr     r9, [r11, #FIX_M_0_390180644_ID]
        mla     r4, r10, r4, r8                 @ r4 = z3
        ldr     r10, [r11, #FIX_0_298631336_ID]
        mla     r6, r9, r6, r8                  @ r6 = z4
        ldr     r9, [r11, #FIX_2_053119869_ID]
        mla     r7, r10, r7, r2                 @ r7 = tmp0 + z1
        ldr     r10, [r11, #FIX_3_072711026_ID]
        mla     r5, r9, r5, r0                  @ r5 = tmp1 + z2
        ldr     r9, [r11, #FIX_1_501321110_ID]
        mla     r3, r10, r3, r0                 @ r3 = tmp2 + z2
        add     r7, r7, r4                      @ r7 = tmp0
        mla     r1, r9, r1, r2                  @ r1 = tmp3 + z1
        add     r5, r5, r6                      @ r5 = tmp1
        add     r3, r3, r4                      @ r3 = tmp2
        add     r1, r1, r6                      @ r1 = tmp3

        @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12
        @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0
        ldmia   sp!, { r0, r2, r4, r6 }

        @ Each output is DESCALE(x, CONST_BITS+PASS1_BITS+3).

        @ tmp10 + tmp3
        add     r8, r0, r1
        add     r8, r8, #(1 << (COL_DESCALE_BITS - 1))
        mov     r8, r8, asr #COL_DESCALE_BITS
        strh    r8, [lr, #(0*8)]

        @ tmp10 - tmp3
        sub     r8, r0, r1
        add     r8, r8, #(1 << (COL_DESCALE_BITS - 1))
        mov     r8, r8, asr #COL_DESCALE_BITS
        strh    r8, [lr, #(14*8)]

        @ tmp11 + tmp2
        add     r8, r4, r3
        add     r8, r8, #(1 << (COL_DESCALE_BITS - 1))
        mov     r8, r8, asr #COL_DESCALE_BITS
        strh    r8, [lr, #(2*8)]

        @ tmp11 - tmp2
        sub     r8, r4, r3
        add     r8, r8, #(1 << (COL_DESCALE_BITS - 1))
        mov     r8, r8, asr #COL_DESCALE_BITS
        strh    r8, [lr, #(12*8)]

        @ tmp12 + tmp1
        add     r8, r6, r5
        add     r8, r8, #(1 << (COL_DESCALE_BITS - 1))
        mov     r8, r8, asr #COL_DESCALE_BITS
        strh    r8, [lr, #(4*8)]

        @ tmp12 - tmp1
        sub     r8, r6, r5
        add     r8, r8, #(1 << (COL_DESCALE_BITS - 1))
        mov     r8, r8, asr #COL_DESCALE_BITS
        strh    r8, [lr, #(10*8)]

        @ tmp13 + tmp0
        add     r8, r2, r7
        add     r8, r8, #(1 << (COL_DESCALE_BITS - 1))
        mov     r8, r8, asr #COL_DESCALE_BITS
        strh    r8, [lr, #(6*8)]

        @ tmp13 - tmp0
        sub     r8, r2, r7
        add     r8, r8, #(1 << (COL_DESCALE_BITS - 1))
        mov     r8, r8, asr #COL_DESCALE_BITS
        strh    r8, [lr, #(8*8)]

        @ End of column loop (full-column path)
        add     lr, lr, #2                      @ next column
        subs    r12, r12, #1
        bne     column_loop
        b       the_end                         @ last column done

empty_odd_column:
        @ All odd inputs are 0, so tmp0..tmp3 are 0 and each sum/difference
        @ pair collapses to the same value, stored to both mirrored rows.

        @ DESCALE(tmp10, CONST_BITS+PASS1_BITS+3)
        add     r0, r0, #(1 << (COL_DESCALE_BITS - 1))
        mov     r0, r0, asr #COL_DESCALE_BITS
        strh    r0, [lr, #(0*8)]
        strh    r0, [lr, #(14*8)]

        @ DESCALE(tmp11, CONST_BITS+PASS1_BITS+3)
        add     r4, r4, #(1 << (COL_DESCALE_BITS - 1))
        mov     r4, r4, asr #COL_DESCALE_BITS
        strh    r4, [lr, #(2*8)]
        strh    r4, [lr, #(12*8)]

        @ DESCALE(tmp12, CONST_BITS+PASS1_BITS+3)
        add     r6, r6, #(1 << (COL_DESCALE_BITS - 1))
        mov     r6, r6, asr #COL_DESCALE_BITS
        strh    r6, [lr, #(4*8)]
        strh    r6, [lr, #(10*8)]

        @ DESCALE(tmp13, CONST_BITS+PASS1_BITS+3)
        add     r2, r2, #(1 << (COL_DESCALE_BITS - 1))
        mov     r2, r2, asr #COL_DESCALE_BITS
        strh    r2, [lr, #(6*8)]
        strh    r2, [lr, #(8*8)]

        @ End of column loop (empty-odd-column path)
        add     lr, lr, #2                      @ next column
        subs    r12, r12, #1
        bne     column_loop

the_end:
        @ The end....
        add     sp, sp, #4                      @ drop the saved block pointer
        ldmia   sp!, { r4 - r12, pc }           @ restore saved registers and return

373 | |

374 |
const_array: |

375 |
.align |

376 |
.word FIX_0_298631336 |

377 |
.word FIX_0_541196100 |

378 |
.word FIX_0_765366865 |

379 |
.word FIX_1_175875602 |

380 |
.word FIX_1_501321110 |

381 |
.word FIX_2_053119869 |

382 |
.word FIX_3_072711026 |

383 |
.word FIX_M_0_390180644 |

384 |
.word FIX_M_0_899976223 |

385 |
.word FIX_M_1_847759065 |

386 |
.word FIX_M_1_961570560 |

387 |
.word FIX_M_2_562915447 |

388 |
.word FIX_0xFFFF |