## ffmpeg / libavcodec / arm / simple_idct_armv6.S @ 2912e87a

History | View | Annotate | Download (13.1 KB)

1 |
/* |
---|---|

2 |
* Simple IDCT |

3 |
* |

4 |
* Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> |

5 |
* Copyright (c) 2007 Mans Rullgard <mans@mansr.com> |

6 |
* |

7 |
* This file is part of Libav. |

8 |
* |

9 |
* Libav is free software; you can redistribute it and/or |

10 |
* modify it under the terms of the GNU Lesser General Public |

11 |
* License as published by the Free Software Foundation; either |

12 |
* version 2.1 of the License, or (at your option) any later version. |

13 |
* |

14 |
* Libav is distributed in the hope that it will be useful, |

15 |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |

16 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |

17 |
* Lesser General Public License for more details. |

18 |
* |

19 |
* You should have received a copy of the GNU Lesser General Public |

20 |
* License along with Libav; if not, write to the Free Software |

21 |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |

22 |
*/ |

23 | |

24 |
#include "asm.S" |

25 | |

26 |
#define W1 22725 /* cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ |

27 |
#define W2 21407 /* cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ |

28 |
#define W3 19266 /* cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ |

29 |
#define W4 16383 /* cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5 would give 16384; the value used is 16383 */ |

30 |
#define W5 12873 /* cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ |

31 |
#define W6 8867 /* cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ |

32 |
#define W7 4520 /* cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ |

33 |
#define ROW_SHIFT 11 /* shift bit count used by the row pass (idct_row_armv6) */ |

34 |
#define COL_SHIFT 20 /* shift bit count used by the column passes (idct_col_*_armv6) */ |

35 | |

36 |
#define W13 (W1 | (W3 << 16)) /* packed pair: W1 in bits 0-15, W3 in bits 16-31 */ |

37 |
#define W26 (W2 | (W6 << 16)) /* packed pair: W2 in bits 0-15, W6 in bits 16-31 */ |

38 |
#define W42 (W4 | (W2 << 16)) /* packed pair: W4 in bits 0-15, W2 in bits 16-31 */ |

39 |
#define W42n (-W4&0xffff | (-W2 << 16)) /* packed pair: -W4 in bits 0-15, -W2 in bits 16-31 */ |

40 |
#define W46 (W4 | (W6 << 16)) /* packed pair: W4 in bits 0-15, W6 in bits 16-31 */ |

41 |
#define W57 (W5 | (W7 << 16)) /* packed pair: W5 in bits 0-15, W7 in bits 16-31 */ |

42 | |

43 |
.text |

44 |
.align |
/* Literal words holding the packed coefficient pairs; the code below reads them with "ldr reg, label". */

45 |
w13: .long W13 |

46 |
w26: .long W26 |

47 |
w42: .long W42 |

48 |
w42n: .long W42n |

49 |
w46: .long W46 |

50 |
w57: .long W57 |

51 | |

52 |
/* |

53 |
Compute partial IDCT of single row. |

54 |
shift = bit count of the shift applied afterwards by the caller; |
it is only used here to add the rounding bias 1 << (shift-1) to A0..A3
(the result is not shifted by this macro)

55 |
r0 = source address |

56 |
r2 = row[2,0] <= 2 cycles |

57 |
r3 = row[3,1] |

58 |
ip = w42 <= 2 cycles |

59 | |

60 |
Output in registers r4--r11: |
r4..r7 = A0..A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3
Overwrites r1, r2, r3, ip and lr.

61 |
*/ |

62 |
.macro idct_row shift |

63 |
ldr lr, w46 /* lr = W4 | (W6 << 16) */ |

64 |
mov r1, #(1<<(\shift-1)) /* r1 = rounding bias */ |

65 |
smlad r4, r2, ip, r1 /* r4 = A0 = W4*row[0] + W2*row[2] + bias */ |

66 |
smlsd r7, r2, ip, r1 /* r7 = A3 = W4*row[0] - W2*row[2] + bias */ |

67 |
ldr ip, w13 /* ip = W1 | (W3 << 16) */ |

68 |
ldr r10,w57 /* r10 = W5 | (W7 << 16) */ |

69 |
smlad r5, r2, lr, r1 /* r5 = A1 = W4*row[0] + W6*row[2] + bias */ |

70 |
smlsd r6, r2, lr, r1 /* r6 = A2 = W4*row[0] - W6*row[2] + bias */ |

71 | |

72 |
smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */ |

73 |
smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */ |

74 |
ldr lr, [r0, #12] /* lr = row[7,5] */ |

75 |
pkhtb r2, ip, r10,asr #16 /* r2 = W7 | (W3 << 16) */ |

76 |
pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */ |

77 |
smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */ |

78 |
smlad r8, lr, r10,r8 /* B0 += W5*row[5] + W7*row[7] */ |

79 |
smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */ |

80 | |

81 |
ldr r3, w42n /* r3 = -W4 | (-W2 << 16) */ |

82 |
smlad r10,lr, r2, r10 /* B2 += W7*row[5] + W3*row[7] */ |

83 |
ldr r2, [r0, #4] /* r2 = row[6,4] */ |

84 |
smlsdx r11,lr, ip, r11 /* B3 += W3*row[5] - W1*row[7] */ |

85 |
ldr ip, w46 /* ip = W4 | (W6 << 16) */ |

86 |
smlad r9, lr, r1, r9 /* B1 -= W1*row[5] + W5*row[7] */ |

87 | |

88 |
smlad r5, r2, r3, r5 /* A1 += -W4*row[4] - W2*row[6] */ |

89 |
smlsd r6, r2, r3, r6 /* A2 += -W4*row[4] + W2*row[6] */ |

90 |
smlad r4, r2, ip, r4 /* A0 += W4*row[4] + W6*row[6] */ |

91 |
smlsd r7, r2, ip, r7 /* A3 += W4*row[4] - W6*row[6] */ |

92 |
.endm |

93 | |

94 |
/* |

95 |
Compute partial IDCT of half row. |
Same terms as idct_row for row[0..3]; row[4..7] are not read here
(no load from r0 is performed).

96 |
shift = bit count of the shift applied afterwards by the caller; |
it is only used here to add the rounding bias 1 << (shift-1) to A0..A3

97 |
r2 = row[2,0] |

98 |
r3 = row[3,1] |

99 |
ip = w42 |

100 | |

101 |
Output in registers r4--r11: |
r4..r7 = A0..A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3
Overwrites r1, r2, ip and lr.

102 |
*/ |

103 |
.macro idct_row4 shift |

104 |
ldr lr, w46 /* lr = W4 | (W6 << 16) */ |

105 |
ldr r10,w57 /* r10 = W5 | (W7 << 16) */ |

106 |
mov r1, #(1<<(\shift-1)) /* r1 = rounding bias */ |

107 |
smlad r4, r2, ip, r1 /* r4 = A0 = W4*row[0] + W2*row[2] + bias */ |

108 |
smlsd r7, r2, ip, r1 /* r7 = A3 = W4*row[0] - W2*row[2] + bias */ |

109 |
ldr ip, w13 /* ip = W1 | (W3 << 16) */ |

110 |
smlad r5, r2, lr, r1 /* r5 = A1 = W4*row[0] + W6*row[2] + bias */ |

111 |
smlsd r6, r2, lr, r1 /* r6 = A2 = W4*row[0] - W6*row[2] + bias */ |

112 |
smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */ |

113 |
smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */ |

114 |
pkhtb r2, ip, r10,asr #16 /* r2 = W7 | (W3 << 16) */ |

115 |
pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */ |

116 |
smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */ |

117 |
smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */ |

118 |
.endm |

119 | |

120 |
/* |

121 |
Compute final part of IDCT single row without shift. |

122 |
Input in registers r4--r11 |
(r4..r7 = A0..A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3)

123 |
Output in registers ip, r4--r6, lr, r8--r10 |
ip = A0+B0, r4 = A1+B1, r5 = A2+B2, r6 = A3+B3,
r10 = A3-B3, r9 = A2-B2, r8 = A1-B1, lr = A0-B0
r9 holds -B1 on input, which is why the A1 lines use sub for the sum
and add for the difference.

124 |
*/ |

125 |
.macro idct_finish |

126 |
add ip, r4, r8 /* ip = A0 + B0 */ |

127 |
sub lr, r4, r8 /* lr = A0 - B0 */ |

128 |
sub r4, r5, r9 /* r4 = A1 + B1 */ |

129 |
add r8, r5, r9 /* r8 = A1 - B1 */ |

130 |
add r5, r6, r10 /* r5 = A2 + B2 */ |

131 |
sub r9, r6, r10 /* r9 = A2 - B2 */ |

132 |
add r6, r7, r11 /* r6 = A3 + B3 */ |

133 |
sub r10,r7, r11 /* r10 = A3 - B3 */ |

134 |
.endm |

135 | |

136 |
/* |

137 |
Compute final part of IDCT single row. |

138 |
shift = right-shift amount |

139 |
Input/output in registers r4--r11 |
On input: r4..r7 = A0..A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3
On output: r4..r7 = (A0+B0, A1+B1, A2+B2, A3+B3) asr shift,
r8..r11 = (A0-B0, A1-B1, A2-B2, A3-B3) asr shift
Overwrites r2 and r3.

140 |
*/ |

141 |
.macro idct_finish_shift shift |

142 |
add r3, r4, r8 /* r3 = A0 + B0 */ |

143 |
sub r2, r4, r8 /* r2 = A0 - B0 */ |

144 |
mov r4, r3, asr #\shift |

145 |
mov r8, r2, asr #\shift |

146 | |

147 |
sub r3, r5, r9 /* r3 = A1 + B1 (r9 holds -B1) */ |

148 |
add r2, r5, r9 /* r2 = A1 - B1 */ |

149 |
mov r5, r3, asr #\shift |

150 |
mov r9, r2, asr #\shift |

151 | |

152 |
add r3, r6, r10 /* r3 = A2 + B2 */ |

153 |
sub r2, r6, r10 /* r2 = A2 - B2 */ |

154 |
mov r6, r3, asr #\shift |

155 |
mov r10,r2, asr #\shift |

156 | |

157 |
add r3, r7, r11 /* r3 = A3 + B3 */ |

158 |
sub r2, r7, r11 /* r2 = A3 - B3 */ |

159 |
mov r7, r3, asr #\shift |

160 |
mov r11,r2, asr #\shift |

161 |
.endm |

162 | |

163 |
/* |

164 |
Compute final part of IDCT single row, saturating results at 8 bits. |

165 |
shift = right-shift amount |

166 |
Input/output in registers r4--r11 |
On input: r4..r7 = A0..A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3
On output: r4..r7 = usat8((A0+B0, A1+B1, A2+B2, A3+B3) asr shift),
r8..r11 = usat8((A0-B0, A1-B1, A2-B2, A3-B3) asr shift)
Overwrites r3 and ip.

167 |
*/ |

168 |
.macro idct_finish_shift_sat shift |

169 |
add r3, r4, r8 /* r3 = A0 + B0 */ |

170 |
sub ip, r4, r8 /* ip = A0 - B0 */ |

171 |
usat r4, #8, r3, asr #\shift |

172 |
usat r8, #8, ip, asr #\shift |

173 | |

174 |
sub r3, r5, r9 /* r3 = A1 + B1 (r9 holds -B1) */ |

175 |
add ip, r5, r9 /* ip = A1 - B1 */ |

176 |
usat r5, #8, r3, asr #\shift |

177 |
usat r9, #8, ip, asr #\shift |

178 | |

179 |
add r3, r6, r10 /* r3 = A2 + B2 */ |

180 |
sub ip, r6, r10 /* ip = A2 - B2 */ |

181 |
usat r6, #8, r3, asr #\shift |

182 |
usat r10,#8, ip, asr #\shift |

183 | |

184 |
add r3, r7, r11 /* r3 = A3 + B3 */ |

185 |
sub ip, r7, r11 /* ip = A3 - B3 */ |

186 |
usat r7, #8, r3, asr #\shift |

187 |
usat r11,#8, ip, asr #\shift |

188 |
.endm |

189 | |

190 |
/* |

191 |
Compute IDCT of single row, storing as column. |

192 |
r0 = source |

193 |
r1 = dest |
r0 and r1 are unchanged on return. r2, r3, ip and (on the non-shortcut
path) r4--r11 are overwritten without being saved here.
The eight 16-bit results are stored 16 bytes apart starting at r1.

194 |
*/ |

195 |
function idct_row_armv6 |

196 |
push {lr} |

197 | |

198 |
ldr lr, [r0, #12] /* lr = row[7,5] */ |

199 |
ldr ip, [r0, #4] /* ip = row[6,4] */ |

200 |
ldr r3, [r0, #8] /* r3 = row[3,1] */ |

201 |
ldr r2, [r0] /* r2 = row[2,0] */ |

202 |
orrs lr, lr, ip /* lr = row[7,5] | row[6,4]; Z set when row[4..7] are all zero */ |

203 |
cmpeq lr, r3 /* if so: Z = (row[3,1] == 0) */ |

204 |
cmpeq lr, r2, lsr #16 /* if so: Z = (row[2] == 0) */ |

205 |
beq 1f /* row[1..7] all zero: take the DC shortcut */ |

206 |
push {r1} /* idct_row / idct_row4 overwrite r1 */ |

207 |
ldr ip, w42 /* ip = W4 | (W2 << 16) */ |

208 |
cmp lr, #0 |

209 |
beq 2f /* row[4..7] all zero: the half-row macro is enough */ |

210 | |

211 |
idct_row ROW_SHIFT |

212 |
b 3f |

213 | |

214 |
2: idct_row4 ROW_SHIFT |

215 | |

216 |
3: pop {r1} |

217 |
idct_finish_shift ROW_SHIFT |

218 | |
/* r4..r7 (sums) go to byte offsets 0, 32, 64, 96; r11..r8 (differences) to 16, 48, 80, 112 */

219 |
strh r4, [r1] |

220 |
strh r5, [r1, #(16*2)] |

221 |
strh r6, [r1, #(16*4)] |

222 |
strh r7, [r1, #(16*6)] |

223 |
strh r11,[r1, #(16*1)] |

224 |
strh r10,[r1, #(16*3)] |

225 |
strh r9, [r1, #(16*5)] |

226 |
strh r8, [r1, #(16*7)] |

227 | |

228 |
pop {pc} |

229 | |

230 |
1: mov r2, r2, lsl #3 /* DC only: row[0] << 3 (upper halfword of r2 is zero on this path) */ |

231 |
strh r2, [r1] /* the same value is written to all eight output slots */ |

232 |
strh r2, [r1, #(16*2)] |

233 |
strh r2, [r1, #(16*4)] |

234 |
strh r2, [r1, #(16*6)] |

235 |
strh r2, [r1, #(16*1)] |

236 |
strh r2, [r1, #(16*3)] |

237 |
strh r2, [r1, #(16*5)] |

238 |
strh r2, [r1, #(16*7)] |

239 |
pop {pc} |

240 |
endfunc |

241 | |

242 |
/* |

243 |
Compute IDCT of single column, read as row. |

244 |
r0 = source |

245 |
r1 = dest |
r0 and r1 are unchanged on return. r2, r3, ip and r4--r11 are
overwritten without being saved here.
The eight 16-bit results are stored 16 bytes apart starting at r1.

246 |
*/ |

247 |
function idct_col_armv6 |

248 |
push {r1, lr} /* idct_row overwrites r1 and lr */ |

249 | |

250 |
ldr r2, [r0] /* r2 = row[2,0] */ |

251 |
ldr ip, w42 /* ip = W4 | (W2 << 16) */ |

252 |
ldr r3, [r0, #8] /* r3 = row[3,1] */ |

253 |
idct_row COL_SHIFT |

254 |
pop {r1} |

255 |
idct_finish_shift COL_SHIFT |

256 | |
/* sums r4..r7 go to slots 0..3, differences r11..r8 to slots 4..7 */

257 |
strh r4, [r1] |

258 |
strh r5, [r1, #(16*1)] |

259 |
strh r6, [r1, #(16*2)] |

260 |
strh r7, [r1, #(16*3)] |

261 |
strh r11,[r1, #(16*4)] |

262 |
strh r10,[r1, #(16*5)] |

263 |
strh r9, [r1, #(16*6)] |

264 |
strh r8, [r1, #(16*7)] |

265 | |

266 |
pop {pc} /* pops the lr saved on entry */ |

267 |
endfunc |

268 | |

269 |
/* |

270 |
Compute IDCT of single column, read as row, store saturated 8-bit. |

271 |
r0 = source |

272 |
r1 = dest |

273 |
r2 = line size |
r0, r1 and r2 are unchanged on return. r3, ip and r4--r11 are
overwritten without being saved here.
One byte is written to each of the eight lines r1, r1+r2, ..., r1+7*r2.

274 |
*/ |

275 |
function idct_col_put_armv6 |

276 |
push {r1, r2, lr} /* idct_row overwrites r1, r2 and lr */ |

277 | |

278 |
ldr r2, [r0] /* r2 = row[2,0] */ |

279 |
ldr ip, w42 /* ip = W4 | (W2 << 16) */ |

280 |
ldr r3, [r0, #8] /* r3 = row[3,1] */ |

281 |
idct_row COL_SHIFT |

282 |
pop {r1, r2} |

283 |
idct_finish_shift_sat COL_SHIFT |

284 | |
/* post-indexed stores: each one advances r1 by one line (r2 bytes) */

285 |
strb r4, [r1], r2 |

286 |
strb r5, [r1], r2 |

287 |
strb r6, [r1], r2 |

288 |
strb r7, [r1], r2 |

289 |
strb r11,[r1], r2 |

290 |
strb r10,[r1], r2 |

291 |
strb r9, [r1], r2 |

292 |
strb r8, [r1], r2 |

293 | |

294 |
sub r1, r1, r2, lsl #3 /* undo the eight line advances: r1 is back at its entry value */ |

295 | |

296 |
pop {pc} /* pops the lr saved on entry */ |

297 |
endfunc |

298 | |

299 |
/* |

300 |
Compute IDCT of single column, read as row, add/store saturated 8-bit. |

301 |
r0 = source |

302 |
r1 = dest |

303 |
r2 = line size |
r0, r1 and r2 are unchanged on return. r3, ip and r4--r11 are
overwritten without being saved here.
For each of the eight lines r1, r1+r2, ..., r1+7*r2 the existing byte is
read, the IDCT result (asr COL_SHIFT) is added, and the sum is stored
back saturated to 8 bits. "line n" below means the byte at entry r1 + n*r2.

304 |
*/ |

305 |
function idct_col_add_armv6 |

306 |
push {r1, r2, lr} /* idct_row overwrites r1, r2 and lr */ |

307 | |

308 |
ldr r2, [r0] /* r2 = row[2,0] */ |

309 |
ldr ip, w42 /* ip = W4 | (W2 << 16) */ |

310 |
ldr r3, [r0, #8] /* r3 = row[3,1] */ |

311 |
idct_row COL_SHIFT |

312 |
pop {r1, r2} |

313 |
idct_finish |

314 | |

315 |
ldrb r3, [r1] /* r3 = line 0 */ |

316 |
ldrb r7, [r1, r2] /* r7 = line 1 */ |

317 |
ldrb r11,[r1, r2, lsl #2] /* r11 = line 4; NOTE(review): r11 is reloaded below before it is read, so this value is never used */ |

318 |
add ip, r3, ip, asr #COL_SHIFT /* line 0 + (A0 + B0) */ |

319 |
usat ip, #8, ip |

320 |
add r4, r7, r4, asr #COL_SHIFT /* line 1 + (A1 + B1) */ |

321 |
strb ip, [r1], r2 /* store line 0; r1 -> line 1 */ |

322 |
ldrb ip, [r1, r2] /* ip = line 2 */ |

323 |
usat r4, #8, r4 |

324 |
ldrb r11,[r1, r2, lsl #2] /* r11 = line 5 */ |

325 |
add r5, ip, r5, asr #COL_SHIFT /* line 2 + (A2 + B2) */ |

326 |
usat r5, #8, r5 |

327 |
strb r4, [r1], r2 /* store line 1; r1 -> line 2 */ |

328 |
ldrb r3, [r1, r2] /* r3 = line 3 */ |

329 |
ldrb ip, [r1, r2, lsl #2] /* ip = line 6 */ |

330 |
strb r5, [r1], r2 /* store line 2; r1 -> line 3 */ |

331 |
ldrb r7, [r1, r2] /* r7 = line 4 */ |

332 |
ldrb r4, [r1, r2, lsl #2] /* r4 = line 7 */ |

333 |
add r6, r3, r6, asr #COL_SHIFT /* line 3 + (A3 + B3) */ |

334 |
usat r6, #8, r6 |

335 |
add r10,r7, r10,asr #COL_SHIFT /* line 4 + (A3 - B3) */ |

336 |
usat r10,#8, r10 |

337 |
add r9, r11,r9, asr #COL_SHIFT /* line 5 + (A2 - B2) */ |

338 |
usat r9, #8, r9 |

339 |
add r8, ip, r8, asr #COL_SHIFT /* line 6 + (A1 - B1) */ |

340 |
usat r8, #8, r8 |

341 |
add lr, r4, lr, asr #COL_SHIFT /* line 7 + (A0 - B0) */ |

342 |
usat lr, #8, lr |

343 |
strb r6, [r1], r2 /* store lines 3..7, advancing r1 one line each time */ |

344 |
strb r10,[r1], r2 |

345 |
strb r9, [r1], r2 |

346 |
strb r8, [r1], r2 |

347 |
strb lr, [r1], r2 |

348 | |

349 |
sub r1, r1, r2, lsl #3 /* undo the eight line advances: r1 is back at its entry value */ |

350 | |

351 |
pop {pc} /* pops the lr saved on entry */ |

352 |
endfunc |

353 | |

354 |
/* |

355 |
Compute 8 IDCT row transforms. |

356 |
func = IDCT row->col function |

357 |
width = width of columns in bytes |
func is called eight times with r0 at byte offsets
0, 32, 64, 96, 16, 48, 80, 112 from its entry value, in that order,
while r1 advances by width before each call after the first.
On exit r0 is back at its entry value and r1 is entry value + 7*width.
The arithmetic relies on func returning with r0 and r1 unchanged.
lr is overwritten by the bl instructions.

358 |
*/ |

359 |
.macro idct_rows func width |

360 |
bl \func |

361 |
add r0, r0, #(16*2) |

362 |
add r1, r1, #\width |

363 |
bl \func |

364 |
add r0, r0, #(16*2) |

365 |
add r1, r1, #\width |

366 |
bl \func |

367 |
add r0, r0, #(16*2) |

368 |
add r1, r1, #\width |

369 |
bl \func |

370 |
sub r0, r0, #(16*5) /* from offset 96 back to offset 16 */ |

371 |
add r1, r1, #\width |

372 |
bl \func |

373 |
add r0, r0, #(16*2) |

374 |
add r1, r1, #\width |

375 |
bl \func |

376 |
add r0, r0, #(16*2) |

377 |
add r1, r1, #\width |

378 |
bl \func |

379 |
add r0, r0, #(16*2) |

380 |
add r1, r1, #\width |

381 |
bl \func |

382 | |

383 |
sub r0, r0, #(16*7) /* from offset 112 back to the entry value */ |

384 |
.endm |

385 | |

386 |
/* void ff_simple_idct_armv6(DCTELEM *data); */ |
/* In-place transform: the first pass reads from data (r0) and writes a
   128-byte scratch buffer on the stack; the second pass reads that buffer
   and writes the results back to data. */

387 |
function ff_simple_idct_armv6, export=1 |

388 |
push {r4-r11, lr} |

389 |
sub sp, sp, #128 /* 128-byte scratch buffer */ |

390 | |

391 |
mov r1, sp /* first pass: r0 = data, r1 = scratch */ |

392 |
idct_rows idct_row_armv6, 2 |

393 |
mov r1, r0 /* second pass: dest = data (idct_rows left r0 at its entry value) */ |

394 |
mov r0, sp /* second pass: source = scratch */ |

395 |
idct_rows idct_col_armv6, 2 |

396 | |

397 |
add sp, sp, #128 |

398 |
pop {r4-r11, pc} |

399 |
endfunc |

400 | |

401 |
/* ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data); */ |
/* First pass: data (r2) -> 128-byte scratch buffer on the stack.
   Second pass: scratch -> dest via idct_col_add_armv6, which adds each
   result to the existing dest byte and stores it saturated to 8 bits. */

402 |
function ff_simple_idct_add_armv6, export=1 |

403 |
push {r0, r1, r4-r11, lr} /* dest and line_size are kept at the lowest two stack slots */ |

404 |
sub sp, sp, #128 /* 128-byte scratch buffer */ |

405 | |

406 |
mov r0, r2 /* first pass: source = data */ |

407 |
mov r1, sp /* first pass: dest = scratch */ |

408 |
idct_rows idct_row_armv6, 2 |

409 |
mov r0, sp /* second pass: source = scratch */ |

410 |
ldr r1, [sp, #128] /* r1 = saved dest */ |

411 |
ldr r2, [sp, #(128+4)] /* r2 = saved line_size */ |

412 |
idct_rows idct_col_add_armv6, 1 |

413 | |

414 |
add sp, sp, #(128+8) /* drop the scratch buffer and the saved r0/r1 */ |

415 |
pop {r4-r11, pc} |

416 |
endfunc |

417 | |

418 |
/* ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data); */ |
/* First pass: data (r2) -> 128-byte scratch buffer on the stack.
   Second pass: scratch -> dest via idct_col_put_armv6, which stores each
   result saturated to 8 bits. */

419 |
function ff_simple_idct_put_armv6, export=1 |

420 |
push {r0, r1, r4-r11, lr} /* dest and line_size are kept at the lowest two stack slots */ |

421 |
sub sp, sp, #128 /* 128-byte scratch buffer */ |

422 | |

423 |
mov r0, r2 /* first pass: source = data */ |

424 |
mov r1, sp /* first pass: dest = scratch */ |

425 |
idct_rows idct_row_armv6, 2 |

426 |
mov r0, sp /* second pass: source = scratch */ |

427 |
ldr r1, [sp, #128] /* r1 = saved dest */ |

428 |
ldr r2, [sp, #(128+4)] /* r2 = saved line_size */ |

429 |
idct_rows idct_col_put_armv6, 1 |

430 | |

431 |
add sp, sp, #(128+8) /* drop the scratch buffer and the saved r0/r1 */ |

432 |
pop {r4-r11, pc} |

433 |
endfunc |