## ffmpeg / libavcodec / armv4l / simple_idct_armv6.S @ d761f089

History | View | Annotate | Download (14 KB)

/*
 * Simple IDCT
 *
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 * Copyright (c) 2007 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

23 | |

/*
 * Fixed-point IDCT weights.
 * Wi = cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 */
#define W1 22725        /* i = 1 */
#define W2 21407        /* i = 2 */
#define W3 19266        /* i = 3 */
/* NOTE(review): for i = 4 the formula above evaluates to 16384; the value
   used here is 16383.  Kept exactly as found -- confirm this is intended. */
#define W4 16383        /* i = 4 */
#define W5 12873        /* i = 5 */
#define W6 8867         /* i = 6 */
#define W7 4520         /* i = 7 */

/* Shift passed to the row-pass and column-pass macros respectively. */
#define ROW_SHIFT 11
#define COL_SHIFT 20

/*
 * Pairs of weights packed into one 32-bit word:
 * first weight in the low halfword, second in the high halfword.
 */
#define W13  (W1 | (W3 << 16))
#define W26  (W2 | (W6 << 16))
#define W42  (W4 | (W2 << 16))
/* Negated pair: -W4 masked to 16 bits in the low half, -W2 in the high half. */
#define W42n ((-W4 & 0xffff) | (-W2 << 16))
#define W46  (W4 | (W6 << 16))
#define W57  (W5 | (W7 << 16))

40 | |

/*
 * Packed weight words.  They are placed in .text because the code in this
 * file loads them with PC-relative ldr instructions.
 */
        .text
        .align
w13:    .word   W13                     /* W1 | (W3 << 16) */
w26:    .word   W26                     /* W2 | (W6 << 16) */
w42:    .word   W42                     /* W4 | (W2 << 16) */
w42n:   .word   W42n                    /* -W4 | (-W2 << 16) */
w46:    .word   W46                     /* W4 | (W6 << 16) */
w57:    .word   W57                     /* W5 | (W7 << 16) */

49 | |

/*
  Compute partial IDCT of single row.

  shift = shift amount of the finishing step that follows; this macro only
          uses it to pre-add the rounding bias 1 << (shift-1) to A0..A3
  a1 = source address (row[6,4] is read from [a1, #4], row[7,5] from [a1, #12])
  a3 = row[2,0]   (original note: "<= 2 cycles")
  a4 = row[3,1]
  ip = w42        (original note: "<= 2 cycles")

  "row[x,y]" denotes one 32-bit word: row[y] in the low halfword,
  row[x] in the high halfword.

  Output in registers v1--v8:
    v1 = A0, v2 = A1, v3 = A2, v4 = A3
    v5 = B0, v6 = -B1, v7 = B2, fp (v8) = B3
  Overwrites a2, a3, a4, ip and lr.
*/
        .macro idct_row shift
        ldr     lr, w46                 /* lr = W4 | (W6 << 16) */
        mov     a2, #(1<<(\shift-1))    /* a2 = rounding bias */
        smlad   v1, a3, ip, a2          /* v1 = A0 = W4*row[0] + W2*row[2] + bias */
        smlsd   v4, a3, ip, a2          /* v4 = A3 = W4*row[0] - W2*row[2] + bias */
        ldr     ip, w13                 /* ip = W1 | (W3 << 16) */
        ldr     v7, w57                 /* v7 = W5 | (W7 << 16) */
        smlad   v2, a3, lr, a2          /* v2 = A1 = W4*row[0] + W6*row[2] + bias */
        smlsd   v3, a3, lr, a2          /* v3 = A2 = W4*row[0] - W6*row[2] + bias */

        smuad   v5, a4, ip              /* v5 = B0 = W1*row[1] + W3*row[3] */
        smusdx  fp, a4, v7              /* fp = B3 = W7*row[1] - W5*row[3] */
        ldr     lr, [a1, #12]           /* lr = row[7,5] */
        pkhtb   a3, ip, v7, asr #16     /* a3 = W7 | (W3 << 16) */
        pkhbt   a2, ip, v7, lsl #16     /* a2 = W1 | (W5 << 16) */
        smusdx  v6, a3, a4              /* v6 = -B1 = W7*row[3] - W3*row[1] */
        smlad   v5, lr, v7, v5          /* B0 += W5*row[5] + W7*row[7] */
        smusdx  v7, a4, a2              /* v7 = B2 = W5*row[1] - W1*row[3] */

        ldr     a4, w42n                /* a4 = -W4 | (-W2 << 16) */
        smlad   v7, lr, a3, v7          /* B2 += W7*row[5] + W3*row[7] */
        ldr     a3, [a1, #4]            /* a3 = row[6,4] */
        smlsdx  fp, lr, ip, fp          /* B3 += W3*row[5] - W1*row[7] */
        ldr     ip, w46                 /* ip = W4 | (W6 << 16) */
        smlad   v6, lr, a2, v6          /* B1 -= W1*row[5] + W5*row[7] */

        smlad   v2, a3, a4, v2          /* A1 += -W4*row[4] - W2*row[6] */
        smlsd   v3, a3, a4, v3          /* A2 += -W4*row[4] + W2*row[6] */
        smlad   v1, a3, ip, v1          /* A0 += W4*row[4] + W6*row[6] */
        smlsd   v4, a3, ip, v4          /* A3 += W4*row[4] - W6*row[6] */
        .endm

91 | |

/*
  Compute partial IDCT of half row: only row[0..3] contribute, nothing is
  read from memory.

  shift = shift amount of the finishing step that follows; this macro only
          uses it to pre-add the rounding bias 1 << (shift-1) to A0..A3
  a3 = row[2,0]
  a4 = row[3,1]
  ip = w42

  Output in registers v1--v8:
    v1 = A0, v2 = A1, v3 = A2, v4 = A3
    v5 = B0, v6 = -B1, v7 = B2, fp (v8) = B3
  Overwrites a2, a3, ip and lr.
*/
        .macro idct_row4 shift
        ldr     lr, w46                 /* lr = W4 | (W6 << 16) */
        ldr     v7, w57                 /* v7 = W5 | (W7 << 16) */
        mov     a2, #(1<<(\shift-1))    /* a2 = rounding bias */
        smlad   v1, a3, ip, a2          /* v1 = A0 = W4*row[0] + W2*row[2] + bias */
        smlsd   v4, a3, ip, a2          /* v4 = A3 = W4*row[0] - W2*row[2] + bias */
        ldr     ip, w13                 /* ip = W1 | (W3 << 16) */
        smlad   v2, a3, lr, a2          /* v2 = A1 = W4*row[0] + W6*row[2] + bias */
        smlsd   v3, a3, lr, a2          /* v3 = A2 = W4*row[0] - W6*row[2] + bias */
        smusdx  fp, a4, v7              /* fp = B3 = W7*row[1] - W5*row[3] */
        smuad   v5, a4, ip              /* v5 = B0 = W1*row[1] + W3*row[3] */
        pkhtb   a3, ip, v7, asr #16     /* a3 = W7 | (W3 << 16) */
        pkhbt   a2, ip, v7, lsl #16     /* a2 = W1 | (W5 << 16) */
        smusdx  v6, a3, a4              /* v6 = -B1 = W7*row[3] - W3*row[1] */
        smusdx  v7, a4, a2              /* v7 = B2 = W5*row[1] - W1*row[3] */
        .endm

117 | |

/*
  Compute final part of IDCT single row without shift.

  Input in registers v1--v8, as left by idct_row / idct_row4
  (v1..v4 = A0..A3, v5 = B0, v6 = -B1, v7 = B2, fp = B3).

  Output, in result order 0..7:
    ip, v1, v2, v3, v7, v6, v5, lr
  v4 and fp are read but not written.
*/
        .macro idct_finish
        add     ip, v1, v5              /* ip = A0 + B0 */
        sub     lr, v1, v5              /* lr = A0 - B0 */
        sub     v1, v2, v6              /* v1 = A1 + B1  (v6 holds -B1) */
        add     v5, v2, v6              /* v5 = A1 - B1 */
        add     v2, v3, v7              /* v2 = A2 + B2 */
        sub     v6, v3, v7              /* v6 = A2 - B2 */
        add     v3, v4, fp              /* v3 = A3 + B3 */
        sub     v7, v4, fp              /* v7 = A3 - B3 */
        .endm

133 | |

/*
  Compute final part of IDCT single row.

  shift = right-shift amount (arithmetic) applied to every result

  Input in registers v1--v8, as left by idct_row / idct_row4
  (v1..v4 = A0..A3, v5 = B0, v6 = -B1, v7 = B2, fp = B3).

  Output, in result order 0..7:
    v1, v2, v3, v4, fp, v7, v6, v5
  Overwrites a3 and a4.
*/
        .macro idct_finish_shift shift
        add     a4, v1, v5              /* a4 = A0 + B0 */
        sub     a3, v1, v5              /* a3 = A0 - B0 */
        mov     v1, a4, asr #\shift
        mov     v5, a3, asr #\shift

        sub     a4, v2, v6              /* a4 = A1 + B1  (v6 holds -B1) */
        add     a3, v2, v6              /* a3 = A1 - B1 */
        mov     v2, a4, asr #\shift
        mov     v6, a3, asr #\shift

        add     a4, v3, v7              /* a4 = A2 + B2 */
        sub     a3, v3, v7              /* a3 = A2 - B2 */
        mov     v3, a4, asr #\shift
        mov     v7, a3, asr #\shift

        add     a4, v4, fp              /* a4 = A3 + B3 */
        sub     a3, v4, fp              /* a3 = A3 - B3 */
        mov     v4, a4, asr #\shift
        mov     fp, a3, asr #\shift
        .endm

160 | |

/*
  Compute final part of IDCT single row, saturating results at 8 bits.

  shift = right-shift amount (arithmetic) applied before the unsigned
          8-bit saturation

  Input in registers v1--v8, as left by idct_row / idct_row4
  (v1..v4 = A0..A3, v5 = B0, v6 = -B1, v7 = B2, fp = B3).

  Output, in result order 0..7:
    v1, v2, v3, v4, fp, v7, v6, v5
  Overwrites a4 and ip.
*/
        .macro idct_finish_shift_sat shift
        add     a4, v1, v5              /* a4 = A0 + B0 */
        sub     ip, v1, v5              /* ip = A0 - B0 */
        usat    v1, #8, a4, asr #\shift
        usat    v5, #8, ip, asr #\shift

        sub     a4, v2, v6              /* a4 = A1 + B1  (v6 holds -B1) */
        add     ip, v2, v6              /* ip = A1 - B1 */
        usat    v2, #8, a4, asr #\shift
        usat    v6, #8, ip, asr #\shift

        add     a4, v3, v7              /* a4 = A2 + B2 */
        sub     ip, v3, v7              /* ip = A2 - B2 */
        usat    v3, #8, a4, asr #\shift
        usat    v7, #8, ip, asr #\shift

        add     a4, v4, fp              /* a4 = A3 + B3 */
        sub     ip, v4, fp              /* ip = A3 - B3 */
        usat    v4, #8, a4, asr #\shift
        usat    fp, #8, ip, asr #\shift
        .endm

187 | |

/*
  Compute IDCT of single row, storing as column.

  a1 = source (four words: row[2,0], row[6,4], row[3,1], row[7,5])
  a2 = dest   (results are written as halfwords 16 bytes apart)

  a1 and a2 are unchanged on return.  a3, a4, ip are overwritten, and on
  the general path so are v1--v7 and fp.
*/
        .align
        .type idct_row_armv6, %function
        .func idct_row_armv6
idct_row_armv6:
        str     lr, [sp, #-4]!          /* push return address */

        ldr     lr, [a1, #12]           /* lr = row[7,5] */
        ldr     ip, [a1, #4]            /* ip = row[6,4] */
        ldr     a4, [a1, #8]            /* a4 = row[3,1] */
        ldr     a3, [a1]                /* a3 = row[2,0] */
        orrs    lr, lr, ip              /* Z if row[4..7] are all zero */
        cmpeq   lr, a4                  /* ... and row[1], row[3] are zero */
        cmpeq   lr, a3, lsr #16         /* ... and row[2] is zero */
        beq     .Lrow_dc_only           /* only row[0] can be non-zero */
        str     a2, [sp, #-4]!          /* the row macros overwrite a2 */
        ldr     ip, w42                 /* ip = W4 | (W2 << 16) */
        cmp     lr, #0                  /* row[4..7] all zero? */
        beq     .Lrow_half

        idct_row ROW_SHIFT
        b       .Lrow_finish

.Lrow_half:
        idct_row4 ROW_SHIFT

.Lrow_finish:
        ldr     a2, [sp], #4            /* pop dest */
        idct_finish_shift ROW_SHIFT

        /* Results 0..3 go to slots 0,2,4,6 and results 4..7 to slots 1,3,5,7. */
        strh    v1, [a2]
        strh    v2, [a2, #(16*2)]
        strh    v3, [a2, #(16*4)]
        strh    v4, [a2, #(16*6)]
        strh    fp, [a2, #(16*1)]
        strh    v7, [a2, #(16*3)]
        strh    v6, [a2, #(16*5)]
        strh    v5, [a2, #(16*7)]

        ldr     pc, [sp], #4            /* return */

.Lrow_dc_only:
        /* Every output is row[0] << 3.
           NOTE(review): presumably (row[0] << 14) >> ROW_SHIFT, i.e. W4
           treated as 1 << 14 -- confirm. */
        mov     a3, a3, lsl #3
        strh    a3, [a2]
        strh    a3, [a2, #(16*2)]
        strh    a3, [a2, #(16*4)]
        strh    a3, [a2, #(16*6)]
        strh    a3, [a2, #(16*1)]
        strh    a3, [a2, #(16*3)]
        strh    a3, [a2, #(16*5)]
        strh    a3, [a2, #(16*7)]
        ldr     pc, [sp], #4            /* return */
        .endfunc

242 | |

/*
  Compute IDCT of single column, read as row.

  a1 = source (four words: row[2,0], row[6,4], row[3,1], row[7,5])
  a2 = dest   (results 0..7 are written as halfwords 16 bytes apart)

  a1 and a2 are unchanged on return; a3, a4, ip, v1--v7 and fp are
  overwritten.
*/
        .align
        .type idct_col_armv6, %function
        .func idct_col_armv6
idct_col_armv6:
        stmdb   sp!, {a2, lr}           /* idct_row overwrites a2 */

        ldr     a3, [a1]                /* a3 = row[2,0] */
        ldr     ip, w42                 /* ip = W4 | (W2 << 16) */
        ldr     a4, [a1, #8]            /* a4 = row[3,1] */
        idct_row COL_SHIFT
        ldr     a2, [sp], #4            /* pop dest */
        idct_finish_shift COL_SHIFT

        strh    v1, [a2]
        strh    v2, [a2, #(16*1)]
        strh    v3, [a2, #(16*2)]
        strh    v4, [a2, #(16*3)]
        strh    fp, [a2, #(16*4)]
        strh    v7, [a2, #(16*5)]
        strh    v6, [a2, #(16*6)]
        strh    v5, [a2, #(16*7)]

        ldr     pc, [sp], #4            /* return */
        .endfunc

272 | |

/*
  Compute IDCT of single column, read as row, store saturated 8-bit.

  a1 = source (four words: row[2,0], row[6,4], row[3,1], row[7,5])
  a2 = dest   (one byte is written on each of 8 consecutive lines)
  a3 = line size

  a1, a2 and a3 are unchanged on return; a4, ip, v1--v7 and fp are
  overwritten.
*/
        .align
        .type idct_col_put_armv6, %function
        .func idct_col_put_armv6
idct_col_put_armv6:
        stmdb   sp!, {a2, a3, lr}       /* idct_row overwrites a2 and a3 */

        ldr     a3, [a1]                /* a3 = row[2,0] */
        ldr     ip, w42                 /* ip = W4 | (W2 << 16) */
        ldr     a4, [a1, #8]            /* a4 = row[3,1] */
        idct_row COL_SHIFT
        ldmia   sp!, {a2, a3}           /* pop dest and line size */
        idct_finish_shift_sat COL_SHIFT

        /* One byte per line, stepping dest by the line size after each store. */
        strb    v1, [a2], a3
        strb    v2, [a2], a3
        strb    v3, [a2], a3
        strb    v4, [a2], a3
        strb    fp, [a2], a3
        strb    v7, [a2], a3
        strb    v6, [a2], a3
        strb    v5, [a2], a3

        sub     a2, a2, a3, lsl #3      /* rewind dest by the 8 lines stepped */

        ldr     pc, [sp], #4            /* return */
        .endfunc

305 | |

/*
  Compute IDCT of single column, read as row, add/store saturated 8-bit.

  a1 = source (four words: row[2,0], row[6,4], row[3,1], row[7,5])
  a2 = dest   (one byte is read, updated and written on each of 8 lines)
  a3 = line size

  a1, a2 and a3 are unchanged on return; a4, ip, v1--v7 and fp are
  overwritten.

  In the comments below "line k" is the byte at dest + k * line size.

  The original issued an extra "ldrb fp, [a2, a3, lsl #2]" right after the
  first two loads.  Its result was overwritten by the line 5 load before
  being read, and line 4 is loaded into v4 further down, so that load has
  been dropped.
*/
        .align
        .type idct_col_add_armv6, %function
        .func idct_col_add_armv6
idct_col_add_armv6:
        stmdb   sp!, {a2, a3, lr}       /* idct_row overwrites a2 and a3 */

        ldr     a3, [a1]                /* a3 = row[2,0] */
        ldr     ip, w42                 /* ip = W4 | (W2 << 16) */
        ldr     a4, [a1, #8]            /* a4 = row[3,1] */
        idct_row COL_SHIFT
        ldmia   sp!, {a2, a3}           /* pop dest and line size */
        idct_finish                     /* results 0..7 in ip, v1, v2, v3, v7, v6, v5, lr */

        ldrb    a4, [a2]                /* a4 = line 0 */
        ldrb    v4, [a2, a3]            /* v4 = line 1 */
        add     ip, a4, ip, asr #COL_SHIFT
        usat    ip, #8, ip
        add     v1, v4, v1, asr #COL_SHIFT
        strb    ip, [a2], a3            /* store line 0; a2 -> line 1 */
        ldrb    ip, [a2, a3]            /* ip = line 2 */
        usat    v1, #8, v1
        ldrb    fp, [a2, a3, lsl #2]    /* fp = line 5 */
        add     v2, ip, v2, asr #COL_SHIFT
        usat    v2, #8, v2
        strb    v1, [a2], a3            /* store line 1; a2 -> line 2 */
        ldrb    a4, [a2, a3]            /* a4 = line 3 */
        ldrb    ip, [a2, a3, lsl #2]    /* ip = line 6 */
        strb    v2, [a2], a3            /* store line 2; a2 -> line 3 */
        ldrb    v4, [a2, a3]            /* v4 = line 4 */
        ldrb    v1, [a2, a3, lsl #2]    /* v1 = line 7 */
        add     v3, a4, v3, asr #COL_SHIFT
        usat    v3, #8, v3
        add     v7, v4, v7, asr #COL_SHIFT
        usat    v7, #8, v7
        add     v6, fp, v6, asr #COL_SHIFT
        usat    v6, #8, v6
        add     v5, ip, v5, asr #COL_SHIFT
        usat    v5, #8, v5
        add     lr, v1, lr, asr #COL_SHIFT
        usat    lr, #8, lr
        strb    v3, [a2], a3            /* store line 3 */
        strb    v7, [a2], a3            /* store line 4 */
        strb    v6, [a2], a3            /* store line 5 */
        strb    v5, [a2], a3            /* store line 6 */
        strb    lr, [a2], a3            /* store line 7 */

        sub     a2, a2, a3, lsl #3      /* rewind dest by the 8 lines stepped */

        ldr     pc, [sp], #4            /* return */
        .endfunc

363 | |

/*
  Compute 8 IDCT row transforms.

  func  = IDCT row->col function, called with a1 = source, a2 = dest
  width = width of columns in bytes (dest step between calls)

  Source blocks are visited at byte offsets 0, 32, 64, 96, 16, 48, 80, 112
  from the initial a1, while a2 advances by width after every call but
  the last.  On exit a1 is back at its initial value and a2 has advanced
  by 7 * width.  a1 and a2 are not reloaded between calls, so func has to
  leave them as it found them.
*/
        .macro idct_rows func width
        bl      \func
        .rept   3
        add     a1, a1, #(16*2)
        add     a2, a2, #\width
        bl      \func
        .endr
        sub     a1, a1, #(16*5)         /* from offset 96 back to offset 16 */
        add     a2, a2, #\width
        bl      \func
        .rept   3
        add     a1, a1, #(16*2)
        add     a2, a2, #\width
        bl      \func
        .endr

        sub     a1, a1, #(16*7)         /* from offset 112 back to offset 0 */
        .endm

395 | |

/*
  void ff_simple_idct_armv6(DCTELEM *data);

  Runs the row pass from data into a 128-byte scratch area on the stack,
  then the column pass from the scratch area back into data.
*/
        .align
        .global ff_simple_idct_armv6
        .type ff_simple_idct_armv6, %function
        .func ff_simple_idct_armv6
ff_simple_idct_armv6:
        stmdb   sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr}
        sub     sp, sp, #128            /* scratch area */

        mov     a2, sp                  /* row pass: data -> scratch */
        idct_rows idct_row_armv6, 2
        mov     a2, a1                  /* idct_rows left a1 = data */
        mov     a1, sp                  /* column pass: scratch -> data */
        idct_rows idct_col_armv6, 2

        add     sp, sp, #128
        ldmia   sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
        .endfunc

414 | |

/*
  ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data);

  Runs the row pass from data into a 128-byte scratch area on the stack,
  then the column pass, which adds each result to the byte already at
  dest and stores it back saturated to 8 bits.
*/
        .align
        .global ff_simple_idct_add_armv6
        .type ff_simple_idct_add_armv6, %function
        .func ff_simple_idct_add_armv6
ff_simple_idct_add_armv6:
        /* a1 (dest) and a2 (line_size) are pushed only to be reloaded later. */
        stmdb   sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
        sub     sp, sp, #128            /* scratch area */

        mov     a1, a3                  /* row pass: data -> scratch */
        mov     a2, sp
        idct_rows idct_row_armv6, 2
        mov     a1, sp                  /* column pass: scratch -> dest */
        ldr     a2, [sp, #128]          /* a2 = saved dest */
        ldr     a3, [sp, #(128+4)]      /* a3 = saved line_size */
        idct_rows idct_col_add_armv6, 1

        add     sp, sp, #(128+8)        /* drop scratch and the saved a1, a2 */
        ldmia   sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
        .endfunc

435 | |

/*
  ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data);

  Runs the row pass from data into a 128-byte scratch area on the stack,
  then the column pass, which stores each result at dest saturated to
  8 bits.
*/
        .align
        .global ff_simple_idct_put_armv6
        .type ff_simple_idct_put_armv6, %function
        .func ff_simple_idct_put_armv6
ff_simple_idct_put_armv6:
        /* a1 (dest) and a2 (line_size) are pushed only to be reloaded later. */
        stmdb   sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
        sub     sp, sp, #128            /* scratch area */

        mov     a1, a3                  /* row pass: data -> scratch */
        mov     a2, sp
        idct_rows idct_row_armv6, 2
        mov     a1, sp                  /* column pass: scratch -> dest */
        ldr     a2, [sp, #128]          /* a2 = saved dest */
        ldr     a3, [sp, #(128+4)]      /* a3 = saved line_size */
        idct_rows idct_col_put_armv6, 1

        add     sp, sp, #(128+8)        /* drop scratch and the saved a1, a2 */
        ldmia   sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
        .endfunc