/* ffmpeg / libavcodec / arm / simple_idct_armv6.S */

/*
 * Simple IDCT
 *
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 * Copyright (c) 2007 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

/*
 * Fixed-point IDCT weights.  Each Wi follows the formula quoted next to it
 * (i = index in the name), i.e. a cosine scaled by sqrt(2) * (1 << 14).
 * NOTE(review): W4 is 16383 although the quoted formula rounds to 16384;
 * presumably deliberate -- do not "correct" it without checking the
 * reference C implementation.
 */
#define W1  22725   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W2  21407   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W3  19266   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W4  16383   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W5  12873   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W6  8867    /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W7  4520    /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */

/* Rounding/right-shift applied after the row pass and the column pass. */
#define ROW_SHIFT 11
#define COL_SHIFT 20

/*
 * Two weights packed into one word for the dual 16-bit multiplies:
 * first-named weight in the low halfword, second in the high halfword.
 * W42n holds the negated pair; -W4 is masked to 16 bits so its sign
 * extension does not spill into the high halfword.
 */
#define W13  (W1 | (W3 << 16))
#define W26  (W2 | (W6 << 16))
#define W42  (W4 | (W2 << 16))
#define W42n ((-W4 & 0xffff) | (-W2 << 16))
#define W46  (W4 | (W6 << 16))
#define W57  (W5 | (W7 << 16))

/*
 * The packed weights live in .text, next to the code, because the code
 * below fetches them with PC-relative "ldr reg, label" loads.
 */
        .text
        .align
w13:    .long W13
w26:    .long W26
w42:    .long W42
w42n:   .long W42n
w46:    .long W46
w57:    .long W57

/*
  Compute partial IDCT of single row.

  shift = number of bits the result is shifted right by afterwards; it is
          only used here to seed the even-part sums with the rounding
          bias 1 << (shift-1)

  Input:
    r0 = source address (row[6,4] is read from [r0, #4],
         row[7,5] from [r0, #12])
    r2 = row[2,0] <= 2 cycles
    r3 = row[3,1]
    ip = w42      <= 2 cycles
  row[b,a] denotes one 32-bit word with row[a] in the low halfword and
  row[b] in the high halfword.

  Output in registers r4--r11:
    r4 = A0, r5 = A1, r6 = A2, r7 = A3   (rounding bias included)
    r8 = B0, r9 = -B1, r10 = B2, r11 = B3

  Clobbers r1, r2, r3, ip, lr.
*/
        .macro idct_row shift
        ldr    lr, w46               /* lr = W4 | (W6 << 16) */
        mov    r1, #(1<<(\shift-1))  /* r1 = rounding bias */
        smlad  r4, r2, ip, r1        /* A0 = W4*row[0] + W2*row[2] + bias */
        smlsd  r7, r2, ip, r1        /* A3 = W4*row[0] - W2*row[2] + bias */
        ldr    ip, w13               /* ip = W1 | (W3 << 16) */
        ldr    r10,w57               /* r10 = W5 | (W7 << 16) */
        smlad  r5, r2, lr, r1        /* A1 = W4*row[0] + W6*row[2] + bias */
        smlsd  r6, r2, lr, r1        /* A2 = W4*row[0] - W6*row[2] + bias */

        smuad  r8, r3, ip            /* r8  =  B0 = W1*row[1] + W3*row[3] */
        smusdx r11,r3, r10           /* r11 =  B3 = W7*row[1] - W5*row[3] */
        ldr    lr, [r0, #12]         /* lr  =  row[7,5] */
        pkhtb  r2, ip, r10,asr #16   /* r2  =  W7 | (W3 << 16) */
        pkhbt  r1, ip, r10,lsl #16   /* r1  =  W1 | (W5 << 16) */
        smusdx r9, r2, r3            /* r9  = -B1 = W7*row[3] - W3*row[1] */
        smlad  r8, lr, r10,r8        /* B0  +=      W5*row[5] + W7*row[7] */
        smusdx r10,r3, r1            /* r10 =  B2 = W5*row[1] - W1*row[3] */

        ldr    r3, w42n              /* r3 =  -W4 | (-W2 << 16) */
        smlad  r10,lr, r2, r10       /* B2 +=  W7*row[5] + W3*row[7] */
        ldr    r2, [r0, #4]          /* r2 =   row[6,4] */
        smlsdx r11,lr, ip, r11       /* B3 +=  W3*row[5] - W1*row[7] */
        ldr    ip, w46               /* ip =   W4 | (W6 << 16) */
        smlad  r9, lr, r1, r9        /* B1 -=  W1*row[5] + W5*row[7] */

        smlad  r5, r2, r3, r5        /* A1 += -W4*row[4] - W2*row[6] */
        smlsd  r6, r2, r3, r6        /* A2 += -W4*row[4] + W2*row[6] */
        smlad  r4, r2, ip, r4        /* A0 +=  W4*row[4] + W6*row[6] */
        smlsd  r7, r2, ip, r7        /* A3 +=  W4*row[4] - W6*row[6] */
        .endm

/*
  Compute partial IDCT of half row.

  Same as idct_row, but only the terms in row[0..3] are evaluated; nothing
  is read from memory, so it is only valid when row[4..7] are all zero.

  shift = number of bits the result is shifted right by afterwards; it is
          only used here to seed the even-part sums with the rounding
          bias 1 << (shift-1)

  Input:
    r2 = row[2,0]
    r3 = row[3,1]
    ip = w42

  Output in registers r4--r11:
    r4 = A0, r5 = A1, r6 = A2, r7 = A3   (rounding bias included)
    r8 = B0, r9 = -B1, r10 = B2, r11 = B3

  Clobbers r1, r2, ip, lr.
*/
        .macro idct_row4 shift
        ldr    lr, w46               /* lr = W4 | (W6 << 16) */
        ldr    r10,w57               /* r10 = W5 | (W7 << 16) */
        mov    r1, #(1<<(\shift-1))  /* r1 = rounding bias */
        smlad  r4, r2, ip, r1        /* A0 = W4*row[0] + W2*row[2] + bias */
        smlsd  r7, r2, ip, r1        /* A3 = W4*row[0] - W2*row[2] + bias */
        ldr    ip, w13               /* ip = W1 | (W3 << 16) */
        smlad  r5, r2, lr, r1        /* A1 = W4*row[0] + W6*row[2] + bias */
        smlsd  r6, r2, lr, r1        /* A2 = W4*row[0] - W6*row[2] + bias */
        smusdx r11,r3, r10           /* r11 =  B3 = W7*row[1] - W5*row[3] */
        smuad  r8, r3, ip            /* r8  =  B0 = W1*row[1] + W3*row[3] */
        pkhtb  r2, ip, r10,asr #16   /* r2  =  W7 | (W3 << 16) */
        pkhbt  r1, ip, r10,lsl #16   /* r1  =  W1 | (W5 << 16) */
        smusdx r9, r2, r3            /* r9  = -B1 = W7*row[3] - W3*row[1] */
        smusdx r10,r3, r1            /* r10 =  B2 = W5*row[1] - W1*row[3] */
        .endm

/*
  Compute final part of IDCT single row without shift.

  Input in registers r4--r11 as left by idct_row / idct_row4:
    r4 = A0, r5 = A1, r6 = A2, r7 = A3
    r8 = B0, r9 = -B1, r10 = B2, r11 = B3

  Output in registers ip, r4--r6, lr, r8--r10; in output order 0..7:
    ip, r4, r5, r6, r10, r9, r8, lr

  Each source register is consumed before it is overwritten, so the
  instruction order below must be kept.

  NOTE(review): the first three instructions, the last one and the .endm
  were absent from the copy under review and were restored from the
  output list above and from how idct_col_add_armv6 consumes the
  registers; confirm against upstream.
*/
        .macro idct_finish
        add    ip, r4, r8            /* ip  = A0 + B0 */
        sub    lr, r4, r8            /* lr  = A0 - B0 */
        sub    r4, r5, r9            /* r4  = A1 + B1 (r9 holds -B1) */
        add    r8, r5, r9            /* r8  = A1 - B1 */
        add    r5, r6, r10           /* r5  = A2 + B2 */
        sub    r9, r6, r10           /* r9  = A2 - B2 */
        add    r6, r7, r11           /* r6  = A3 + B3 */
        sub    r10,r7, r11           /* r10 = A3 - B3 */
        .endm

/*
  Compute final part of IDCT single row.
  shift = right-shift amount

  Input in registers r4--r11 as left by idct_row / idct_row4:
    r4 = A0, r5 = A1, r6 = A2, r7 = A3
    r8 = B0, r9 = -B1, r10 = B2, r11 = B3

  Output in registers r4--r11, each arithmetically shifted right by
  "shift"; in output order 0..7: r4, r5, r6, r7, r11, r10, r9, r8

  Clobbers r2, r3.

  NOTE(review): the comment terminator, the .macro line, the first group,
  the "mov r5/r9/r10" lines and the .endm were absent from the copy under
  review and were restored following the pattern of the surviving groups;
  confirm against upstream.
*/
        .macro idct_finish_shift shift
        add    r3, r4, r8            /* r3 = A0 + B0 */
        sub    r2, r4, r8            /* r2 = A0 - B0 */
        mov    r4, r3, asr #\shift
        mov    r8, r2, asr #\shift

        sub    r3, r5, r9            /* r3 = A1 + B1 (r9 holds -B1) */
        add    r2, r5, r9            /* r2 = A1 - B1 */
        mov    r5, r3, asr #\shift
        mov    r9, r2, asr #\shift

        add    r3, r6, r10           /* r3 = A2 + B2 */
        sub    r2, r6, r10           /* r2 = A2 - B2 */
        mov    r6, r3, asr #\shift
        mov    r10,r2, asr #\shift

        add    r3, r7, r11           /* r3 = A3 + B3 */
        sub    r2, r7, r11           /* r2 = A3 - B3 */
        mov    r7, r3, asr #\shift
        mov    r11,r2, asr #\shift
        .endm

/*
  Compute final part of IDCT single row, saturating results at 8 bits.
  shift = right-shift amount

  Input/output in registers r4--r11.
  Input is as left by idct_row / idct_row4:
    r4 = A0, r5 = A1, r6 = A2, r7 = A3
    r8 = B0, r9 = -B1, r10 = B2, r11 = B3
  Each output is the sum/difference shifted right by "shift" and then
  saturated to the unsigned range 0..255; in output order 0..7:
    r4, r5, r6, r7, r11, r10, r9, r8

  Clobbers r3, ip.

  NOTE(review): the A1 and A3 groups and the .endm were absent from the
  copy under review and were restored following the pattern of the
  surviving A0/A2 groups; confirm against upstream.
*/
        .macro idct_finish_shift_sat shift
        add    r3, r4, r8            /* r3 = A0 + B0 */
        sub    ip, r4, r8            /* ip = A0 - B0 */
        usat   r4, #8, r3, asr #\shift
        usat   r8, #8, ip, asr #\shift

        sub    r3, r5, r9            /* r3 = A1 + B1 (r9 holds -B1) */
        add    ip, r5, r9            /* ip = A1 - B1 */
        usat   r5, #8, r3, asr #\shift
        usat   r9, #8, ip, asr #\shift

        add    r3, r6, r10           /* r3 = A2 + B2 */
        sub    ip, r6, r10           /* ip = A2 - B2 */
        usat   r6, #8, r3, asr #\shift
        usat   r10,#8, ip, asr #\shift

        add    r3, r7, r11           /* r3 = A3 + B3 */
        sub    ip, r7, r11           /* ip = A3 - B3 */
        usat   r7, #8, r3, asr #\shift
        usat   r11,#8, ip, asr #\shift
        .endm

/*
  Compute IDCT of single row, storing as column.
  r0 = source
  r1 = dest

  Three paths:
    - only row[0] non-zero: every output is row[0] << 3 (label 1)
    - row[4..7] all zero:   idct_row4                   (label 2)
    - otherwise:            full idct_row
  The eight 16-bit results are written 16 bytes apart, to byte offsets
  0, 32, 64, 96, 16, 48, 80, 112 from r1 for outputs 0..7.

  r0 and r1 are preserved; r2, r3, ip and r4--r11 are clobbered.

  NOTE(review): the loads of row[7,5], row[6,4], row[3,1], the all-zero
  test, "push {r1}", "ldr ip, w42" and the "2:" line were absent from the
  copy under review.  They were restored from what the surviving code
  requires (labels 1/2/3, "pop {r1}", the idct_row input contract);
  confirm the exact test sequence against upstream.
*/
function idct_row_armv6
        push   {lr}

        ldr    lr, [r0, #12]         /* lr = row[7,5] */
        ldr    ip, [r0, #4]          /* ip = row[6,4] */
        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
        ldr    r2, [r0]              /* r2 = row[2,0] */
        orrs   lr, lr, ip            /* lr = 0 iff row[4..7] are all zero */
        cmpeq  lr, r3                /* ... and row[1], row[3] are zero */
        cmpeq  lr, r2, lsr #16       /* ... and row[2] is zero */
        beq    1f                    /* only row[0] set: constant output */
        push   {r1}                  /* idct_row/idct_row4 clobber r1 */
        ldr    ip, w42               /* ip = W4 | (W2 << 16) */
        cmp    lr, #0
        beq    2f                    /* row[4..7] zero: half-row variant */

        idct_row   ROW_SHIFT
        b      3f

2:      idct_row4  ROW_SHIFT

3:      pop    {r1}
        idct_finish_shift ROW_SHIFT

        strh   r4, [r1]
        strh   r5, [r1, #(16*2)]
        strh   r6, [r1, #(16*4)]
        strh   r7, [r1, #(16*6)]
        strh   r11,[r1, #(16*1)]
        strh   r10,[r1, #(16*3)]
        strh   r9, [r1, #(16*5)]
        strh   r8, [r1, #(16*7)]

        pop    {pc}

        /* Only row[0] is non-zero: strh keeps the low halfword of r2,
           i.e. row[0] << 3, and that value is stored to all eight slots. */
1:      mov    r2, r2, lsl #3
        strh   r2, [r1]
        strh   r2, [r1, #(16*2)]
        strh   r2, [r1, #(16*4)]
        strh   r2, [r1, #(16*6)]
        strh   r2, [r1, #(16*1)]
        strh   r2, [r1, #(16*3)]
        strh   r2, [r1, #(16*5)]
        strh   r2, [r1, #(16*7)]
        pop    {pc}
endfunc

/*
  Compute IDCT of single column, read as row.
  r0 = source
  r1 = dest

  Runs the full idct_row with COL_SHIFT rounding and writes the eight
  16-bit results 16 bytes apart, outputs 0..7 at byte offsets
  0, 16, ..., 112 from r1.

  r0 and r1 are preserved; r2, r3, ip and r4--r11 are clobbered.
*/
function idct_col_armv6
        push   {r1, lr}              /* idct_row clobbers r1 */

        ldr    r2, [r0]              /* r2 = row[2,0] */
        ldr    ip, w42               /* ip = W4 | (W2 << 16) */
        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
        idct_row COL_SHIFT
        pop    {r1}
        idct_finish_shift COL_SHIFT

        strh   r4, [r1]
        strh   r5, [r1, #(16*1)]
        strh   r6, [r1, #(16*2)]
        strh   r7, [r1, #(16*3)]
        strh   r11,[r1, #(16*4)]
        strh   r10,[r1, #(16*5)]
        strh   r9, [r1, #(16*6)]
        strh   r8, [r1, #(16*7)]

        pop    {pc}
endfunc

/*
  Compute IDCT of single column, read as row, store saturated 8-bit.
  r0 = source
  r1 = dest
  r2 = line size

  The eight results are saturated to 0..255 and stored as bytes, one per
  line, starting at r1 and stepping by r2.

  r0, r1 and r2 are preserved (r1 is rewound after the stores);
  r3, ip and r4--r11 are clobbered.
*/
function idct_col_put_armv6
        push   {r1, r2, lr}          /* idct_row clobbers r1 and r2 */

        ldr    r2, [r0]              /* r2 = row[2,0] */
        ldr    ip, w42               /* ip = W4 | (W2 << 16) */
        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
        idct_row COL_SHIFT
        pop    {r1, r2}
        idct_finish_shift_sat COL_SHIFT

        strb   r4, [r1], r2
        strb   r5, [r1], r2
        strb   r6, [r1], r2
        strb   r7, [r1], r2
        strb   r11,[r1], r2
        strb   r10,[r1], r2
        strb   r9, [r1], r2
        strb   r8, [r1], r2

        sub    r1, r1, r2, lsl #3    /* undo the 8 post-increments */

        pop    {pc}
endfunc

/*
  Compute IDCT of single column, read as row, add/store saturated 8-bit.
  r0 = source
  r1 = dest
  r2 = line size

  For each of the eight lines: load the destination byte, add the IDCT
  result shifted right by COL_SHIFT, saturate to 0..255 and store it back.
  Loads, arithmetic and stores for different lines are interleaved; r1 is
  post-incremented by the stores, so every [r1, r2] / [r1, r2, lsl #2]
  offset is relative to the line r1 currently points at.  Do not reorder.

  idct_finish leaves outputs 0..7 in ip, r4, r5, r6, r10, r9, r8, lr.

  r0, r1 and r2 are preserved (r1 is rewound after the stores);
  r3, ip and r4--r11 are clobbered.
*/
function idct_col_add_armv6
        push   {r1, r2, lr}          /* idct_row clobbers r1 and r2 */

        ldr    r2, [r0]              /* r2 = row[2,0] */
        ldr    ip, w42               /* ip = W4 | (W2 << 16) */
        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
        idct_row COL_SHIFT
        pop    {r1, r2}
        idct_finish

        ldrb   r3, [r1]              /* r3  = dest line 0 */
        ldrb   r7, [r1, r2]          /* r7  = dest line 1 */
        /* This r11 is overwritten by the reload below before it is used.
           NOTE(review): presumably left in on purpose; confirm before
           removing. */
        ldrb   r11,[r1, r2, lsl #2]
        add    ip, r3, ip, asr #COL_SHIFT
        usat   ip, #8, ip
        add    r4, r7, r4, asr #COL_SHIFT
        strb   ip, [r1], r2          /* line 0 done, r1 -> line 1 */
        ldrb   ip, [r1, r2]          /* ip  = dest line 2 */
        usat   r4, #8, r4
        ldrb   r11,[r1, r2, lsl #2]  /* r11 = dest line 5 */
        add    r5, ip, r5, asr #COL_SHIFT
        usat   r5, #8, r5
        strb   r4, [r1], r2          /* line 1 done, r1 -> line 2 */
        ldrb   r3, [r1, r2]          /* r3  = dest line 3 */
        ldrb   ip, [r1, r2, lsl #2]  /* ip  = dest line 6 */
        strb   r5, [r1], r2          /* line 2 done, r1 -> line 3 */
        ldrb   r7, [r1, r2]          /* r7  = dest line 4 */
        ldrb   r4, [r1, r2, lsl #2]  /* r4  = dest line 7 */
        add    r6, r3, r6, asr #COL_SHIFT
        usat   r6, #8, r6
        add    r10,r7, r10,asr #COL_SHIFT
        usat   r10,#8, r10
        add    r9, r11,r9, asr #COL_SHIFT
        usat   r9, #8, r9
        add    r8, ip, r8, asr #COL_SHIFT
        usat   r8, #8, r8
        add    lr, r4, lr, asr #COL_SHIFT
        usat   lr, #8, lr
        strb   r6, [r1], r2          /* lines 3..7 */
        strb   r10,[r1], r2
        strb   r9, [r1], r2
        strb   r8, [r1], r2
        strb   lr, [r1], r2

        sub    r1, r1, r2, lsl #3    /* undo the 8 post-increments */

        pop    {pc}
endfunc

/*
  Compute 8 IDCT row transforms.
  func  = IDCT row->col function
  width = width of columns in bytes

  Calls func eight times.  r0 visits the source rows at byte offsets
  0, 32, 64, 96, 16, 48, 80, 112 (rows 0,2,4,6,1,3,5,7 of 16 bytes each)
  while r1 advances by "width" after each of the first seven calls.

  On exit r0 is back at its initial value and r1 = initial r1 + 7*width.
  lr is clobbered by the calls, plus whatever func clobbers.
*/
        .macro idct_rows func width
        bl     \func
        add    r0, r0, #(16*2)
        add    r1, r1, #\width
        bl     \func
        add    r0, r0, #(16*2)
        add    r1, r1, #\width
        bl     \func
        add    r0, r0, #(16*2)
        add    r1, r1, #\width
        bl     \func
        sub    r0, r0, #(16*5)       /* from row 6 back to row 1 */
        add    r1, r1, #\width
        bl     \func
        add    r0, r0, #(16*2)
        add    r1, r1, #\width
        bl     \func
        add    r0, r0, #(16*2)
        add    r1, r1, #\width
        bl     \func
        add    r0, r0, #(16*2)
        add    r1, r1, #\width
        bl     \func

        sub    r0, r0, #(16*7)       /* from row 7 back to row 0 */
        .endm

/*
  void ff_simple_idct_armv6(DCTELEM *data);

  In-place IDCT of the block at r0.
  Pass 1 (idct_row_armv6) transforms the rows of data into a 128-byte
  temporary on the stack; pass 2 (idct_col_armv6) transforms that
  temporary and writes the 16-bit results back to data.
*/
function ff_simple_idct_armv6, export=1
        push   {r4-r11, lr}
        sub    sp, sp, #128          /* 128-byte temporary block */

        mov    r1, sp                /* pass 1: data -> temporary */
        idct_rows idct_row_armv6, 2
        mov    r1, r0                /* pass 2: temporary -> data */
        mov    r0, sp
        idct_rows idct_col_armv6, 2

        add    sp, sp, #128
        pop    {r4-r11, pc}
endfunc

/*
  ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data);

  r0 = dest, r1 = line_size, r2 = data.
  Pass 1 (idct_row_armv6) transforms the rows of data into a 128-byte
  temporary on the stack; pass 2 (idct_col_add_armv6) adds the result to
  the bytes at dest with saturation to 0..255, one byte column per call.
*/
function ff_simple_idct_add_armv6, export=1
        push   {r0, r1, r4-r11, lr}  /* dest, line_size kept for pass 2 */
        sub    sp, sp, #128          /* 128-byte temporary block */

        mov    r0, r2                /* pass 1: data -> temporary */
        mov    r1, sp
        idct_rows idct_row_armv6, 2
        mov    r0, sp                /* pass 2: temporary -> dest */
        ldr    r1, [sp, #128]        /* r1 = saved dest */
        ldr    r2, [sp, #(128+4)]    /* r2 = saved line_size */
        idct_rows idct_col_add_armv6, 1

        add    sp, sp, #(128+8)      /* drop temporary and saved r0, r1 */
        pop    {r4-r11, pc}
endfunc

/*
  ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data);

  r0 = dest, r1 = line_size, r2 = data.
  Pass 1 (idct_row_armv6) transforms the rows of data into a 128-byte
  temporary on the stack; pass 2 (idct_col_put_armv6) stores the result,
  saturated to 0..255, to the bytes at dest, one byte column per call.
*/
function ff_simple_idct_put_armv6, export=1
        push   {r0, r1, r4-r11, lr}  /* dest, line_size kept for pass 2 */
        sub    sp, sp, #128          /* 128-byte temporary block */

        mov    r0, r2                /* pass 1: data -> temporary */
        mov    r1, sp
        idct_rows idct_row_armv6, 2
        mov    r0, sp                /* pass 2: temporary -> dest */
        ldr    r1, [sp, #128]        /* r1 = saved dest */
        ldr    r2, [sp, #(128+4)]    /* r2 = saved line_size */
        idct_rows idct_col_put_armv6, 1

        add    sp, sp, #(128+8)      /* drop temporary and saved r0, r1 */
        pop    {r4-r11, pc}
endfunc