/* Source: ffmpeg / libavcodec / arm / simple_idct_armv6.S @ a2fc0f6a */

/*
 * Simple IDCT
 *
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 * Copyright (c) 2007 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

    
#include "asm.S"

/*
 * 14-bit fixed-point IDCT coefficients:
 *     Wi = cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 * W4 is the exception: the formula yields 16384, but 16383 is used here.
 * NOTE(review): presumably deliberate, to match the C reference IDCT -- confirm.
 */
#define W1  22725   /* i = 1 */
#define W2  21407   /* i = 2 */
#define W3  19266   /* i = 3 */
#define W4  16383   /* i = 4, see note above */
#define W5  12873   /* i = 5 */
#define W6  8867    /* i = 6 */
#define W7  4520    /* i = 7 */
#define ROW_SHIFT 11    /* right shift applied to the row-pass results */
#define COL_SHIFT 20    /* right shift applied to the column-pass results */

/*
 * Coefficient pairs packed as (low halfword | high halfword << 16) so that
 * one register feeds both halves of a dual 16-bit multiply (smlad & co).
 * W42n holds the negated W4/W2 pair; the low half is masked to 16 bits so
 * its sign extension does not leak into the high half.
 */
#define W13 (W1 | (W3 << 16))
#define W26 (W2 | (W6 << 16))
#define W42 (W4 | (W2 << 16))
#define W42n ((-W4 & 0xffff) | (-W2 << 16))
#define W46 (W4 | (W6 << 16))
#define W57 (W5 | (W7 << 16))

    
43
        .text
44
        .align
45
w13:    .long W13
46
w26:    .long W26
47
w42:    .long W42
48
w42n:   .long W42n
49
w46:    .long W46
50
w57:    .long W57
51

    
/*
  Compute partial IDCT of single row.
  shift = right-shift amount that the following idct_finish* step applies;
          only used here to seed the even sums with the rounding bias
          1 << (shift-1)
  a1 = source address
  a3 = row[2,0] <= 2 cycles
  a4 = row[3,1]
  ip = w42      <= 2 cycles

  The source is read as four packed words: [a1] = row[2,0],
  [a1, #4] = row[6,4], [a1, #8] = row[3,1], [a1, #12] = row[7,5]
  (first index in the high halfword).

  Output in registers v1--v8 (v8 is written as fp below):
    v1 = A0, v2 = A1, v3 = A2, v4 = A3   (even part, rounding bias included)
    v5 = B0, v6 = -B1, v7 = B2, fp = B3  (odd part)
  Clobbers a2, a3, a4, ip, lr.  a1 is not modified.
*/
        .macro idct_row shift
        ldr    lr, [pc, #(w46-.-8)]  /* lr = W4 | (W6 << 16) */
        mov    a2, #(1<<(\shift-1))  /* a2 = rounding bias */
        smlad  v1, a3, ip, a2        /* v1 = A0 = W4*row[0] + W2*row[2] + bias */
        smlsd  v4, a3, ip, a2        /* v4 = A3 = W4*row[0] - W2*row[2] + bias */
        ldr    ip, [pc, #(w13-.-8)]  /* ip = W1 | (W3 << 16) */
        ldr    v7, [pc, #(w57-.-8)]  /* v7 = W5 | (W7 << 16) */
        smlad  v2, a3, lr, a2        /* v2 = A1 = W4*row[0] + W6*row[2] + bias */
        smlsd  v3, a3, lr, a2        /* v3 = A2 = W4*row[0] - W6*row[2] + bias */

        smuad  v5, a4, ip            /* v5 = B0 = W1*row[1] + W3*row[3] */
        smusdx fp, a4, v7            /* fp = B3 = W7*row[1] - W5*row[3] */
        ldr    lr, [a1, #12]         /* lr = row[7,5] */
        pkhtb  a3, ip, v7, asr #16   /* a3 = W7 | (W3 << 16) */
        pkhbt  a2, ip, v7, lsl #16   /* a2 = W1 | (W5 << 16) */
        smusdx v6, a3, a4            /* v6 = -B1 = W7*row[3] - W3*row[1] */
        smlad  v5, lr, v7, v5        /* B0 += W5*row[5] + W7*row[7] */
        smusdx v7, a4, a2            /* v7 = B2 = W5*row[1] - W1*row[3] */

        ldr    a4, [pc, #(w42n-.-8)] /* a4 = -W4 | (-W2 << 16) */
        smlad  v7, lr, a3, v7        /* B2 += W7*row[5] + W3*row[7] */
        ldr    a3, [a1, #4]          /* a3 = row[6,4] */
        smlsdx fp, lr, ip, fp        /* B3 += W3*row[5] - W1*row[7] */
        ldr    ip, [pc, #(w46-.-8)]  /* ip = W4 | (W6 << 16) */
        smlad  v6, lr, a2, v6        /* B1 -= W1*row[5] + W5*row[7] */

        smlad  v2, a3, a4, v2        /* A1 += -W4*row[4] - W2*row[6] */
        smlsd  v3, a3, a4, v3        /* A2 += -W4*row[4] + W2*row[6] */
        smlad  v1, a3, ip, v1        /* A0 += W4*row[4] + W6*row[6] */
        smlsd  v4, a3, ip, v4        /* A3 += W4*row[4] - W6*row[6] */
        .endm

    
/*
  Compute partial IDCT of half row: only row[0..3] contribute, so this is
  the variant to use when row[4..7] are all zero.
  shift = right-shift amount that the following idct_finish* step applies;
          only used here to seed the even sums with the rounding bias
          1 << (shift-1)
  a3 = row[2,0]
  a4 = row[3,1]
  ip = w42

  Output in registers v1--v8 (v8 is written as fp below):
    v1 = A0, v2 = A1, v3 = A2, v4 = A3   (even part, rounding bias included)
    v5 = B0, v6 = -B1, v7 = B2, fp = B3  (odd part)
  Clobbers a2, a3, ip, lr.  a1 and a4 are not modified.
*/
        .macro idct_row4 shift
        ldr    lr, [pc, #(w46-.-8)]  /* lr = W4 | (W6 << 16) */
        ldr    v7, [pc, #(w57-.-8)]  /* v7 = W5 | (W7 << 16) */
        mov    a2, #(1<<(\shift-1))  /* a2 = rounding bias */
        smlad  v1, a3, ip, a2        /* v1 = A0 = W4*row[0] + W2*row[2] + bias */
        smlsd  v4, a3, ip, a2        /* v4 = A3 = W4*row[0] - W2*row[2] + bias */
        ldr    ip, [pc, #(w13-.-8)]  /* ip = W1 | (W3 << 16) */
        smlad  v2, a3, lr, a2        /* v2 = A1 = W4*row[0] + W6*row[2] + bias */
        smlsd  v3, a3, lr, a2        /* v3 = A2 = W4*row[0] - W6*row[2] + bias */
        smusdx fp, a4, v7            /* fp = B3 = W7*row[1] - W5*row[3] */
        smuad  v5, a4, ip            /* v5 = B0 = W1*row[1] + W3*row[3] */
        pkhtb  a3, ip, v7, asr #16   /* a3 = W7 | (W3 << 16) */
        pkhbt  a2, ip, v7, lsl #16   /* a2 = W1 | (W5 << 16) */
        smusdx v6, a3, a4            /* v6 = -B1 = W7*row[3] - W3*row[1] */
        smusdx v7, a4, a2            /* v7 = B2 = W5*row[1] - W1*row[3] */
        .endm

    
/*
  Compute final part of IDCT single row without shift.
  Input in registers v1--v8:
    v1 = A0, v2 = A1, v3 = A2, v4 = A3, v5 = B0, v6 = -B1, v7 = B2, fp = B3
  Output in registers ip, v1--v3, lr, v5--v7.  In output order 0..7 the
  results are held in: ip, v1, v2, v3, v7, v6, v5, lr.
  v4 and fp are read but not written.
*/
        .macro idct_finish
        add    ip, v1, v5            /* ip = A0 + B0 */
        sub    lr, v1, v5            /* lr = A0 - B0 */
        sub    v1, v2, v6            /* v1 = A1 + B1 (v6 holds -B1) */
        add    v5, v2, v6            /* v5 = A1 - B1 */
        add    v2, v3, v7            /* v2 = A2 + B2 */
        sub    v6, v3, v7            /* v6 = A2 - B2 */
        add    v3, v4, fp            /* v3 = A3 + B3 */
        sub    v7, v4, fp            /* v7 = A3 - B3 */
        .endm

    
/*
  Compute final part of IDCT single row.
  shift = right-shift amount (arithmetic)
  Input/output in registers v1--v8 (v8 is written as fp below).
  Input:  v1 = A0, v2 = A1, v3 = A2, v4 = A3,
          v5 = B0, v6 = -B1, v7 = B2, fp = B3
  Output, in order 0..7: v1, v2, v3, v4, fp, v7, v6, v5
  Clobbers a3, a4.
*/
        .macro idct_finish_shift shift
        add    a4, v1, v5            /* a4 = A0 + B0 */
        sub    a3, v1, v5            /* a3 = A0 - B0 */
        mov    v1, a4, asr #\shift
        mov    v5, a3, asr #\shift

        sub    a4, v2, v6            /* a4 = A1 + B1 (v6 holds -B1) */
        add    a3, v2, v6            /* a3 = A1 - B1 */
        mov    v2, a4, asr #\shift
        mov    v6, a3, asr #\shift

        add    a4, v3, v7            /* a4 = A2 + B2 */
        sub    a3, v3, v7            /* a3 = A2 - B2 */
        mov    v3, a4, asr #\shift
        mov    v7, a3, asr #\shift

        add    a4, v4, fp            /* a4 = A3 + B3 */
        sub    a3, v4, fp            /* a3 = A3 - B3 */
        mov    v4, a4, asr #\shift
        mov    fp, a3, asr #\shift
        .endm

    
/*
  Compute final part of IDCT single row, saturating results at 8 bits.
  Each sum is shifted right arithmetically and then clamped to the
  unsigned 8-bit range by usat.
  shift = right-shift amount
  Input/output in registers v1--v8 (v8 is written as fp below).
  Input:  v1 = A0, v2 = A1, v3 = A2, v4 = A3,
          v5 = B0, v6 = -B1, v7 = B2, fp = B3
  Output, in order 0..7: v1, v2, v3, v4, fp, v7, v6, v5
  Clobbers a4, ip.
*/
        .macro idct_finish_shift_sat shift
        add    a4, v1, v5            /* a4 = A0 + B0 */
        sub    ip, v1, v5            /* ip = A0 - B0 */
        usat   v1, #8, a4, asr #\shift
        usat   v5, #8, ip, asr #\shift

        sub    a4, v2, v6            /* a4 = A1 + B1 (v6 holds -B1) */
        add    ip, v2, v6            /* ip = A1 - B1 */
        usat   v2, #8, a4, asr #\shift
        usat   v6, #8, ip, asr #\shift

        add    a4, v3, v7            /* a4 = A2 + B2 */
        sub    ip, v3, v7            /* ip = A2 - B2 */
        usat   v3, #8, a4, asr #\shift
        usat   v7, #8, ip, asr #\shift

        add    a4, v4, fp            /* a4 = A3 + B3 */
        sub    ip, v4, fp            /* ip = A3 - B3 */
        usat   v4, #8, a4, asr #\shift
        usat   fp, #8, ip, asr #\shift
        .endm

    
/*
  Compute IDCT of single row, storing as column.
  a1 = source (four packed words: row[2,0], row[6,4], row[3,1], row[7,5])
  a2 = dest   (eight halfwords written with a 16-byte stride)

  Results 0..7 are stored at byte offsets 0, 32, 64, 96, 16, 48, 80, 112
  from a2, i.e. in the same 0,2,4,6,1,3,5,7 ordering the source uses.
  a1 and a2 are unchanged on return.
  Clobbers a3, a4, ip, v1--v7, fp without saving them; the caller is
  responsible for preserving the callee-saved ones.
*/
function idct_row_armv6
        str    lr, [sp, #-4]!        /* push return address */

        ldr    lr, [a1, #12]         /* lr = row[7,5] */
        ldr    ip, [a1, #4]          /* ip = row[6,4] */
        ldr    a4, [a1, #8]          /* a4 = row[3,1] */
        ldr    a3, [a1]              /* a3 = row[2,0] */
        orrs   lr, lr, ip            /* Z set iff row[4..7] are all zero */
        cmpeq  lr, a4                /* ... and row[1], row[3] are zero */
        cmpeq  lr, a3, lsr #16       /* ... and row[2] is zero */
        beq    1f                    /* only row[0] non-zero: DC shortcut */
        str    a2, [sp, #-4]!        /* idct_row/idct_row4 clobber a2 */
        ldr    ip, [pc, #(w42-.-8)]  /* ip = W4 | (W2 << 16) */
        cmp    lr, #0                /* row[4..7] all zero? */
        beq    2f                    /* yes: the half-row variant suffices */

        idct_row   ROW_SHIFT
        b      3f

2:      idct_row4  ROW_SHIFT

3:      ldr    a2, [sp], #4          /* restore dest pointer */
        idct_finish_shift ROW_SHIFT

        strh   v1, [a2]              /* result 0 */
        strh   v2, [a2, #(16*2)]     /* result 1 */
        strh   v3, [a2, #(16*4)]     /* result 2 */
        strh   v4, [a2, #(16*6)]     /* result 3 */
        strh   fp, [a2, #(16*1)]     /* result 4 */
        strh   v7, [a2, #(16*3)]     /* result 5 */
        strh   v6, [a2, #(16*5)]     /* result 6 */
        strh   v5, [a2, #(16*7)]     /* result 7 */

        ldr    pc, [sp], #4          /* return */

        /* DC-only row: every output is row[0] << 3 (strh keeps the low half) */
1:      mov    a3, a3, lsl #3
        strh   a3, [a2]
        strh   a3, [a2, #(16*2)]
        strh   a3, [a2, #(16*4)]
        strh   a3, [a2, #(16*6)]
        strh   a3, [a2, #(16*1)]
        strh   a3, [a2, #(16*3)]
        strh   a3, [a2, #(16*5)]
        strh   a3, [a2, #(16*7)]
        ldr    pc, [sp], #4          /* return */
        .endfunc

    
/*
  Compute IDCT of single column, read as row.
  a1 = source (four packed words, same layout idct_row expects)
  a2 = dest   (eight halfwords written at byte offsets 0, 16, ..., 112)

  a1 and a2 are unchanged on return.
  Clobbers a3, a4, ip, v1--v7, fp without saving them; the caller is
  responsible for preserving the callee-saved ones.
*/
function idct_col_armv6
        stmfd  sp!, {a2, lr}         /* idct_row clobbers a2 */

        ldr    a3, [a1]              /* a3 = row[2,0] */
        ldr    ip, [pc, #(w42-.-8)]  /* ip = W4 | (W2 << 16) */
        ldr    a4, [a1, #8]          /* a4 = row[3,1] */
        idct_row COL_SHIFT
        ldr    a2, [sp], #4          /* restore dest pointer */
        idct_finish_shift COL_SHIFT

        strh   v1, [a2]              /* result 0 */
        strh   v2, [a2, #(16*1)]     /* result 1 */
        strh   v3, [a2, #(16*2)]     /* result 2 */
        strh   v4, [a2, #(16*3)]     /* result 3 */
        strh   fp, [a2, #(16*4)]     /* result 4 */
        strh   v7, [a2, #(16*5)]     /* result 5 */
        strh   v6, [a2, #(16*6)]     /* result 6 */
        strh   v5, [a2, #(16*7)]     /* result 7 */

        ldr    pc, [sp], #4          /* return (pops the saved lr) */
        .endfunc

    
/*
  Compute IDCT of single column, read as row, store saturated 8-bit.
  a1 = source (four packed words, same layout idct_row expects)
  a2 = dest   (eight bytes written, one per line)
  a3 = line size (byte stride between consecutive output bytes)

  a1, a2 and a3 are unchanged on return.
  Clobbers a4, ip, v1--v7, fp without saving them; the caller is
  responsible for preserving the callee-saved ones.
*/
function idct_col_put_armv6
        stmfd  sp!, {a2, a3, lr}     /* idct_row clobbers a2 and a3 */

        ldr    a3, [a1]              /* a3 = row[2,0] */
        ldr    ip, [pc, #(w42-.-8)]  /* ip = W4 | (W2 << 16) */
        ldr    a4, [a1, #8]          /* a4 = row[3,1] */
        idct_row COL_SHIFT
        ldmfd  sp!, {a2, a3}         /* restore dest pointer and line size */
        idct_finish_shift_sat COL_SHIFT

        strb   v1, [a2], a3          /* result 0, then step to next line */
        strb   v2, [a2], a3          /* result 1 */
        strb   v3, [a2], a3          /* result 2 */
        strb   v4, [a2], a3          /* result 3 */
        strb   fp, [a2], a3          /* result 4 */
        strb   v7, [a2], a3          /* result 5 */
        strb   v6, [a2], a3          /* result 6 */
        strb   v5, [a2], a3          /* result 7 */

        sub    a2, a2, a3, lsl #3    /* undo the 8 post-increments */

        ldr    pc, [sp], #4          /* return (pops the saved lr) */
        .endfunc

    
/*
  Compute IDCT of single column, read as row, add/store saturated 8-bit.
  Each result is shifted right by COL_SHIFT, added to the byte already
  at the destination, clamped to 0..255 and written back.
  a1 = source (four packed words, same layout idct_row expects)
  a2 = dest   (eight bytes updated, one per line)
  a3 = line size (byte stride between consecutive bytes)

  a1, a2 and a3 are unchanged on return.
  Clobbers a4, ip, v1--v7, fp without saving them; the caller is
  responsible for preserving the callee-saved ones.

  After idct_finish the unshifted results 0..7 are held in
  ip, v1, v2, v3, v7, v6, v5, lr; v4 and fp are free for use as scratch.
  Loads of destination bytes are interleaved with the arithmetic and the
  post-incrementing stores, so every load offset is relative to the
  current value of a2.
*/
function idct_col_add_armv6
        stmfd  sp!, {a2, a3, lr}     /* idct_row clobbers a2 and a3 */

        ldr    a3, [a1]              /* a3 = row[2,0] */
        ldr    ip, [pc, #(w42-.-8)]  /* ip = W4 | (W2 << 16) */
        ldr    a4, [a1, #8]          /* a4 = row[3,1] */
        idct_row COL_SHIFT
        ldmfd  sp!, {a2, a3}         /* restore dest pointer and line size */
        idct_finish

        ldrb   a4, [a2]              /* a4 = dest line 0 */
        ldrb   v4, [a2, a3]          /* v4 = dest line 1 */
        /*
         * A load of fp from [a2, a3, lsl #2] used to sit here; fp was
         * reloaded below before being read, so that load was dead.
         */
        add    ip, a4, ip, asr #COL_SHIFT
        usat   ip, #8, ip
        add    v1, v4, v1, asr #COL_SHIFT
        strb   ip, [a2], a3          /* store line 0; a2 -> line 1 */
        ldrb   ip, [a2, a3]          /* ip = dest line 2 */
        usat   v1, #8, v1
        ldrb   fp, [a2, a3, lsl #2]  /* fp = dest line 5 */
        add    v2, ip, v2, asr #COL_SHIFT
        usat   v2, #8, v2
        strb   v1, [a2], a3          /* store line 1; a2 -> line 2 */
        ldrb   a4, [a2, a3]          /* a4 = dest line 3 */
        ldrb   ip, [a2, a3, lsl #2]  /* ip = dest line 6 */
        strb   v2, [a2], a3          /* store line 2; a2 -> line 3 */
        ldrb   v4, [a2, a3]          /* v4 = dest line 4 */
        ldrb   v1, [a2, a3, lsl #2]  /* v1 = dest line 7 */
        add    v3, a4, v3, asr #COL_SHIFT
        usat   v3, #8, v3
        add    v7, v4, v7, asr #COL_SHIFT
        usat   v7, #8, v7
        add    v6, fp, v6, asr #COL_SHIFT
        usat   v6, #8, v6
        add    v5, ip, v5, asr #COL_SHIFT
        usat   v5, #8, v5
        add    lr, v1, lr, asr #COL_SHIFT
        usat   lr, #8, lr
        strb   v3, [a2], a3          /* store line 3 */
        strb   v7, [a2], a3          /* store line 4 */
        strb   v6, [a2], a3          /* store line 5 */
        strb   v5, [a2], a3          /* store line 6 */
        strb   lr, [a2], a3          /* store line 7 */

        sub    a2, a2, a3, lsl #3    /* undo the 8 post-increments */

        ldr    pc, [sp], #4          /* return (pops the saved lr) */
        .endfunc

    
/*
  Compute 8 IDCT row transforms.
  func = IDCT row->col function
  width = width of columns in bytes

  func is called eight times with a1 at byte offsets
  0, 32, 64, 96, 16, 48, 80, 112 from its initial value (16-byte rows
  visited in the order 0, 2, 4, 6, 1, 3, 5, 7), while a2 advances by
  width after each of the first seven calls.
  On exit a1 is back at its initial value and a2 is left at
  initial + 7*width.  This relies on func preserving a1 and a2.
*/
        .macro idct_rows func width
        bl     \func
        add    a1, a1, #(16*2)
        add    a2, a2, #\width
        bl     \func
        add    a1, a1, #(16*2)
        add    a2, a2, #\width
        bl     \func
        add    a1, a1, #(16*2)
        add    a2, a2, #\width
        bl     \func
        sub    a1, a1, #(16*5)       /* from row 6 back to row 1 */
        add    a2, a2, #\width
        bl     \func
        add    a1, a1, #(16*2)
        add    a2, a2, #\width
        bl     \func
        add    a1, a1, #(16*2)
        add    a2, a2, #\width
        bl     \func
        add    a1, a1, #(16*2)
        add    a2, a2, #\width
        bl     \func

        sub    a1, a1, #(16*7)       /* from row 7 back to row 0 */
        .endm

    
/*
  void ff_simple_idct_armv6(DCTELEM *data);

  In-place 8x8 IDCT.
  a1 = data

  The row pass writes its results to a 128-byte temporary on the stack;
  the column pass reads that temporary and writes back to data.
  v1--v7, fp and lr are saved here because the helper functions use them
  without saving.
*/
function ff_simple_idct_armv6, export=1
        stmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr}
        sub    sp, sp, #128          /* 128-byte temporary */

        mov    a2, sp                /* row pass: data -> temporary */
        idct_rows idct_row_armv6, 2
        mov    a2, a1                /* column pass: temporary -> data */
        mov    a1, sp
        idct_rows idct_col_armv6, 2

        add    sp, sp, #128          /* release the temporary */
        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
        .endfunc

    
/*
  ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data);

  8x8 IDCT of data, added to the bytes at dest with unsigned 8-bit
  saturation.
  a1 = dest
  a2 = line_size
  a3 = data

  Stack layout after the prologue: 128-byte temporary at sp, saved dest
  at [sp, #128], saved line_size at [sp, #(128+4)], then the saved
  v1--v7, fp, lr.
*/
function ff_simple_idct_add_armv6, export=1
        stmfd  sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
        sub    sp, sp, #128          /* 128-byte temporary */

        mov    a1, a3                /* row pass: data -> temporary */
        mov    a2, sp
        idct_rows idct_row_armv6, 2
        mov    a1, sp                /* column pass: temporary -> dest */
        ldr    a2, [sp, #128]        /* a2 = dest */
        ldr    a3, [sp, #(128+4)]    /* a3 = line_size */
        idct_rows idct_col_add_armv6, 1

        add    sp, sp, #(128+8)      /* drop temporary and saved a1, a2 */
        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
        .endfunc

    
/*
  ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data);

  8x8 IDCT of data, stored to dest as bytes with unsigned 8-bit
  saturation.
  a1 = dest
  a2 = line_size
  a3 = data

  Stack layout after the prologue: 128-byte temporary at sp, saved dest
  at [sp, #128], saved line_size at [sp, #(128+4)], then the saved
  v1--v7, fp, lr.
*/
function ff_simple_idct_put_armv6, export=1
        stmfd  sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
        sub    sp, sp, #128          /* 128-byte temporary */

        mov    a1, a3                /* row pass: data -> temporary */
        mov    a2, sp
        idct_rows idct_row_armv6, 2
        mov    a1, sp                /* column pass: temporary -> dest */
        ldr    a2, [sp, #128]        /* a2 = dest */
        ldr    a3, [sp, #(128+4)]    /* a3 = line_size */
        idct_rows idct_col_put_armv6, 1

        add    sp, sp, #(128+8)      /* drop temporary and saved a1, a2 */
        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
        .endfunc