Statistics
| Branch: | Revision:

ffmpeg / libavcodec / armv4l / simple_idct_armv6.S @ d761f089

History | View | Annotate | Download (14 KB)

1 7d42886b Måns Rullgård
/*
2
 * Simple IDCT
3
 *
4
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
5 f2250162 Måns Rullgård
 * Copyright (c) 2007 Mans Rullgard <mans@mansr.com>
6 7d42886b Måns Rullgård
 *
7
 * This file is part of FFmpeg.
8
 *
9
 * FFmpeg is free software; you can redistribute it and/or
10
 * modify it under the terms of the GNU Lesser General Public
11
 * License as published by the Free Software Foundation; either
12
 * version 2.1 of the License, or (at your option) any later version.
13
 *
14
 * FFmpeg is distributed in the hope that it will be useful,
15
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
 * Lesser General Public License for more details.
18
 *
19
 * You should have received a copy of the GNU Lesser General Public
20
 * License along with FFmpeg; if not, write to the Free Software
21
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22
 */
23
24
#define W1  22725   /* cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
25
#define W2  21407   /* cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
26
#define W3  19266   /* cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
27
#define W4  16383   /* cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5 would be 16384; 16383 is the value used here */
28
#define W5  12873   /* cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
29
#define W6  8867    /* cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
30
#define W7  4520    /* cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
31
#define ROW_SHIFT 11   /* rounding and right-shift amount used by idct_row_armv6 */
32
#define COL_SHIFT 20   /* rounding and right-shift amount used by the idct_col_* functions */
33
34
#define W13 (W1 | (W3 << 16))   /* packed pair: low halfword W1, high halfword W3 */
35
#define W26 (W2 | (W6 << 16))   /* packed pair: low halfword W2, high halfword W6 */
36
#define W42 (W4 | (W2 << 16))   /* packed pair: low halfword W4, high halfword W2 */
37
#define W42n (-W4&0xffff | (-W2 << 16))   /* packed pair: low halfword -W4 (masked to 16 bits), high halfword -W2 */
38
#define W46 (W4 | (W6 << 16))   /* packed pair: low halfword W4, high halfword W6 */
39
#define W57 (W5 | (W7 << 16))   /* packed pair: low halfword W5, high halfword W7 */
40
41
        .text                   /* the words below sit in .text; the code loads them pc-relative */
42
        .align
43
w13:    .long W13
44
w26:    .long W26               /* NOTE(review): no load of w26 appears in the code shown in this file */
45
w42:    .long W42
46
w42n:   .long W42n
47
w46:    .long W46
48
w57:    .long W57
49
50
/*
  Compute partial IDCT of single row.

  Inputs:
    shift = shift amount; 1 << (shift-1) is added to each of A0..A3 as a
            rounding bias (the right shift itself is done by the
            idct_finish* macros)
    a1 = source address
    a3 = row[2,0] <= 2 cycles   (low halfword row[0], high halfword row[2])
    a4 = row[3,1]               (low halfword row[1], high halfword row[3])
    ip = w42      <= 2 cycles   (W4 | (W2 << 16))
  The "<= 2 cycles" notes come from the original comment; presumably they
  are load-to-use scheduling hints -- TODO confirm.

  Outputs, in registers v1--v7 and fp (the original comment counted fp
  as v8):
    v1 = A0, v2 = A1, v3 = A2, v4 = A3   (even part, including the bias)
    v5 = B0, v6 = -B1, v7 = B2, fp = B3  (odd part)

  Also overwrites a2, a3, a4, ip and lr.  row[6,4] and row[7,5] are read
  from a1+4 and a1+12.

  The pc-relative loads subtract 8 because pc reads as the address of the
  current instruction plus 8.
*/
60
        .macro idct_row shift
61 7ee82992 Måns Rullgård
        ldr    lr, [pc, #(w46-.-8)]  /* lr = W4 | (W6 << 16) */
62
        mov    a2, #(1<<(\shift-1))  /* a2 = rounding bias */
63
        smlad  v1, a3, ip, a2        /* v1 = A0 = W4*row[0] + W2*row[2] + bias */
64
        smlsd  v4, a3, ip, a2        /* v4 = A3 = W4*row[0] - W2*row[2] + bias */
65
        ldr    ip, [pc, #(w13-.-8)]  /* ip = W1 | (W3 << 16) */
66
        ldr    v7, [pc, #(w57-.-8)]  /* v7 = W5 | (W7 << 16) */
67
        smlad  v2, a3, lr, a2        /* v2 = A1 = W4*row[0] + W6*row[2] + bias */
68
        smlsd  v3, a3, lr, a2        /* v3 = A2 = W4*row[0] - W6*row[2] + bias */
69
70
        smuad  v5, a4, ip            /* v5 = B0 = W1*row[1] + W3*row[3] */
71
        smusdx fp, a4, v7            /* fp = B3 = W7*row[1] - W5*row[3] */
72 7d42886b Måns Rullgård
        ldr    lr, [a1, #12]         /* lr = row[7,5] */
73
        pkhtb  a3, ip, v7, asr #16   /* a3 = W7 | (W3 << 16) */
74
        pkhbt  a2, ip, v7, lsl #16   /* a2 = W1 | (W5 << 16) */
75 34b1b8fd Måns Rullgård
        smusdx v6, a3, a4            /* v6 = -B1 = W7*row[3] - W3*row[1] */
76 7d42886b Måns Rullgård
        smlad  v5, lr, v7, v5        /* B0 += W5*row[5] + W7*row[7] */
77
        smusdx v7, a4, a2            /* v7 = B2 = W5*row[1] - W1*row[3] */
78
79 34b1b8fd Måns Rullgård
        ldr    a4, [pc, #(w42n-.-8)] /* a4 = -W4 | (-W2 << 16) */
80 7d42886b Måns Rullgård
        smlad  v7, lr, a3, v7        /* B2 += W7*row[5] + W3*row[7] */
81
        ldr    a3, [a1, #4]          /* a3 = row[6,4] */
82
        smlsdx fp, lr, ip, fp        /* B3 += W3*row[5] - W1*row[7] */
83
        ldr    ip, [pc, #(w46-.-8)]  /* ip = W4 | (W6 << 16) */
84
        smlad  v6, lr, a2, v6        /* v6 (-B1) += W1*row[5] + W5*row[7], i.e. B1 -= that sum */
85
86
        smlad  v2, a3, a4, v2        /* A1 += -W4*row[4] - W2*row[6] */
87
        smlsd  v3, a3, a4, v3        /* A2 += -W4*row[4] + W2*row[6] */
88
        smlad  v1, a3, ip, v1        /* A0 += W4*row[4] + W6*row[6] */
89
        smlsd  v4, a3, ip, v4        /* A3 += W4*row[4] - W6*row[6] */
90
        .endm
91
92
/*
  Compute partial IDCT of half row: only row[0..3] contribute, so nothing
  is read from memory through a1.

  Inputs:
    shift = shift amount; 1 << (shift-1) is added to each of A0..A3 as a
            rounding bias
    a3 = row[2,0]   (low halfword row[0], high halfword row[2])
    a4 = row[3,1]   (low halfword row[1], high halfword row[3])
    ip = w42        (W4 | (W2 << 16))

  Outputs, in registers v1--v7 and fp (the original comment counted fp
  as v8):
    v1 = A0, v2 = A1, v3 = A2, v4 = A3   (even part, including the bias)
    v5 = B0, v6 = -B1, v7 = B2, fp = B3  (odd part)

  Also overwrites a2, a3, ip and lr.
*/
101
        .macro idct_row4 shift
102 7ee82992 Måns Rullgård
        ldr    lr, [pc, #(w46-.-8)]  /* lr = W4 | (W6 << 16) */
103
        ldr    v7, [pc, #(w57-.-8)]  /* v7 = W5 | (W7 << 16) */
104
        mov    a2, #(1<<(\shift-1))  /* a2 = rounding bias */
105
        smlad  v1, a3, ip, a2        /* v1 = A0 = W4*row[0] + W2*row[2] + bias */
106
        smlsd  v4, a3, ip, a2        /* v4 = A3 = W4*row[0] - W2*row[2] + bias */
107
        ldr    ip, [pc, #(w13-.-8)]  /* ip = W1 | (W3 << 16) */
108
        smlad  v2, a3, lr, a2        /* v2 = A1 = W4*row[0] + W6*row[2] + bias */
109
        smlsd  v3, a3, lr, a2        /* v3 = A2 = W4*row[0] - W6*row[2] + bias */
110
        smusdx fp, a4, v7            /* fp = B3 = W7*row[1] - W5*row[3] */
111
        smuad  v5, a4, ip            /* v5 = B0 = W1*row[1] + W3*row[3] */
112 118a49b0 Måns Rullgård
        pkhtb  a3, ip, v7, asr #16   /* a3 = W7 | (W3 << 16) */
113
        pkhbt  a2, ip, v7, lsl #16   /* a2 = W1 | (W5 << 16) */
114
        smusdx v6, a3, a4            /* v6 = -B1 = W7*row[3] - W3*row[1] */
115
        smusdx v7, a4, a2            /* v7 = B2 = W5*row[1] - W1*row[3] */
116
        .endm
117
118
/*
  Compute final part of IDCT single row without shift.

  Input in registers v1--v7 and fp, as left by idct_row / idct_row4:
    v1 = A0, v2 = A1, v3 = A2, v4 = A3, v5 = B0, v6 = -B1, v7 = B2, fp = B3

  Output in registers ip, v1--v3, lr, v5--v7:
    ip = A0 + B0   v1 = A1 + B1   v2 = A2 + B2   v3 = A3 + B3
    v7 = A3 - B3   v6 = A2 - B2   v5 = A1 - B1   lr = A0 - B0

  Each input is read for the last time before its register is reused as
  an output, so the instruction order matters.
*/
123
        .macro idct_finish
124
        add    ip, v1, v5            /* ip = A0 + B0 */
125
        sub    lr, v1, v5            /* lr = A0 - B0 */
126
        sub    v1, v2, v6            /* v1 = A1 + B1 (v6 holds -B1) */
127
        add    v5, v2, v6            /* v5 = A1 - B1 */
128
        add    v2, v3, v7            /* v2 = A2 + B2 */
129
        sub    v6, v3, v7            /* v6 = A2 - B2 */
130
        add    v3, v4, fp            /* v3 = A3 + B3 */
131
        sub    v7, v4, fp            /* v7 = A3 - B3 */
132
        .endm
133
134
/*
  Compute final part of IDCT single row.
  shift = right-shift amount (arithmetic shift)

  Input in registers v1--v7 and fp, as left by idct_row / idct_row4:
    v1 = A0, v2 = A1, v3 = A2, v4 = A3, v5 = B0, v6 = -B1, v7 = B2, fp = B3

  Output in the same registers, each shifted right by shift:
    v1 = A0 + B0   v2 = A1 + B1   v3 = A2 + B2   v4 = A3 + B3
    fp = A3 - B3   v7 = A2 - B2   v6 = A1 - B1   v5 = A0 - B0

  Overwrites a3 and a4 as temporaries.
*/
139
        .macro idct_finish_shift shift
140
        add    a4, v1, v5            /* a4 = A0 + B0 */
141
        sub    a3, v1, v5            /* a3 = A0 - B0 */
142
        mov    v1, a4, asr #\shift
143
        mov    v5, a3, asr #\shift
144
145
        sub    a4, v2, v6            /* a4 = A1 + B1 (v6 holds -B1) */
146
        add    a3, v2, v6            /* a3 = A1 - B1 */
147
        mov    v2, a4, asr #\shift
148
        mov    v6, a3, asr #\shift
149
150
        add    a4, v3, v7            /* a4 = A2 + B2 */
151
        sub    a3, v3, v7            /* a3 = A2 - B2 */
152
        mov    v3, a4, asr #\shift
153
        mov    v7, a3, asr #\shift
154
155
        add    a4, v4, fp            /* a4 = A3 + B3 */
156
        sub    a3, v4, fp            /* a3 = A3 - B3 */
157
        mov    v4, a4, asr #\shift
158
        mov    fp, a3, asr #\shift
159
        .endm
160
161
/*
  Compute final part of IDCT single row, saturating results at 8 bits.
  shift = right-shift amount (arithmetic shift, applied before the
          unsigned 8-bit saturation done by usat)

  Input in registers v1--v7 and fp, as left by idct_row / idct_row4:
    v1 = A0, v2 = A1, v3 = A2, v4 = A3, v5 = B0, v6 = -B1, v7 = B2, fp = B3

  Output in the same registers, each shifted and saturated:
    v1 = A0 + B0   v2 = A1 + B1   v3 = A2 + B2   v4 = A3 + B3
    fp = A3 - B3   v7 = A2 - B2   v6 = A1 - B1   v5 = A0 - B0

  Overwrites a4 and ip as temporaries; a3 is left untouched.
*/
166
        .macro idct_finish_shift_sat shift
167
        add    a4, v1, v5            /* a4 = A0 + B0 */
168
        sub    ip, v1, v5            /* ip = A0 - B0 */
169
        usat   v1, #8, a4, asr #\shift
170
        usat   v5, #8, ip, asr #\shift
171
172
        sub    a4, v2, v6            /* a4 = A1 + B1 (v6 holds -B1) */
173
        add    ip, v2, v6            /* ip = A1 - B1 */
174
        usat   v2, #8, a4, asr #\shift
175
        usat   v6, #8, ip, asr #\shift
176
177
        add    a4, v3, v7            /* a4 = A2 + B2 */
178
        sub    ip, v3, v7            /* ip = A2 - B2 */
179
        usat   v3, #8, a4, asr #\shift
180
        usat   v7, #8, ip, asr #\shift
181
182
        add    a4, v4, fp            /* a4 = A3 + B3 */
183
        sub    ip, v4, fp            /* ip = A3 - B3 */
184
        usat   v4, #8, a4, asr #\shift
185
        usat   fp, #8, ip, asr #\shift
186
        .endm
187
188
/*
  Compute IDCT of single row, storing as column.
  a1 = source (16 bytes: row[2,0], row[6,4], row[3,1], row[7,5] as four
       32-bit words, per the load comments below)
  a2 = dest   (eight halfwords are written, 16 bytes apart)

  a1 and a2 are unchanged on return.  Internal helper, not a standard
  procedure-call-convention function: it overwrites v1--v7 and fp (as
  well as a3, a4, ip) without saving them; the ff_simple_idct_* entry
  points in this file save those registers around their calls.

  Three paths:
    - row[1..7] all zero: every output is row[0] << 3 (label 1)
    - row[4..7] all zero: idct_row4 (label 2)
    - otherwise:          idct_row
*/
193
        .align
194 8520b634 Måns Rullgård
        .type idct_row_armv6, %function
195 7d42886b Måns Rullgård
        .func idct_row_armv6
196
idct_row_armv6:
197 7ee82992 Måns Rullgård
        str    lr, [sp, #-4]!        /* push return address */
198
199
        ldr    lr, [a1, #12]         /* lr = row[7,5] */
200
        ldr    ip, [a1, #4]          /* ip = row[6,4] */
201 118a49b0 Måns Rullgård
        ldr    a4, [a1, #8]          /* a4 = row[3,1] */
202
        ldr    a3, [a1]              /* a3 = row[2,0] */
203 7ee82992 Måns Rullgård
        orrs   lr, lr, ip            /* lr = row[7,5] | row[6,4]; Z set if row[4..7] are all zero */
204
        cmpeq  lr, a4                /* if so (lr is 0): Z set if row[1] and row[3] are zero too */
205
        cmpeq  lr, a3, lsr #16       /* if so: Z set if row[2] is zero too */
206 118a49b0 Måns Rullgård
        beq    1f                    /* only row[0] can be non-zero: take the shortcut */
207 7ee82992 Måns Rullgård
        str    a2, [sp, #-4]!        /* save dest; the row macros overwrite a2 */
208
        ldr    ip, [pc, #(w42-.-8)]  /* ip = W4 | (W2 << 16) */
209
        cmp    lr, #0                /* lr still holds row[7,5] | row[6,4] */
210
        beq    2f                    /* row[4..7] all zero: half-row variant */
211 271593f1 Måns Rullgård
212 7ee82992 Måns Rullgård
        idct_row   ROW_SHIFT
213
        b      3f
214 271593f1 Måns Rullgård
215 7ee82992 Måns Rullgård
2:      idct_row4  ROW_SHIFT
216 118a49b0 Måns Rullgård
217
3:      ldr    a2, [sp], #4          /* restore dest */
218 7ee82992 Måns Rullgård
        idct_finish_shift ROW_SHIFT
219 7d42886b Måns Rullgård
220
        strh   v1, [a2]              /* (A0 + B0) >> ROW_SHIFT */
221
        strh   v2, [a2, #(16*2)]     /* (A1 + B1) >> ROW_SHIFT */
222
        strh   v3, [a2, #(16*4)]     /* (A2 + B2) >> ROW_SHIFT */
223
        strh   v4, [a2, #(16*6)]     /* (A3 + B3) >> ROW_SHIFT */
224
        strh   fp, [a2, #(16*1)]     /* (A3 - B3) >> ROW_SHIFT */
225
        strh   v7, [a2, #(16*3)]     /* (A2 - B2) >> ROW_SHIFT */
226
        strh   v6, [a2, #(16*5)]     /* (A1 - B1) >> ROW_SHIFT */
227
        strh   v5, [a2, #(16*7)]     /* (A0 - B0) >> ROW_SHIFT */
228
229
        ldr    pc, [sp], #4          /* pop return address into pc */
230 118a49b0 Måns Rullgård
231
1:      mov    a3, a3, lsl #3        /* low halfword = row[0] << 3; strh stores only the low halfword */
232
        strh   a3, [a2]
233
        strh   a3, [a2, #(16*2)]
234
        strh   a3, [a2, #(16*4)]
235
        strh   a3, [a2, #(16*6)]
236
        strh   a3, [a2, #(16*1)]
237
        strh   a3, [a2, #(16*3)]
238
        strh   a3, [a2, #(16*5)]
239
        strh   a3, [a2, #(16*7)]
240 7ee82992 Måns Rullgård
        ldr    pc, [sp], #4          /* pop return address into pc (a2 was not pushed on this path) */
241 7d42886b Måns Rullgård
        .endfunc
242
243
/*
  Compute IDCT of single column, read as row.
  a1 = source (same 16-byte layout that idct_row reads: row[2,0] at +0,
       row[6,4] at +4, row[3,1] at +8, row[7,5] at +12)
  a2 = dest   (eight halfwords are written, 16 bytes apart, outputs 0..7
       in ascending address order)

  a1 and a2 are unchanged on return.  Internal helper, not a standard
  procedure-call-convention function: it overwrites v1--v7 and fp (as
  well as a3, a4, ip) without saving them.
*/
248
        .align
249 8520b634 Måns Rullgård
        .type idct_col_armv6, %function
250 7d42886b Måns Rullgård
        .func idct_col_armv6
251
idct_col_armv6:
252
        stmfd  sp!, {a2, lr}         /* save dest (idct_row overwrites a2) and return address */
253
254 173fd724 Måns Rullgård
        ldr    a3, [a1]              /* a3 = row[2,0] */
255 7ee82992 Måns Rullgård
        ldr    ip, [pc, #(w42-.-8)]  /* ip = W4 | (W2 << 16) */
256 173fd724 Måns Rullgård
        ldr    a4, [a1, #8]          /* a4 = row[3,1] */
257 7d42886b Måns Rullgård
        idct_row COL_SHIFT
258
        ldr    a2, [sp], #4          /* restore dest; return address stays on the stack */
259
        idct_finish_shift COL_SHIFT
260
261
        strh   v1, [a2]              /* (A0 + B0) >> COL_SHIFT */
262
        strh   v2, [a2, #(16*1)]     /* (A1 + B1) >> COL_SHIFT */
263
        strh   v3, [a2, #(16*2)]     /* (A2 + B2) >> COL_SHIFT */
264
        strh   v4, [a2, #(16*3)]     /* (A3 + B3) >> COL_SHIFT */
265
        strh   fp, [a2, #(16*4)]     /* (A3 - B3) >> COL_SHIFT */
266
        strh   v7, [a2, #(16*5)]     /* (A2 - B2) >> COL_SHIFT */
267
        strh   v6, [a2, #(16*6)]     /* (A1 - B1) >> COL_SHIFT */
268
        strh   v5, [a2, #(16*7)]     /* (A0 - B0) >> COL_SHIFT */
269
270
        ldr    pc, [sp], #4          /* pop return address into pc */
271
        .endfunc
272
273
/*
  Compute IDCT of single column, read as row, store saturated 8-bit.
  a1 = source (same 16-byte layout that idct_row reads)
  a2 = dest   (eight bytes are written, line size bytes apart)
  a3 = line size

  a1, a2 and a3 are unchanged on return.  Internal helper, not a
  standard procedure-call-convention function: it overwrites v1--v7 and
  fp (as well as a4, ip) without saving them.
*/
279
        .align
280 8520b634 Måns Rullgård
        .type idct_col_put_armv6, %function
281 7d42886b Måns Rullgård
        .func idct_col_put_armv6
282
idct_col_put_armv6:
283
        stmfd  sp!, {a2, a3, lr}     /* save dest, line size (idct_row overwrites both) and return address */
284
285 173fd724 Måns Rullgård
        ldr    a3, [a1]              /* a3 = row[2,0] */
286 7ee82992 Måns Rullgård
        ldr    ip, [pc, #(w42-.-8)]  /* ip = W4 | (W2 << 16) */
287 173fd724 Måns Rullgård
        ldr    a4, [a1, #8]          /* a4 = row[3,1] */
288 7d42886b Måns Rullgård
        idct_row COL_SHIFT
289
        ldmfd  sp!, {a2, a3}         /* restore dest and line size; return address stays on the stack */
290
        idct_finish_shift_sat COL_SHIFT
291
292
        strb   v1, [a2], a3          /* output 0; each store then advances a2 by one line */
293
        strb   v2, [a2], a3          /* output 1 */
294
        strb   v3, [a2], a3          /* output 2 */
295
        strb   v4, [a2], a3          /* output 3 */
296
        strb   fp, [a2], a3          /* output 4 */
297
        strb   v7, [a2], a3          /* output 5 */
298
        strb   v6, [a2], a3          /* output 6 */
299
        strb   v5, [a2], a3          /* output 7 */
300
301
        sub    a2, a2, a3, lsl #3    /* undo the eight post-increments: a2 = original dest */
302
303
        ldr    pc, [sp], #4          /* pop return address into pc */
304
        .endfunc
305
306
/*
  Compute IDCT of single column, read as row, add/store saturated 8-bit.
  a1 = source (same 16-byte layout that idct_row reads)
  a2 = dest   (eight bytes, line size bytes apart, are read, have the
       shifted IDCT output added, are saturated to 0..255 and written back)
  a3 = line size

  a1, a2 and a3 are unchanged on return.  Internal helper, not a
  standard procedure-call-convention function: it overwrites v1--v7 and
  fp (as well as a4, ip) without saving them.

  After idct_finish the unshifted outputs 0..7 are held in
  ip, v1, v2, v3, v7, v6, v5, lr (in that order).  Loads of the dest
  bytes are interleaved with the arithmetic; the "dest row" comments
  below count lines from the original a2.
*/
312
        .align
313 8520b634 Måns Rullgård
        .type idct_col_add_armv6, %function
314 7d42886b Måns Rullgård
        .func idct_col_add_armv6
315
idct_col_add_armv6:
316
        stmfd  sp!, {a2, a3, lr}     /* save dest, line size (idct_row overwrites both) and return address */
317
318 173fd724 Måns Rullgård
        ldr    a3, [a1]              /* a3 = row[2,0] */
319 7ee82992 Måns Rullgård
        ldr    ip, [pc, #(w42-.-8)]  /* ip = W4 | (W2 << 16) */
320 173fd724 Måns Rullgård
        ldr    a4, [a1, #8]          /* a4 = row[3,1] */
321 7d42886b Måns Rullgård
        idct_row COL_SHIFT
322
        ldmfd  sp!, {a2, a3}         /* restore dest and line size; return address stays on the stack */
323
        idct_finish
324
325
        ldrb   a4, [a2]              /* a4 = dest row 0 */
326
        ldrb   v4, [a2, a3]          /* v4 = dest row 1 */
327
328
        add    ip, a4, ip, asr #COL_SHIFT   /* row 0 + (output 0 >> COL_SHIFT) */
329
        usat   ip, #8, ip            /* clamp to 0..255 */
330
        add    v1, v4, v1, asr #COL_SHIFT   /* row 1 + (output 1 >> COL_SHIFT) */
331
        strb   ip, [a2], a3          /* store row 0; a2 -> row 1 */
332
        ldrb   ip, [a2, a3]          /* ip = dest row 2 */
333
        usat   v1, #8, v1
334
        ldrb   fp, [a2, a3, lsl #2]  /* fp = dest row 5 */
335
        add    v2, ip, v2, asr #COL_SHIFT   /* row 2 + (output 2 >> COL_SHIFT) */
336
        usat   v2, #8, v2
337
        strb   v1, [a2], a3          /* store row 1; a2 -> row 2 */
338
        ldrb   a4, [a2, a3]          /* a4 = dest row 3 */
339
        ldrb   ip, [a2, a3, lsl #2]  /* ip = dest row 6 */
340
        strb   v2, [a2], a3          /* store row 2; a2 -> row 3 */
341
        ldrb   v4, [a2, a3]          /* v4 = dest row 4 */
342
        ldrb   v1, [a2, a3, lsl #2]  /* v1 = dest row 7 */
343
        add    v3, a4, v3, asr #COL_SHIFT   /* row 3 + (output 3 >> COL_SHIFT) */
344
        usat   v3, #8, v3
345
        add    v7, v4, v7, asr #COL_SHIFT   /* row 4 + (output 4 >> COL_SHIFT) */
346
        usat   v7, #8, v7
347
        add    v6, fp, v6, asr #COL_SHIFT   /* row 5 + (output 5 >> COL_SHIFT) */
348
        usat   v6, #8, v6
349
        add    v5, ip, v5, asr #COL_SHIFT   /* row 6 + (output 6 >> COL_SHIFT) */
350
        usat   v5, #8, v5
351
        add    lr, v1, lr, asr #COL_SHIFT   /* row 7 + (output 7 >> COL_SHIFT) */
352
        usat   lr, #8, lr
353
        strb   v3, [a2], a3          /* store row 3; a2 -> row 4 */
354
        strb   v7, [a2], a3          /* store row 4 */
355
        strb   v6, [a2], a3          /* store row 5 */
356
        strb   v5, [a2], a3          /* store row 6 */
357
        strb   lr, [a2], a3          /* store row 7; a2 -> row 8 */
358
359
        sub    a2, a2, a3, lsl #3    /* undo the eight post-increments: a2 = original dest */
360
361
        ldr    pc, [sp], #4          /* pop return address into pc */
362
        .endfunc
363
364
/*
  Compute 8 IDCT row transforms.
  func = IDCT row->col function
  width = width of columns in bytes

  Calls func eight times with a1 = source row and a2 = destination.
  Source rows are 16 bytes each and are visited in the order
  0, 2, 4, 6, 1, 3, 5, 7; a2 advances by width after each of the first
  seven calls.

  On exit a1 is back at its initial value and a2 is left at its initial
  value + 7*width (assuming func preserves a1 and a2, as the helpers in
  this file do).  lr is overwritten by bl.
*/
369
        .macro idct_rows func width
370
        bl     \func                 /* row 0 */
371
        add    a1, a1, #(16*2)
372
        add    a2, a2, #\width
373
        bl     \func                 /* row 2 */
374
        add    a1, a1, #(16*2)
375
        add    a2, a2, #\width
376
        bl     \func                 /* row 4 */
377
        add    a1, a1, #(16*2)
378
        add    a2, a2, #\width
379
        bl     \func                 /* row 6 */
380
        sub    a1, a1, #(16*5)       /* from row 6 back to row 1 */
381
        add    a2, a2, #\width
382
        bl     \func                 /* row 1 */
383
        add    a1, a1, #(16*2)
384
        add    a2, a2, #\width
385
        bl     \func                 /* row 3 */
386
        add    a1, a1, #(16*2)
387
        add    a2, a2, #\width
388
        bl     \func                 /* row 5 */
389
        add    a1, a1, #(16*2)
390
        add    a2, a2, #\width
391
        bl     \func                 /* row 7 */
392
393
        sub    a1, a1, #(16*7)       /* from row 7 back to row 0 */
394
        .endm
395
396
        .align
397
        .global ff_simple_idct_armv6
398 8520b634 Måns Rullgård
        .type ff_simple_idct_armv6, %function
399 7d42886b Måns Rullgård
        .func ff_simple_idct_armv6
400
/*
  void ff_simple_idct_armv6(DCTELEM *data);

  In-place IDCT of the block at data (a1): a row pass from data into a
  128-byte buffer on the stack, then a column pass from that buffer
  back into data.  The passes step through eight 16-byte rows and store
  halfwords, i.e. an 8x8 block of 16-bit values.
  Saves and restores v1--v7 and fp, which the row/column helpers
  overwrite.
*/
401
ff_simple_idct_armv6:
402
        stmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr}
403
        sub    sp, sp, #128          /* 128-byte intermediate buffer */
404
405
        mov    a2, sp                /* row pass: data -> buffer */
406
        idct_rows idct_row_armv6, 2
407
        mov    a2, a1                /* column pass: buffer -> data (a1 was restored by idct_rows) */
408
        mov    a1, sp
409
        idct_rows idct_col_armv6, 2
410
411
        add    sp, sp, #128          /* release the buffer */
412
        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
413
        .endfunc
414
415
        .align
416
        .global ff_simple_idct_add_armv6
417 8520b634 Måns Rullgård
        .type ff_simple_idct_add_armv6, %function
418 7d42886b Måns Rullgård
        .func ff_simple_idct_add_armv6
419
/*
  ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data);

  a1 = dest, a2 = line_size, a3 = data.
  Row pass from data into a 128-byte buffer on the stack, then a column
  pass (idct_col_add_armv6) that adds the result to the bytes at dest
  with saturation to 0..255.  The column pass advances dest by 1 byte
  per call and uses line_size as the distance between lines.
  Saves and restores v1--v7 and fp, which the helpers overwrite.
*/
420
ff_simple_idct_add_armv6:
421
        stmfd  sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}   /* dest and line_size end up at the lowest addresses */
422
        sub    sp, sp, #128          /* 128-byte intermediate buffer */
423
424
        mov    a1, a3                /* row pass: data -> buffer */
425
        mov    a2, sp
426
        idct_rows idct_row_armv6, 2
427
        mov    a1, sp                /* column pass: buffer -> dest */
428
        ldr    a2, [sp, #128]        /* a2 = saved dest */
429
        ldr    a3, [sp, #(128+4)]    /* a3 = saved line_size */
430
        idct_rows idct_col_add_armv6, 1
431
432
        add    sp, sp, #(128+8)      /* release the buffer and drop the saved a1, a2 */
433
        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
434
        .endfunc
435
436
        .align
437
        .global ff_simple_idct_put_armv6
438 8520b634 Måns Rullgård
        .type ff_simple_idct_put_armv6, %function
439 7d42886b Måns Rullgård
        .func ff_simple_idct_put_armv6
440
/*
  ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data);

  a1 = dest, a2 = line_size, a3 = data.
  Row pass from data into a 128-byte buffer on the stack, then a column
  pass (idct_col_put_armv6) that stores the result at dest as bytes
  saturated to 0..255.  The column pass advances dest by 1 byte per
  call and uses line_size as the distance between lines.
  Saves and restores v1--v7 and fp, which the helpers overwrite.
*/
441
ff_simple_idct_put_armv6:
442
        stmfd  sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}   /* dest and line_size end up at the lowest addresses */
443
        sub    sp, sp, #128          /* 128-byte intermediate buffer */
444
445
        mov    a1, a3                /* row pass: data -> buffer */
446
        mov    a2, sp
447
        idct_rows idct_row_armv6, 2
448
        mov    a1, sp                /* column pass: buffer -> dest */
449
        ldr    a2, [sp, #128]        /* a2 = saved dest */
450
        ldr    a3, [sp, #(128+4)]    /* a3 = saved line_size */
451
        idct_rows idct_col_put_armv6, 1
452
453
        add    sp, sp, #(128+8)      /* release the buffer and drop the saved a1, a2 */
454
        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
455
        .endfunc