Statistics
| Branch: | Revision:

ffmpeg / libavcodec / arm / simple_idct_arm.S @ 2912e87a

History | View | Annotate | Download (21.5 KB)

1 115329f1 Diego Biurrun
/*
2 bd7d1ea7 Alex Beregszaszi
 * simple_idct_arm.S
3 406792e7 Diego Biurrun
 * Copyright (C) 2002 Frederic 'dilb' Boulay
4 bd7d1ea7 Alex Beregszaszi
 *
5
 * Author: Frederic Boulay <dilb@handhelds.org>
6
 *
7 7b94177e Diego Biurrun
 * The function defined in this file is derived from the simple_idct function
8 2912e87a Mans Rullgard
 * from the libavcodec library part of the Libav project.
9 7b94177e Diego Biurrun
 *
10 2912e87a Mans Rullgard
 * This file is part of Libav.
11 b78e7197 Diego Biurrun
 *
12 2912e87a Mans Rullgard
 * Libav is free software; you can redistribute it and/or
13 f7b106cb Diego Biurrun
 * modify it under the terms of the GNU Lesser General Public
14
 * License as published by the Free Software Foundation; either
15 b78e7197 Diego Biurrun
 * version 2.1 of the License, or (at your option) any later version.
16 bd7d1ea7 Alex Beregszaszi
 *
17 2912e87a Mans Rullgard
 * Libav is distributed in the hope that it will be useful,
18 bd7d1ea7 Alex Beregszaszi
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 f7b106cb Diego Biurrun
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
 * Lesser General Public License for more details.
21 bd7d1ea7 Alex Beregszaszi
 *
22 f7b106cb Diego Biurrun
 * You should have received a copy of the GNU Lesser General Public
23 2912e87a Mans Rullgard
 * License along with Libav; if not, write to the Free Software
24 5509bffa Diego Biurrun
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 bd7d1ea7 Alex Beregszaszi
 */
26
27 c130bedc Måns Rullgård
#include "asm.S"
28
29 bd7d1ea7 Alex Beregszaszi
/* useful constants for the algorithm, they are stored in the __constant_ptr__ table at */
30
/* the end of the source code.*/
/* NOTE(review): W1..W7 look like round(cos(i*pi/16) * sqrt(2) * (1 << 14)); */
/* W4 is 16383 here rather than 16384 -- confirm this is intentional. */
31
#define W1  22725
32
#define W2  21407
33
#define W3  19266
34
#define W4  16383
35
#define W5  12873
36
#define W6  8867
37
#define W7  4520
38
/* mask selecting the most significant half-word of a 32-bit word */
#define MASK_MSHW 0xFFFF0000
39
40
/* byte offsets of the constants in the __constant_ptr__ vector (one 32-bit word each) */
41
#define offW1  0
42
#define offW2  4
43
#define offW3  8
44
#define offW4  12
45
#define offW5  16
46
#define offW6  20
47
#define offW7  24
48
#define offMASK_MSHW 28
49
50
/* right shifts applied to the results of the row pass and of the column pass */
#define ROW_SHIFT 11
51
/* 16-ROW_SHIFT: left shift that moves the bits of (x >> ROW_SHIFT) into the upper half-word */
#define ROW_SHIFT2MSHW (16-11)
52
#define COL_SHIFT 20
53
/* rounding terms added to a0 before the shifts */
#define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1) */
54
#define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */
55
56
57 bb270c08 Diego Biurrun
        .text
58 bd7d1ea7 Alex Beregszaszi
59 2ad4c241 Måns Rullgård
@@ ff_simple_idct_arm: in-place inverse DCT of the 64 int16_t coefficients at R0.
@@ A first pass walks the 8 rows (16 bytes each) from the last one to the first,
@@ a second pass walks the 8 columns (2 bytes apart) from the last one to the first.
@@ r4-r11 and lr are pushed here and popped (lr into pc) at __end_simple_idct_arm.
function ff_simple_idct_arm, export=1
60
        @@ void simple_idct_arm(int16_t *block)
61 bd7d1ea7 Alex Beregszaszi
        @@ save on the stack the registers that are needed (take all of them),
62
        @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block
63
        @@ so it must not be overwritten unless it has been saved!!
64
        @@ R12 is another scratch register, so it does not need to be saved either
65
        @@ save all the other registers
66
        stmfd sp!, {r4-r11, r14} @ R14 is also called LR
67
        @@ at this point, R0=block, other registers are free.
68
        add r14, r0, #112        @ R14=&block[8*7], better start from the last row, and decrease the value until row=0, i.e. R14=block.
69 c61e40b7 Måns Rullgård
        adr r12, __constant_ptr__ @ R12=__constant_ptr__, the vector containing the constants, probably not necessary to reserve a register for it
70 bd7d1ea7 Alex Beregszaszi
        @@ reserve 2 words on the stack; only R0 (block) is stored, sp+4 stays unused
71
        sub sp, sp, #8          @ allow 2 local variables
72
        str r0, [sp, #0]        @ save block in sp[0], R0 is reused for b0 inside the loops
73
        @@ stack status
74
        @@ sp+4   free
75
        @@ sp+0   R0  (block)
76
77
78
        @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free
79
80
81
@@ Row pass, one iteration per row; R14 is the address of the current row.
@@ The row is read as four 32-bit words. A row that is entirely zero is skipped,
@@ a row where only ROWr16[0] is non-zero goes to __almost_empty_row.
__row_loop:
82 56cc85a0 Diego Biurrun
        @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimize ldr accesses (i.e. split 32bits in 2 16bits words), at least it gives more usable registers :)
83 bd7d1ea7 Alex Beregszaszi
        ldr r1, [r14, #0]        @ R1=(int32)(R14)[0]=ROWr32[0] (relative row cast to a 32b pointer)
84
        ldr r2, [r14, #4]        @ R2=(int32)(R14)[1]=ROWr32[1]
85
        ldr r3, [r14, #8]        @ R3=ROWr32[2]
86
        ldr r4, [r14, #12]       @ R4=ROWr32[3]
87
        @@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop),
88
        @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row)
89
        @@ else follow the complete algorithm.
90
        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
91
        @@                R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
92
        orr r5, r4, r3           @ R5=R4 | R3
93
        orr r5, r5, r2           @ R5=R4 | R3 | R2
94
        orrs r6, r5, r1          @ Test R5 | R1 (the aim is to check if everything is null)
95
        beq __end_row_loop       @ whole row is zero, nothing to compute or store
96
        mov r7, r1, asr #16      @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
97
        ldrsh r6, [r14, #0]      @ R6=ROWr16[0]
98
        orrs r5, r5, r7          @ R5=R4 | R3 | R2 | R7
99
        beq __almost_empty_row   @ everything but ROWr16[0] is zero
100
101
@@ Odd part of the row transform: b0..b3 are built from ROWr16[1], [3], [5], [7].
@@ Each input is tested with TEQ first so that the conditional MLAs are skipped
@@ when it is zero. R0 (block) is overwritten here; it is reloaded from sp[0]
@@ at __end_row_loop.
__b_evaluation:
102
        @@ at this point, R0=block (temp),  R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
103
        @@     R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
104
        @@     R12=__const_ptr_, R14=&block[n]
105
        @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3
106
107
        @@ MUL16(b0, W1, row[1]);
108
        @@ MUL16(b1, W3, row[1]);
109
        @@ MUL16(b2, W5, row[1]);
110
        @@ MUL16(b3, W7, row[1]);
111
        @@ MAC16(b0, W3, row[3]);
112
        @@ MAC16(b1, -W7, row[3]);
113
        @@ MAC16(b2, -W1, row[3]);
114
        @@ MAC16(b3, -W5, row[3]);
115
        ldr r8, [r12, #offW1]    @ R8=W1
116
        mov r2, r2, asr #16      @ R2=ROWr16[3]
117
        mul r0, r8, r7           @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
118
        ldr r9, [r12, #offW3]    @ R9=W3
119
        ldr r10, [r12, #offW5]   @ R10=W5
120
        mul r1, r9, r7           @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
121
        ldr r11, [r12, #offW7]   @ R11=W7
122
        mul r5, r10, r7          @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
123
        mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
124 bb270c08 Diego Biurrun
                teq r2, #0               @ if null avoid muls
125
                mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
126 bd7d1ea7 Alex Beregszaszi
        rsbne r2, r2, #0         @ R2=-ROWr16[3]
127
        mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
128
        mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
129
        mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
130
131
        @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
132
        @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
133
        @@     R12=__const_ptr_, R14=&block[n]
134
        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
135
        @@ if (temp != 0) {}
136
        orrs r2, r3, r4          @ R2=ROWr32[2] | ROWr32[3]
137
        beq __end_b_evaluation   @ row[4..7] are all zero: b0..b3 are complete; R2=0 is tested again in __a_evaluation
138
139
        @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
140
        @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
141
        @@     R12=__const_ptr_, R14=&block[n]
142
        @@ MAC16(b0, W5, row[5]);
143
        @@ MAC16(b2, W7, row[5]);
144
        @@ MAC16(b3, W3, row[5]);
145
        @@ MAC16(b1, -W1, row[5]);
146
        @@ MAC16(b0, W7, row[7]);
147
        @@ MAC16(b2, W3, row[7]);
148
        @@ MAC16(b3, -W1, row[7]);
149
        @@ MAC16(b1, -W5, row[7]);
150
        mov r3, r3, asr #16      @ R3=ROWr16[5]
151 bb270c08 Diego Biurrun
                teq r3, #0               @ if null avoid muls
152 bd7d1ea7 Alex Beregszaszi
        mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5]=b0
153
        mov r4, r4, asr #16      @ R4=ROWr16[7]
154
        mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5]=b2
155
        mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5]=b3
156
        rsbne r3, r3, #0         @ R3=-ROWr16[5]
157
        mlane r1, r8, r3, r1     @ R1-=W1*ROWr16[5]=b1
158
        @@ R3 is free now
159 bb270c08 Diego Biurrun
                teq r4, #0               @ if null avoid muls
160 bd7d1ea7 Alex Beregszaszi
        mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7]=b0
161
        mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7]=b2
162
        rsbne r4, r4, #0         @ R4=-ROWr16[7]
163
        mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7]=b3
164
        mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7]=b1
165
        @@ R4 is free now
166
__end_b_evaluation:
167
        @@ at this point, R0=b0,  R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
168
        @@     R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
169
        @@     R12=__const_ptr_, R14=&block[n]
170
171
@@ Even part of the row transform: a0..a3 are built from ROWr16[0], [2], [4], [6].
@@ R2 still holds ROWr32[2] | ROWr32[3]; when it is zero, row[4] and row[6] are
@@ zero, so the code branches to __end_bef_a_evaluation, which finishes a0..a3
@@ from row[0] and row[2] only and jumps to __end_a_evaluation.
__a_evaluation:
172
        @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
173
        @@ a1 = a0 + W6 * row[2];
174
        @@ a2 = a0 - W6 * row[2];
175
        @@ a3 = a0 - W2 * row[2];
176
        @@ a0 = a0 + W2 * row[2];
177
        ldr r9, [r12, #offW4]    @ R9=W4
178
        mul r6, r9, r6           @ R6=W4*ROWr16[0]
179
        ldr r10, [r12, #offW6]   @ R10=W6
180
        ldrsh r4, [r14, #4]      @ R4=ROWr16[2] (a3 not defined yet)
181
        add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)
182
183
        mul r11, r10, r4         @ R11=W6*ROWr16[2]
184
        ldr r8, [r12, #offW2]    @ R8=W2
185
        sub r3, r6, r11          @ R3=a0-W6*ROWr16[2] (a2)
186
        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
187
        @@ if (temp != 0) {}
188
        teq r2, #0
189
        beq __end_bef_a_evaluation @ row[4..7] are zero: finish a1, a3, a0 out of line
190
191 bb270c08 Diego Biurrun
        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
192 bd7d1ea7 Alex Beregszaszi
        mul r11, r8, r4          @ R11=W2*ROWr16[2]
193
        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
194
        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
195
196
197
        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
198
        @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
199
        @@     R12=__const_ptr_, R14=&block[n]
200
201
202
        @@ a0 += W4*row[4]
203
        @@ a1 -= W4*row[4]
204
        @@ a2 -= W4*row[4]
205
        @@ a3 += W4*row[4]
206
        ldrsh r11, [r14, #8]     @ R11=ROWr16[4]
207 bb270c08 Diego Biurrun
                teq r11, #0              @ if null avoid muls
208 bd7d1ea7 Alex Beregszaszi
        mulne r11, r9, r11       @ R11=W4*ROWr16[4]
209
        @@ R9 is free now
210
        ldrsh r9, [r14, #12]     @ R9=ROWr16[6] (the load does not touch the flags set by the teq above)
211
        addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
212
        subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
213
        subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
214
        addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
215
        @@ W6 alone is no longer needed afterwards, R10 is reused to hold W2*ROWr16[6]
216 bb270c08 Diego Biurrun
                teq r9, #0               @ if null avoid muls
217 bd7d1ea7 Alex Beregszaszi
        mulne r11, r10, r9       @ R11=W6*ROWr16[6]
218
        addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
219
        mulne r10, r8, r9        @ R10=W2*ROWr16[6]
220
        @@ a0 += W6*row[6];
221
        @@ a3 -= W6*row[6];
222
        @@ a1 -= W2*row[6];
223
        @@ a2 += W2*row[6];
224
        subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
225
        subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
226
        addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)
227
228
@@ Output stage of the row pass: form a+b / a-b, shift right by ROW_SHIFT and
@@ store the 8 results of the row as four 32-bit words, two 16-bit values per
@@ word (lower index in the low half-word).
__end_a_evaluation:
229
        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
230
        @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
231
        @@     R12=__const_ptr_, R14=&block[n]
232
        @@ row[0] = (a0 + b0) >> ROW_SHIFT;
233
        @@ row[1] = (a1 + b1) >> ROW_SHIFT;
234
        @@ row[2] = (a2 + b2) >> ROW_SHIFT;
235
        @@ row[3] = (a3 + b3) >> ROW_SHIFT;
236
        @@ row[4] = (a3 - b3) >> ROW_SHIFT;
237
        @@ row[5] = (a2 - b2) >> ROW_SHIFT;
238
        @@ row[6] = (a1 - b1) >> ROW_SHIFT;
239
        @@ row[7] = (a0 - b0) >> ROW_SHIFT;
240
        add r8, r6, r0           @ R8=a0+b0
241
        add r9, r2, r1           @ R9=a1+b1
242
        @@ put 2 16 bits half-words in a 32bits word
243
        @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only Little Endian compliant then!!!)
244
        ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000
245
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5)
246
        mvn r11, r10             @ R11= NOT R10= 0x0000FFFF
247
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
248
        orr r8, r8, r9
249
        str r8, [r14, #0]        @ store row[0] and row[1]
250
251
        add r8, r3, r5           @ R8=a2+b2
252
        add r9, r4, r7           @ R9=a3+b3
253
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
254
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
255
        orr r8, r8, r9
256
        str r8, [r14, #4]        @ store row[2] and row[3]
257
258
        sub r8, r4, r7           @ R8=a3-b3
259
        sub r9, r3, r5           @ R9=a2-b2
260
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
261
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
262
        orr r8, r8, r9
263
        str r8, [r14, #8]        @ store row[4] and row[5]
264
265
        sub r8, r2, r1           @ R8=a1-b1
266
        sub r9, r6, r0           @ R9=a0-b0
267
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
268
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
269
        orr r8, r8, r9
270
        str r8, [r14, #12]       @ store row[6] and row[7]
271
272
        bal __end_row_loop       @ skip the __almost_empty_row special case
273
274
@@ Shortcut for a row whose only non-zero coefficient is ROWr16[0]:
@@ all 8 outputs of the row are set to (ROWr16[0] << 3) & 0xFFFF.
__almost_empty_row:
275
        @@ the row was empty, except ROWr16[0], now, management of this special case
276
        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
277
        @@                R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
278
        @@                R8 (about to be set to 0xFFFF), R9-R11 free
279
        mov r8, #0x10000         @ R8=0xFFFF (2 steps needed!) it saves a ldr call (because of delay run).
280
        sub r8, r8, #1           @ R8 is now ready.
281
        and r5, r8, r6, lsl #3   @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
282
        orr r5, r5, r5, lsl #16  @ R5=R5 | (R5<<16), same value in both half-words
283
        str r5, [r14, #0]        @ R14[0]=ROWr32[0]=R5
284
        str r5, [r14, #4]        @ R14[4]=ROWr32[1]=R5
285
        str r5, [r14, #8]        @ R14[8]=ROWr32[2]=R5
286
        str r5, [r14, #12]       @ R14[12]=ROWr32[3]=R5
287
288
@@ End of one row iteration: reload block from the stack, loop until the row at
@@ block itself has been processed, then set up R14 for the column pass.
__end_row_loop:
289
        @@ at this point, R0-R11 (free)
290
        @@     R12=__const_ptr_, R14=&block[n]
291
        ldr r0, [sp, #0]         @ R0=block
292
        teq r0, r14              @ compare current &block[8*n] to block, when block is reached, the loop is finished.
293
        sub r14, r14, #16        @ previous row; SUB without S keeps the TEQ flags for the BNE below
294
        bne __row_loop
295
296
297
298 bb270c08 Diego Biurrun
        @@ at this point, R0=block, R1-R11 (free)
299
        @@     R12=__const_ptr_, R14=&block[n]
300
        add r14, r0, #14        @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block.
301 bd7d1ea7 Alex Beregszaszi
@@ Column pass, one iteration per column; R14 is the address of the top element
@@ of the current column and successive elements are 16 bytes apart.
@@ This first part builds the odd terms b0..b3 from COLr16[8x1], [8x3], [8x5], [8x7].
@@ Unlike the row pass there is no all-zero shortcut here.
@@ (Several trailing comments below still say ROWr16; they refer to column elements.)
__col_loop:
302
303
__b_evaluation2:
304 bb270c08 Diego Biurrun
        @@ at this point, R0=block (temp),  R1-R11 (free)
305
        @@     R12=__const_ptr_, R14=&block[n]
306
        @@ proceed with b0-b3 first, followed by a0-a3
307
        @@ MUL16(b0, W1, col[8x1]);
308
        @@ MUL16(b1, W3, col[8x1]);
309
        @@ MUL16(b2, W5, col[8x1]);
310
        @@ MUL16(b3, W7, col[8x1]);
311
        @@ MAC16(b0, W3, col[8x3]);
312
        @@ MAC16(b1, -W7, col[8x3]);
313
        @@ MAC16(b2, -W1, col[8x3]);
314
        @@ MAC16(b3, -W5, col[8x3]);
315
        ldr r8, [r12, #offW1]    @ R8=W1
316
        ldrsh r7, [r14, #16]     @ R7=COLr16[8x1]
317
        mul r0, r8, r7           @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
318
        ldr r9, [r12, #offW3]    @ R9=W3
319
        ldr r10, [r12, #offW5]   @ R10=W5
320
        mul r1, r9, r7           @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
321
        ldr r11, [r12, #offW7]   @ R11=W7
322
        mul r5, r10, r7          @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
323
        ldrsh r2, [r14, #48]     @ R2=COLr16[8x3]
324
        mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
325
        teq r2, #0               @ if 0, then avoid muls
326
        mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
327
        rsbne r2, r2, #0         @ R2=-ROWr16[3]
328
        mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
329
        mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
330
        mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
331
332
        @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
333
        @@     R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
334
        @@     R12=__const_ptr_, R14=&block[n]
335
        @@ MAC16(b0, W5, col[5x8]);
336
        @@ MAC16(b2, W7, col[5x8]);
337
        @@ MAC16(b3, W3, col[5x8]);
338
        @@ MAC16(b1, -W1, col[5x8]);
339
        @@ MAC16(b0, W7, col[7x8]);
340
        @@ MAC16(b2, W3, col[7x8]);
341
        @@ MAC16(b3, -W1, col[7x8]);
342
        @@ MAC16(b1, -W5, col[7x8]);
343
        ldrsh r3, [r14, #80]     @ R3=COLr16[5x8]
344
        teq r3, #0               @ if 0 then avoid muls
345
        mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5x8]=b0
346
        mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5x8]=b2
347
        mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5x8]=b3
348
        rsbne r3, r3, #0         @ R3=-ROWr16[5x8]
349
        ldrsh r4, [r14, #112]    @ R4=COLr16[7x8]
350
        mlane r1, r8, r3, r1     @ R1-=W1*ROWr16[5x8]=b1
351
        @@ R3 is free now
352
        teq r4, #0               @ if 0 then avoid muls
353
        mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7x8]=b0
354
        mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7x8]=b2
355
        rsbne r4, r4, #0         @ R4=-ROWr16[7x8]
356
        mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7x8]=b3
357
        mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7x8]=b1
358
        @@ R4 is free now
359 bd7d1ea7 Alex Beregszaszi
__end_b_evaluation2:
360 bb270c08 Diego Biurrun
        @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
361
        @@     R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
362
        @@     R12=__const_ptr_, R14=&block[n]
363 bd7d1ea7 Alex Beregszaszi
364
@@ Even part of the column transform: a0..a3 are built from the column elements
@@ at byte offsets 0, 32, 64 and 96 from R14 (col[8x0], [8x2], [8x4], [8x6]).
@@ The "row[n]" / "ROWr16[n]" names in the comments below refer to those
@@ column elements.
__a_evaluation2:
365 bb270c08 Diego Biurrun
        @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
366
        @@ a1 = a0 + W6 * row[2];
367
        @@ a2 = a0 - W6 * row[2];
368
        @@ a3 = a0 - W2 * row[2];
369
        @@ a0 = a0 + W2 * row[2];
370
        ldrsh r6, [r14, #0]      @ R6=COLr16[8x0]
371
        ldr r9, [r12, #offW4]    @ R9=W4
372
        mul r6, r9, r6           @ R6=W4*ROWr16[0]
373
        ldr r10, [r12, #offW6]   @ R10=W6
374
        ldrsh r4, [r14, #32]     @ R4=ROWr16[2] (a3 not defined yet)
375
        add r6, r6, #COL_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0)
376
        mul r11, r10, r4         @ R11=W6*ROWr16[2]
377
        ldr r8, [r12, #offW2]    @ R8=W2
378
        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
379
        sub r3, r6, r11          @ R3=a0-W6*ROWr16[2] (a2)
380
        mul r11, r8, r4          @ R11=W2*ROWr16[2]
381
        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
382
        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
383
384
        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
385
        @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
386
        @@     R12=__const_ptr_, R14=&block[n]
387
        @@ a0 += W4*row[4]
388
        @@ a1 -= W4*row[4]
389
        @@ a2 -= W4*row[4]
390
        @@ a3 += W4*row[4]
391
        ldrsh r11, [r14, #64]    @ R11=ROWr16[4]
392
        teq r11, #0              @ if null avoid muls
393
        mulne r11, r9, r11       @ R11=W4*ROWr16[4]
394
        @@ R9 is free now
395
        addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
396
        subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
397
        subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
398
        ldrsh r9, [r14, #96]     @ R9=ROWr16[6] (the load does not touch the flags set by the teq above)
399
        addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
400
        @@ W6 alone is no longer needed afterwards, R10 is reused to hold W2*ROWr16[6]
401
        teq r9, #0               @ if null avoid muls
402
        mulne r11, r10, r9       @ R11=W6*ROWr16[6]
403
        addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
404
        mulne r10, r8, r9        @ R10=W2*ROWr16[6]
405
        @@ a0 += W6*row[6];
406
        @@ a3 -= W6*row[6];
407
        @@ a1 -= W2*row[6];
408
        @@ a2 += W2*row[6];
409
        subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
410
        subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
411
        addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)
412 bd7d1ea7 Alex Beregszaszi
@@ Output stage of the column pass: form a+b / a-b, shift right by COL_SHIFT and
@@ store each result as a half-word back into the column (16 bytes between elements).
__end_a_evaluation2:
413 bb270c08 Diego Biurrun
        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
414
        @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
415
        @@     R12=__const_ptr_, R14=&block[n]
416
        @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
417
        @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
418
        @@ col[16] = ((a2 + b2) >> COL_SHIFT);
419
        @@ col[24] = ((a3 + b3) >> COL_SHIFT);
420
        @@ col[32] = ((a3 - b3) >> COL_SHIFT);
421
        @@ col[40] = ((a2 - b2) >> COL_SHIFT);
422
        @@ col[48] = ((a1 - b1) >> COL_SHIFT);
423
        @@ col[56] = ((a0 - b0) >> COL_SHIFT);
424 56cc85a0 Diego Biurrun
        @@@@@ no optimization here @@@@@
425 bb270c08 Diego Biurrun
        add r8, r6, r0           @ R8=a0+b0
426
        add r9, r2, r1           @ R9=a1+b1
427
        mov r8, r8, asr #COL_SHIFT
428
        mov r9, r9, asr #COL_SHIFT
429
        strh r8, [r14, #0]       @ col[0]
430
        strh r9, [r14, #16]      @ col[8]
431
        add r8, r3, r5           @ R8=a2+b2
432
        add r9, r4, r7           @ R9=a3+b3
433
        mov r8, r8, asr #COL_SHIFT
434
        mov r9, r9, asr #COL_SHIFT
435
        strh r8, [r14, #32]      @ col[16]
436
        strh r9, [r14, #48]      @ col[24]
437
        sub r8, r4, r7           @ R8=a3-b3
438
        sub r9, r3, r5           @ R9=a2-b2
439
        mov r8, r8, asr #COL_SHIFT
440
        mov r9, r9, asr #COL_SHIFT
441
        strh r8, [r14, #64]      @ col[32]
442
        strh r9, [r14, #80]      @ col[40]
443
        sub r8, r2, r1           @ R8=a1-b1
444
        sub r9, r6, r0           @ R9=a0-b0
445
        mov r8, r8, asr #COL_SHIFT
446
        mov r9, r9, asr #COL_SHIFT
447
        strh r8, [r14, #96]      @ col[48]
448
        strh r9, [r14, #112]     @ col[56]
449 bd7d1ea7 Alex Beregszaszi
450
@@ End of one column iteration: reload block from the stack and loop until the
@@ column starting at block itself has been processed.
__end_col_loop:
451 bb270c08 Diego Biurrun
        @@ at this point, R0-R11 (free)
452
        @@     R12=__const_ptr_, R14=&block[n]
453
        ldr r0, [sp, #0]         @ R0=block
454
        teq r0, r14              @ compare current &block[n] to block, when block is reached, the loop is finished.
455
        sub r14, r14, #2         @ previous column; SUB without S keeps the TEQ flags for the BNE below
456
        bne __col_loop
457 bd7d1ea7 Alex Beregszaszi
458
459
460
461 2ad4c241 Måns Rullgård
@@ Function exit: drop the 8 bytes of locals, then pop the registers pushed on
@@ entry; the saved LR value is loaded into PC (r15), which returns to the caller.
__end_simple_idct_arm:
462 bd7d1ea7 Alex Beregszaszi
        @@ restore registers to previous status!
463
        add sp, sp, #8 @@ the local variables!
464
        ldmfd sp!, {r4-r11, r15} @@ update PC with LR content.
465
466
467
468
@@ kind of sub-function, kept out of line so as not to burden the common case.
@@ Reached from __a_evaluation when ROWr32[2] | ROWr32[3] is zero: a1, a3 and a0
@@ are finished from row[0] and row[2] only (the row[4]/row[6] terms are skipped),
@@ then control goes back to __end_a_evaluation.
@@ On entry: R6=a0 (W4*row[0] + rounding), R11=W6*ROWr16[2], R8=W2, R4=ROWr16[2], R3=a2.
469
__end_bef_a_evaluation:
470 bb270c08 Diego Biurrun
        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
471 bd7d1ea7 Alex Beregszaszi
        mul r11, r8, r4          @ R11=W2*ROWr16[2]
472
        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
473
        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
474 bb270c08 Diego Biurrun
        bal __end_a_evaluation
475 bd7d1ea7 Alex Beregszaszi
476
477
@@ Constant table read through R12 with the offW1..offW7 / offMASK_MSHW byte
@@ offsets; the order of the words below must match those offsets (4 bytes each).
@@ NOTE(review): the label sits before .align, so it names the first word only
@@ if the location counter is already aligned at this point -- confirm.
__constant_ptr__:  @@ see #defines at the beginning of the source code for values.
478 bb270c08 Diego Biurrun
        .align
479 bd7d1ea7 Alex Beregszaszi
        .word   W1
480
        .word   W2
481
        .word   W3
482
        .word   W4
483
        .word   W5
484
        .word   W6
485
        .word   W7
486
        .word   MASK_MSHW