/*
 * simple_idct_arm.S
 * Copyright (C) 2002 Frederic 'dilb' Boulay.
 * All Rights Reserved.
 *
 * Author: Frederic Boulay <dilb@handhelds.org>
 *
 * You can redistribute this file and/or modify
 * it under the terms of the GNU General Public License (version 2)
 * as published by the Free Software Foundation.
 *
 * This file is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 *
 * The function defined in this file is derived from the simple_idct function
 * from the libavcodec library, part of the ffmpeg project.
 */

/* Useful constants for the algorithm; they are saved in __constant_ptr__ at */
/* the end of the source code. */
#define W1  22725
#define W2  21407
#define W3  19266
#define W4  16383
#define W5  12873
#define W6  8867
#define W7  4520
#define MASK_MSHW 0xFFFF0000
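
/* Note (an observation, not from the original source): each Wk matches the
 * simple_idct fixed-point cosine table round(sqrt(2) * cos(k*PI/16) * (1<<14)),
 * e.g. W1 = round(1.414214 * 0.980785 * 16384) = 22725; W4 is kept at 16383
 * rather than the exact 16384. */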

/* offsets of the constants in the vector */
#define offW1  0
#define offW2  4
#define offW3  8
#define offW4  12
#define offW5  16
#define offW6  20
#define offW7  24
#define offMASK_MSHW 28

#define ROW_SHIFT 11
#define ROW_SHIFT2MSHW (16-11)
#define COL_SHIFT 20
#define ROW_SHIFTED_1 1024   /* 1 << (ROW_SHIFT-1) */
#define COL_SHIFTED_1 524288 /* 1 << (COL_SHIFT-1) */
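
/* Note (an observation, not from the original source): ROW_SHIFTED_1 and
 * COL_SHIFTED_1 are the rounding terms added before the final arithmetic
 * shifts, i.e. (x + (1 << (SHIFT-1))) >> SHIFT rounds to nearest.
 * ROW_SHIFT2MSHW exists because ">> ROW_SHIFT then << 16" (moving a rounded
 * row result into the upper halfword) collapses into a single
 * "<< (16 - ROW_SHIFT)" once the low bits are masked off. */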


        .text
        .align
        .global simple_idct_ARM

simple_idct_ARM:
        @@ void simple_idct_ARM(int16_t *block)
        @@ save on the stack the registers we need (take all of them);
        @@ R0-R3 are scratch regs, so there is no need to save them, but R0 contains the pointer to block,
        @@ so it must not be overwritten if it is not saved!!
        @@ R12 is another scratch register, so it need not be saved either
        @@ save all registers
        stmfd sp!, {r4-r11, r14} @ R14 is also called LR
        @@ at this point, R0=block, other registers are free.
        add r14, r0, #112        @ R14=&block[8*7], better to start from the last row and decrease the value until row=0, i.e. R14=block.
        add r12, pc, #(__constant_ptr__-.-8) @ R12=__constant_ptr__, the vector containing the constants, probably not necessary to reserve a register for it
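        @@ (note, not from the original source: in ARM state, reading PC
        @@ yields the address of the current instruction plus 8, hence the
        @@ "-.-8" correction in the PC-relative address of the constant pool)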
        @@ reserve space for 2 temporary variables on the stack
        sub sp, sp, #8          @ allow 2 local variables
        str r0, [sp, #0]        @ save block in sp[0]
        @@ stack status
        @@ sp+4   free
        @@ sp+0   R0  (block)


        @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free


__row_loop:
        @@ read the row and check whether it is null, almost null, or neither; according to the
        @@ StrongARM specs it is not necessary to optimise the ldr accesses (i.e. to split the
        @@ 32-bit loads into 2 16-bit words), and at least this way gives more usable registers :)
        ldr r1, [r14, #0]        @ R1=(int32)(R14)[0]=ROWr32[0] (relative row cast to a 32b pointer)
        ldr r2, [r14, #4]        @ R2=(int32)(R14)[1]=ROWr32[1]
        ldr r3, [r14, #8]        @ R3=ROWr32[2]
        ldr r4, [r14, #12]       @ R4=ROWr32[3]
        @@ check if the words are null; if all of them are null, proceed with the next row (branch __end_row_loop),
        @@ if ROWr16[0] is the only one not null, proceed with this special case (branch __almost_empty_row),
        @@ else follow the complete algorithm.
        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
        @@                R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
        orr r5, r4, r3           @ R5=R4 | R3
        orr r5, r5, r2           @ R5=R4 | R3 | R2
        orrs r6, r5, r1          @ Test R5 | R1 (the aim is to check if everything is null)
        beq __end_row_loop
        mov r7, r1, asr #16      @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
        ldrsh r6, [r14, #0]      @ R6=ROWr16[0]
        orrs r5, r5, r7          @ R5=R4 | R3 | R2 | R7
        beq __almost_empty_row
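        @@ (sketch, not from the original source: in C terms the dispatch above is roughly
        @@     if (!(row[0]|row[1]|row[2]|row[3]|row[4]|row[5]|row[6]|row[7])) goto end_row_loop;
        @@     if (!(row[1]|row[2]|row[3]|row[4]|row[5]|row[6]|row[7]))        goto almost_empty_row;
        @@  where the row is tested as packed 32-bit words)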

__b_evaluation:
        @@ at this point, R0=block (temp),  R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
        @@     R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
        @@     R12=__const_ptr_, R14=&block[n]
        @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3

        @@ MUL16(b0, W1, row[1]);
        @@ MUL16(b1, W3, row[1]);
        @@ MUL16(b2, W5, row[1]);
        @@ MUL16(b3, W7, row[1]);
        @@ MAC16(b0, W3, row[3]);
        @@ MAC16(b1, -W7, row[3]);
        @@ MAC16(b2, -W1, row[3]);
        @@ MAC16(b3, -W5, row[3]);
        ldr r8, [r12, #offW1]    @ R8=W1
        mov r2, r2, asr #16      @ R2=ROWr16[3]
        mul r0, r8, r7           @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
        ldr r9, [r12, #offW3]    @ R9=W3
        ldr r10, [r12, #offW5]   @ R10=W5
        mul r1, r9, r7           @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
        ldr r11, [r12, #offW7]   @ R11=W7
        mul r5, r10, r7          @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
        mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
        teq r2, #0               @ if null, avoid muls
        mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
        rsbne r2, r2, #0         @ R2=-ROWr16[3]
        mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
        mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
        mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
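        @@ (note, not from the original source: ARM has no multiply-subtract
        @@ instruction, so the -W7/-W1/-W5 accumulations are done by negating
        @@ ROWr16[3] once with rsbne and then using plain mlane adds)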

        @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
        @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
        @@     R12=__const_ptr_, R14=&block[n]
        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
        @@ if (temp != 0) {}
        orrs r2, r3, r4          @ R2=ROWr32[2] | ROWr32[3]
        beq __end_b_evaluation

        @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
        @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
        @@     R12=__const_ptr_, R14=&block[n]
        @@ MAC16(b0, W5, row[5]);
        @@ MAC16(b2, W7, row[5]);
        @@ MAC16(b3, W3, row[5]);
        @@ MAC16(b1, -W1, row[5]);
        @@ MAC16(b0, W7, row[7]);
        @@ MAC16(b2, W3, row[7]);
        @@ MAC16(b3, -W1, row[7]);
        @@ MAC16(b1, -W5, row[7]);
        mov r3, r3, asr #16      @ R3=ROWr16[5]
        teq r3, #0               @ if null, avoid muls
        mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5]=b0
        mov r4, r4, asr #16      @ R4=ROWr16[7]
        mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5]=b2
        mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5]=b3
        rsbne r3, r3, #0         @ R3=-ROWr16[5]
        mlane r1, r8, r3, r1     @ R1-=W1*ROWr16[5]=b1
        @@ R3 is free now
        teq r4, #0               @ if null, avoid muls
        mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7]=b0
        mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7]=b2
        rsbne r4, r4, #0         @ R4=-ROWr16[7]
        mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7]=b3
        mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7]=b1
        @@ R4 is free now
__end_b_evaluation:
        @@ at this point, R0=b0,  R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
        @@     R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@     R12=__const_ptr_, R14=&block[n]

__a_evaluation:
        @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
        @@ a1 = a0 + W6 * row[2];
        @@ a2 = a0 - W6 * row[2];
        @@ a3 = a0 - W2 * row[2];
        @@ a0 = a0 + W2 * row[2];
        ldr r9, [r12, #offW4]    @ R9=W4
        mul r6, r9, r6           @ R6=W4*ROWr16[0]
        ldr r10, [r12, #offW6]   @ R10=W6
        ldrsh r4, [r14, #4]      @ R4=ROWr16[2] (a3 not defined yet)
        add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)

        mul r11, r10, r4         @ R11=W6*ROWr16[2]
        ldr r8, [r12, #offW2]    @ R8=W2
        sub r3, r6, r11          @ R3=a0-W6*ROWr16[2] (a2)
        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
        @@ if (temp != 0) {}
        teq r2, #0
        beq __end_bef_a_evaluation

        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
        mul r11, r8, r4          @ R11=W2*ROWr16[2]
        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)


        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
        @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
        @@     R12=__const_ptr_, R14=&block[n]


        @@ a0 += W4*row[4]
        @@ a1 -= W4*row[4]
        @@ a2 -= W4*row[4]
        @@ a3 += W4*row[4]
        ldrsh r11, [r14, #8]     @ R11=ROWr16[4]
        teq r11, #0              @ if null, avoid muls
        mulne r11, r9, r11       @ R11=W4*ROWr16[4]
        @@ R9 is free now
        ldrsh r9, [r14, #12]     @ R9=ROWr16[6]
        addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
        subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
        subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
        addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
        @@ W6 alone is no longer useful, so save W2*ROWr16[6] in its register (R10) instead
        teq r9, #0               @ if null, avoid muls
        mulne r11, r10, r9       @ R11=W6*ROWr16[6]
        addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
        mulne r10, r8, r9        @ R10=W2*ROWr16[6]
        @@ a0 += W6*row[6];
        @@ a3 -= W6*row[6];
        @@ a1 -= W2*row[6];
        @@ a2 += W2*row[6];
        subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
        subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
        addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)

__end_a_evaluation:
        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
        @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@     R12=__const_ptr_, R14=&block[n]
        @@ row[0] = (a0 + b0) >> ROW_SHIFT;
        @@ row[1] = (a1 + b1) >> ROW_SHIFT;
        @@ row[2] = (a2 + b2) >> ROW_SHIFT;
        @@ row[3] = (a3 + b3) >> ROW_SHIFT;
        @@ row[4] = (a3 - b3) >> ROW_SHIFT;
        @@ row[5] = (a2 - b2) >> ROW_SHIFT;
        @@ row[6] = (a1 - b1) >> ROW_SHIFT;
        @@ row[7] = (a0 - b0) >> ROW_SHIFT;
        add r8, r6, r0           @ R8=a0+b0
        add r9, r2, r1           @ R9=a1+b1
        @@ pack two 16-bit halfwords into one 32-bit word
        @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (hence only little-endian compliant!!!)
        ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5)
        mvn r11, r10             @ R11= NOT R10= 0x0000FFFF
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
        orr r8, r8, r9
        str r8, [r14, #0]
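        @@ (worked example, not from the original source: for the high half,
        @@ ((a1+b1) >> 11) << 16 equals (a1+b1) << 5 once masked with
        @@ 0xFFFF0000, which is exactly the lsl #ROW_SHIFT2MSHW above; this
        @@ saves one shift per pair of results)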

        add r8, r3, r5           @ R8=a2+b2
        add r9, r4, r7           @ R9=a3+b3
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
        orr r8, r8, r9
        str r8, [r14, #4]

        sub r8, r4, r7           @ R8=a3-b3
        sub r9, r3, r5           @ R9=a2-b2
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
        orr r8, r8, r9
        str r8, [r14, #8]

        sub r8, r2, r1           @ R8=a1-b1
        sub r9, r6, r0           @ R9=a0-b0
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
        orr r8, r8, r9
        str r8, [r14, #12]

        bal __end_row_loop

__almost_empty_row:
        @@ the row is empty except for ROWr16[0]; handle this special case
        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
        @@                R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
        @@                R8=0xFFFF (temp), R9-R11 free
        mov r8, #0x10000         @ R8=0xFFFF (2 steps needed!), which saves an ldr (and its load delay)
        sub r8, r8, #1           @ R8 is now ready.
        and r5, r8, r6, lsl #3   @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
        orr r5, r5, r5, lsl #16  @ R5=R5 | (R5<<16)
        str r5, [r14, #0]        @ R14[0]=ROWr32[0]=R5
        str r5, [r14, #4]        @ R14[4]=ROWr32[1]=R5
        str r5, [r14, #8]        @ R14[8]=ROWr32[2]=R5
        str r5, [r14, #12]       @ R14[12]=ROWr32[3]=R5
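        @@ (note, not from the original source: for a DC-only row every output
        @@ is (W4*row[0] + ROW_SHIFTED_1) >> ROW_SHIFT, and since W4 is close
        @@ to 1<<14 and ROW_SHIFT is 11, this reduces to row[0] << 3; the
        @@ shift-and-duplicate above fills all 8 outputs without a single mul)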
__end_row_loop:
        @@ at this point, R0-R11 (free)
        @@     R12=__const_ptr_, R14=&block[n]
        ldr r0, [sp, #0]         @ R0=block
        teq r0, r14              @ compare the current row pointer &block[8*n] with block; once block is reached, the loop is finished.
        sub r14, r14, #16        @ point R14 at the previous row
        bne __row_loop
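        @@ (note, not from the original source: teq is placed before the sub
        @@ because sub without the s suffix does not update the flags; the
        @@ comparison therefore tests the row just processed, the pointer is
        @@ decremented either way, and the loop exits right after row 0)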



        @@ at this point, R0=block, R1-R11 (free)
        @@     R12=__const_ptr_, R14=&block[n]
        add r14, r0, #14        @ R14=&block[7], better to start from the last col and decrease the value until col=0, i.e. R14=block.
__col_loop:

__b_evaluation2:
        @@ at this point, R0=block (temp),  R1-R11 (free)
        @@     R12=__const_ptr_, R14=&block[n]
        @@ proceed with b0-b3 first, followed by a0-a3
        @@ MUL16(b0, W1, col[8x1]);
        @@ MUL16(b1, W3, col[8x1]);
        @@ MUL16(b2, W5, col[8x1]);
        @@ MUL16(b3, W7, col[8x1]);
        @@ MAC16(b0, W3, col[8x3]);
        @@ MAC16(b1, -W7, col[8x3]);
        @@ MAC16(b2, -W1, col[8x3]);
        @@ MAC16(b3, -W5, col[8x3]);
        ldr r8, [r12, #offW1]    @ R8=W1
        ldrsh r7, [r14, #16]     @ R7=COLr16[8x1]
        mul r0, r8, r7           @ R0=W1*COLr16[8x1]=b0 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
        ldr r9, [r12, #offW3]    @ R9=W3
        ldr r10, [r12, #offW5]   @ R10=W5
        mul r1, r9, r7           @ R1=W3*COLr16[8x1]=b1 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
        ldr r11, [r12, #offW7]   @ R11=W7
        mul r5, r10, r7          @ R5=W5*COLr16[8x1]=b2 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
        ldrsh r2, [r14, #48]     @ R2=COLr16[8x3]
        mul r7, r11, r7          @ R7=W7*COLr16[8x1]=b3 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
        teq r2, #0               @ if 0, then avoid muls
        mlane r0, r9, r2, r0     @ R0+=W3*COLr16[8x3]=b0 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
        rsbne r2, r2, #0         @ R2=-COLr16[8x3]
        mlane r1, r11, r2, r1    @ R1-=W7*COLr16[8x3]=b1 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
        mlane r5, r8, r2, r5     @ R5-=W1*COLr16[8x3]=b2 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
        mlane r7, r10, r2, r7    @ R7-=W5*COLr16[8x3]=b3 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)

        @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
        @@     R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
        @@     R12=__const_ptr_, R14=&block[n]
        @@ MAC16(b0, W5, col[5x8]);
        @@ MAC16(b2, W7, col[5x8]);
        @@ MAC16(b3, W3, col[5x8]);
        @@ MAC16(b1, -W1, col[5x8]);
        @@ MAC16(b0, W7, col[7x8]);
        @@ MAC16(b2, W3, col[7x8]);
        @@ MAC16(b3, -W1, col[7x8]);
        @@ MAC16(b1, -W5, col[7x8]);
        ldrsh r3, [r14, #80]     @ R3=COLr16[5x8]
        teq r3, #0               @ if 0 then avoid muls
        mlane r0, r10, r3, r0    @ R0+=W5*COLr16[5x8]=b0
        mlane r5, r11, r3, r5    @ R5+=W7*COLr16[5x8]=b2
        mlane r7, r9, r3, r7     @ R7+=W3*COLr16[5x8]=b3
        rsbne r3, r3, #0         @ R3=-COLr16[5x8]
        ldrsh r4, [r14, #112]    @ R4=COLr16[7x8]
        mlane r1, r8, r3, r1     @ R1-=W1*COLr16[5x8]=b1
        @@ R3 is free now
        teq r4, #0               @ if 0 then avoid muls
        mlane r0, r11, r4, r0    @ R0+=W7*COLr16[7x8]=b0
        mlane r5, r9, r4, r5     @ R5+=W3*COLr16[7x8]=b2
        rsbne r4, r4, #0         @ R4=-COLr16[7x8]
        mlane r7, r8, r4, r7     @ R7-=W1*COLr16[7x8]=b3
        mlane r1, r10, r4, r1    @ R1-=W5*COLr16[7x8]=b1
        @@ R4 is free now
__end_b_evaluation2:
        @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
        @@     R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@     R12=__const_ptr_, R14=&block[n]

__a_evaluation2:
        @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
        @@ a1 = a0 + W6 * col[8x2];
        @@ a2 = a0 - W6 * col[8x2];
        @@ a3 = a0 - W2 * col[8x2];
        @@ a0 = a0 + W2 * col[8x2];
        ldrsh r6, [r14, #0]      @ R6=COLr16[8x0]
        ldr r9, [r12, #offW4]    @ R9=W4
        mul r6, r9, r6           @ R6=W4*COLr16[8x0]
        ldr r10, [r12, #offW6]   @ R10=W6
        ldrsh r4, [r14, #32]     @ R4=COLr16[8x2] (a3 not defined yet)
        add r6, r6, #COL_SHIFTED_1 @ R6=W4*COLr16[8x0] + 1<<(COL_SHIFT-1) (a0)
        mul r11, r10, r4         @ R11=W6*COLr16[8x2]
        ldr r8, [r12, #offW2]    @ R8=W2
        add r2, r6, r11          @ R2=a0+W6*COLr16[8x2] (a1)
        sub r3, r6, r11          @ R3=a0-W6*COLr16[8x2] (a2)
        mul r11, r8, r4          @ R11=W2*COLr16[8x2]
        sub r4, r6, r11          @ R4=a0-W2*COLr16[8x2] (a3)
        add r6, r6, r11          @ R6=a0+W2*COLr16[8x2] (a0)

        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
        @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
        @@     R12=__const_ptr_, R14=&block[n]
        @@ a0 += W4*col[8x4]
        @@ a1 -= W4*col[8x4]
        @@ a2 -= W4*col[8x4]
        @@ a3 += W4*col[8x4]
        ldrsh r11, [r14, #64]    @ R11=COLr16[8x4]
        teq r11, #0              @ if null, avoid muls
        mulne r11, r9, r11       @ R11=W4*COLr16[8x4]
        @@ R9 is free now
        addne r6, r6, r11        @ R6+=W4*COLr16[8x4] (a0)
        subne r2, r2, r11        @ R2-=W4*COLr16[8x4] (a1)
        subne r3, r3, r11        @ R3-=W4*COLr16[8x4] (a2)
        ldrsh r9, [r14, #96]     @ R9=COLr16[8x6]
        addne r4, r4, r11        @ R4+=W4*COLr16[8x4] (a3)
        @@ W6 alone is no longer useful, so save W2*COLr16[8x6] in its register (R10) instead
        teq r9, #0               @ if null, avoid muls
        mulne r11, r10, r9       @ R11=W6*COLr16[8x6]
        addne r6, r6, r11        @ R6+=W6*COLr16[8x6] (a0)
        mulne r10, r8, r9        @ R10=W2*COLr16[8x6]
        @@ a0 += W6*col[8x6];
        @@ a3 -= W6*col[8x6];
        @@ a1 -= W2*col[8x6];
        @@ a2 += W2*col[8x6];
        subne r4, r4, r11        @ R4-=W6*COLr16[8x6] (a3)
        subne r2, r2, r10        @ R2-=W2*COLr16[8x6] (a1)
        addne r3, r3, r10        @ R3+=W2*COLr16[8x6] (a2)
__end_a_evaluation2:
        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
        @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@     R12=__const_ptr_, R14=&block[n]
        @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
        @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
        @@ col[16] = ((a2 + b2) >> COL_SHIFT);
        @@ col[24] = ((a3 + b3) >> COL_SHIFT);
        @@ col[32] = ((a3 - b3) >> COL_SHIFT);
        @@ col[40] = ((a2 - b2) >> COL_SHIFT);
        @@ col[48] = ((a1 - b1) >> COL_SHIFT);
        @@ col[56] = ((a0 - b0) >> COL_SHIFT);
        @@@@@ no optimisation here @@@@@
        add r8, r6, r0           @ R8=a0+b0
        add r9, r2, r1           @ R9=a1+b1
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #0]
        strh r9, [r14, #16]
        add r8, r3, r5           @ R8=a2+b2
        add r9, r4, r7           @ R9=a3+b3
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #32]
        strh r9, [r14, #48]
        sub r8, r4, r7           @ R8=a3-b3
        sub r9, r3, r5           @ R9=a2-b2
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #64]
        strh r9, [r14, #80]
        sub r8, r2, r1           @ R8=a1-b1
        sub r9, r6, r0           @ R9=a0-b0
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #96]
        strh r9, [r14, #112]
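        @@ (note, not from the original source: the column pass walks down one
        @@ column with a byte stride of 16 = 8 int16_t elements, so offset
        @@ #16*k addresses col[8k]; results are stored as halfwords once
        @@ >> COL_SHIFT has removed the remaining fixed-point scaling)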

__end_col_loop:
        @@ at this point, R0-R11 (free)
        @@     R12=__const_ptr_, R14=&block[n]
        ldr r0, [sp, #0]         @ R0=block
        teq r0, r14              @ compare the current column pointer &block[n] with block; once block is reached, the loop is finished.
        sub r14, r14, #2         @ point R14 at the previous column
        bne __col_loop




__end_simple_idct_ARM:
        @@ restore the registers to their previous state!
        add sp, sp, #8 @@ pop the local variables!
        ldmfd sp!, {r4-r11, r15} @@ update PC with the LR content.
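        @@ (note, not from the original source: popping the saved LR directly
        @@ into r15/PC performs the return in the same instruction as the
        @@ register restore; fine on ARMv4, where no interworking is needed)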


@@ a kind of sub-function, kept out of line so as not to burden the common case.
__end_bef_a_evaluation:
        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
        mul r11, r8, r4          @ R11=W2*ROWr16[2]
        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
        bal __end_a_evaluation
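        @@ (note, not from the original source: this block completes a0/a1/a3
        @@ for the early-out path taken when ROWr32[2]|ROWr32[3] is zero, i.e.
        @@ when row[4..7] are all null and the middle of __a_evaluation is
        @@ skipped, then branches back to the common tail)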


__constant_ptr__:  @@ see the #defines at the beginning of the source code for the values.
        .align
        .word   W1
        .word   W2
        .word   W3
        .word   W4
        .word   W5
        .word   W6
        .word   W7
        .word   MASK_MSHW