/*
 * idct BlackFin
 *
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
   This blackfin DSP code implements an 8x8 inverse type II DCT.

Prototype       : void ff_bfin_idct(DCTELEM *in)

Registers Used  : A0, A1, R0-R7, I0-I3, B0, B2, B3, M0-M3, L0-L3, P0-P5, LC0.

Performance     :
                Code Size   : 498 Bytes.
                Cycle Count : 417 Cycles

-----------------------------------------------------------
    FFMPEG conformance testing results
-----------------------------------------------------------

dct-test: modified with the following
            dct_error("BFINidct", 1, ff_bfin_idct, idct, test);
produces the following output

root:/u/ffmpeg/bhead/libavcodec> ./dct-test -i
ffmpeg DCT/IDCT test

      8   15   -2   21   24   17    0   10
      2  -10   -5   -5   -3    7  -14   -3
      2  -13  -10  -19   18   -6    6   -2
      9    4   16   -3    9   12   10   15
     15   -9   -2   10    1   16    0  -15
    -15    5    7    3   13    0   13   20
     -6  -15   24    9  -18    1    9  -22
     -8   25   23    2   -7    0   30   13

IDCT BFINidct: err_inf=1 err2=0.01002344 syserr=0.00150000 maxout=266 blockSumErr=64
IDCT BFINidct: 88.3 kdct/s

*/

58 |
#include "config.h" |

#include "config_bfin.h" |

61 |
/*
 * IDCT coefficient table.
 *
 * Ten 16-bit entries (20 bytes) holding cosines scaled by 2^15, e.g.
 * 0x5a82 = 23170 ~= 32768 * cos(pi/4).  With Ck = cos(k*pi/16):
 *
 *      pair 0: C4, C4
 *      pair 1: C6, C2
 *      pair 2: C7, C1
 *      pair 3: C5, C3
 *      pair 4: C7, C1      (repeat of pair 2)
 *
 * The function prologue takes the address of this table ("Pointer to
 * Coefficients") into B3 and sets L3 = 20, i.e. exactly the size of the
 * table, so that the table is addressed as a circular buffer.  The code
 * fetches one pair per `R7=[I3++]`; its comments name the loaded pairs as
 * (high,low) halves, e.g. R7=(C2,C6) for pair 1, so the first short of a
 * pair ends up in R7.L and the second in R7.H.
 * NOTE(review): the I3 = B3 setup is not in the part of the function
 * examined here -- confirm before reordering or resizing the table.
 *
 * When building FDPIC with CONFIG_SRAM the table is placed in the
 * .l1.data.B section, otherwise in .data.
 */
#if defined(__FDPIC__) && CONFIG_SRAM
.section .l1.data.B,"aw",@progbits
#else
.data
#endif

.align 4;
coefs:
.short 0x5a82;          //  C4  cos(pi/4)
.short 0x5a82;          //  C4  cos(pi/4)
.short 0x30FC;          //  C6  cos(3pi/8)
.short 0x7642;          //  C2  cos(pi/8)
.short 0x18F9;          //  C7  cos(7pi/16)
.short 0x7D8A;          //  C1  cos(pi/16)
.short 0x471D;          //  C5  cos(5pi/16)
.short 0x6A6E;          //  C3  cos(3pi/16)
.short 0x18F9;          //  C7  cos(7pi/16)
.short 0x7D8A;          //  C1  cos(pi/16)

/*
 * Scratch buffer for intermediate IDCT results.
 *
 * 256 bytes reserved with .space.  The function prologue takes its address
 * ("Pointer to Temporary matrix") into B2.
 *
 * When building FDPIC with CONFIG_SRAM the buffer gets its own section,
 * .l1.data.A; otherwise no section directive is emitted here and the buffer
 * simply follows in whatever section is current at this point.
 */
#if defined(__FDPIC__) && CONFIG_SRAM
.section .l1.data.A,"aw",@progbits
#endif

vtmp:   .space 256

/*
 * Frame-pointer-relative scratch slots, 4 bytes apart, inside the frame the
 * function opens with `link 16`.
 *
 *   TMP0  the code parks R3 here (`[TMP0]=R3`) and reloads it into R4
 *         (`R4=[TMP0]`) once the odd-part sums are done.
 *   TMP1  the code parks R2 here (`[TMP1]=R2`) and reloads it into R4
 *         (`R4 = [TMP1]`) during the transpose/write-out step.
 *   TMP2  NOTE(review): no use found in the part of the function examined
 *         here -- confirm whether it is still needed.
 *
 * Each macro must expand to a bare address expression because it is used
 * inside brackets, e.g. `[TMP0]`.
 */
#define TMP0 FP-8
#define TMP1 FP-12
#define TMP2 FP-16

.text |

DEFUN(idct,mL1, |

(DCTELEM *block)): |

95 |
/********************** Function Prologue *********************************/ |

link 16; |

[--SP] = (R7:4, P5:3); // Push the registers onto the stack. |

B0 = R0; // Pointer to Input matrix |

RELOC(R1, P3, coefs); // Pointer to Coefficients |

RELOC(R2, P3, vtmp); // Pointer to Temporary matrix |

B3 = R1; |

B2 = R2; |

L3 = 20; // L3 is used for making the coefficient array |

// circular. |

// MUST BE RESTORED TO ZERO at function exit. |

M1 = 16 (X); // All these registers are initialized for |

M3 = 8(X); // modifying address offsets. |

109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 | |

129 |
/*
 *   A1 =      Y0 * cos(pi/4)
 *   A0 =      Y0 * cos(pi/4)
 *   A1 = A1 + Y4 * cos(pi/4)
 *   A0 = A0 - Y4 * cos(pi/4)
 *  load:
 *        R1=(Y2,Y6)
 *        R7=(C2,C6)
 *  res:
 *        R3=Y0, R2=Y4
 */

140 |
A1=R7.H*R0.H, A0=R7.H*R0.H (IS) || I0+= 4 || R1.L=W[I1++]; |

141 |
R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || R1.H=W[I0--] || R7=[I3++]; |

143 |
144 | |

145 |
P2 = 112 (X); |

146 |
P1 = P1 + P2; // P1 points to element (7, 0) of temp buffer. |

147 |
P2 = -94(X); |

149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
163 |
164 |
* Y4 = Y4 + Y2. |

165 |
* Y2 = Y4 - Y2. |

166 |
* Y6 = Y0 - Y6. |

167 |
* R3 is saved |

168 |
* R6.l=Y3 |

169 |
* note: R3: Y0, R2: Y4, R1: Y2, R0: Y6 |

170 |
*/ |

171 |
R3=R3+R0, R0=R3-R0; |

172 |
R2=R2+R1, R1=R2-R1 || [TMP0]=R3 || R6.L=W[I0--]; |

173 |
/*
 *  Compute the odd portion (1,3,5,7); the even portion is done.
 *
 *  Y1 = C7 * Y1 - C1 * Y7 + C3 * Y5 - C5 * Y3.
 *  Y7 = C1 * Y1 + C7 * Y7 + C5 * Y5 + C3 * Y3.
 *  Y5 = C5 * Y1 + C3 * Y7 + C7 * Y5 - C1 * Y3.
 *  Y3 = C3 * Y1 - C5 * Y7 - C1 * Y5 - C7 * Y3.
 */

181 |
// R5=(Y1,Y7) R6=(Y5,Y3) // R7=(C1,C7) |

182 |
A1 =R7.L*R5.H, A0 =R7.H*R5.H (IS) || [TMP1]=R2 || R6.H=W[I2--]; |

183 |
A1-=R7.H*R5.L, A0+=R7.L*R5.L (IS) || I0-=4 || R7=[I3++]; |

184 |
A1+=R7.H*R6.H, A0+=R7.L*R6.H (IS) || I0+=M1; // R7=(C3,C5) |

185 |
R3 =(A1-=R7.L*R6.L), R2 =(A0+=R7.H*R6.L) (IS); |

186 |
A1 =R7.L*R5.H, A0 =R7.H*R5.H (IS) || R4=[TMP0]; |

187 |
A1+=R7.H*R5.L, A0-=R7.L*R5.L (IS) || I1+=M1 || R7=[I3++]; // R7=(C1,C7) |

188 |
A1+=R7.L*R6.H, A0-=R7.H*R6.H (IS); |

189 |
R7 =(A1-=R7.H*R6.L), R6 =(A0-=R7.L*R6.L) (IS) || I2+=M1; |

190 |
// R3=Y1, R2=Y7, R7=Y5, R6=Y3 |

191 | |

192 |
/* Transpose write column. */ |

193 |
R5.H=R4+R2 (RND12); // Y0=Y0+Y7 |

194 |
R5.L=R4-R2 (RND12) || R4 = [TMP1]; // Y7=Y7-Y0 |

195 |
R2.H=R1+R7 (RND12) || W[P0++P3]=R5.H; // Y2=Y2+Y5 st Y0 |

196 |
R2.L=R1-R7 (RND12) || W[P1++P4]=R5.L || R7=[I3++]; // Y5=Y2-Y5 st Y7 |

197 |
R5.H=R0-R3 (RND12) || W[P0++P3]=R2.H || R1.L=W[I1++]; // Y1=Y6-Y1 st Y2 |

198 |
R5.L=R0+R3 (RND12) || W[P1++P4]=R2.L || R0.H=W[I0++]; // Y6=Y6+Y1 st Y5 |

199 |
R3.H=R4-R6 (RND12) || W[P0++P3]=R5.H || R0.L=W[I2++]; // Y3=Y3-Y4 st Y1 |

200 |
R3.L=R4+R6 (RND12) || W[P1++P4]=R5.L || R1.H=W[I0++]; // Y4=Y3+Y4 st Y6 |

201 | |

202 |
/* pipeline loop start, + drain Y3, Y4 */ |

203 |
A1=R7.H*R0.H, A0=R7.H*R0.H (IS) || W[P0++P2]= R3.H || R1.H = W[I0--]; |

204 |
.1: R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || W[P1++P5]= R3.L || R7 = [I3++]; |

208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 |
236 |
238 |
LSETUP (.2, .3) LC0 = P2; // perform 8 1d idcts

P2 = 112 (X); |

P1 = P1 + P2; |

P2 = -94(X); |

243 |
.2: |

/* |

* A1 = Y2 * cos(3pi/8) |

* A0 = Y2 * cos(pi/8) |

* A1 = A1 - Y6 * cos(pi/8) |

* A0 = A0 + Y6 * cos(3pi/8) |

* R5 = (Y1,Y7) |

* R7 = (C1,C7) |

* res: |

* R1=Y2, R0=Y6 |

*/ |

A1=R7.L*R1.H, A0=R7.H*R1.H (IS) || I0+=4 || R5.H=W[I0]; |

R1=(A1-=R7.H*R1.L), R0=(A0+=R7.L*R1.L) (IS) || R5.L=W[I1--] || R7=[I3++]; |

/* |

* Y0 = Y0 + Y6. |

* Y4 = Y4 + Y2. |

* Y2 = Y4 - Y2. |

* Y6 = Y0 - Y6. |

* R3 is saved |

* R6.l=Y3 |

* note: R3: Y0, R2: Y4, R1: Y2, R0: Y6 |

*/ |

R3=R3+R0, R0=R3-R0; |

R2=R2+R1, R1=R2-R1 || [TMP0]=R3 || R6.L=W[I0--]; |

/* |

* Compute the odd portion (1,3,5,7) even is done. |

* |

* Y1 = C7 * Y1 - C1 * Y7 + C3 * Y5 - C5 * Y3. |

* Y7 = C1 * Y1 + C7 * Y7 + C5 * Y5 + C3 * Y3. |

* Y5 = C5 * Y1 + C3 * Y7 + C7 * Y5 - C1 * Y3. |

* Y3 = C3 * Y1 - C5 * Y7 - C1 * Y5 - C7 * Y3. |

*/ |

// R5=(Y1,Y7) R6=(Y5,Y3) // R7=(C1,C7) |

A1 =R7.L*R5.H, A0 =R7.H*R5.H (IS) || [TMP1]=R2 || R6.H=W[I2--]; |

A1-=R7.H*R5.L, A0+=R7.L*R5.L (IS) || I0-=4 || R7=[I3++]; |

A1+=R7.H*R6.H, A0+=R7.L*R6.H (IS) || I0+=M1; // R7=(C3,C5) |

R3 =(A1-=R7.L*R6.L), R2 =(A0+=R7.H*R6.L) (IS); |

A1 =R7.L*R5.H, A0 =R7.H*R5.H (IS) || R4=[TMP0]; |

A1+=R7.H*R5.L, A0-=R7.L*R5.L (IS) || I1+=M1 || R7=[I3++]; // R7=(C1,C7) |

A1+=R7.L*R6.H, A0-=R7.H*R6.H (IS); |

R7 =(A1-=R7.H*R6.L), R6 =(A0-=R7.L*R6.L) (IS) || I2+=M1; |

// R3=Y1, R2=Y7, R7=Y5, R6=Y3 |

286 |
/* Transpose write column. */ |

R5.H=R4+R2 (RND20); // Y0=Y0+Y7 |

R5.L=R4-R2 (RND20) || R4 = [TMP1]; // Y7=Y7-Y0 |

R2.H=R1+R7 (RND20) || W[P0++P3]=R5.H; // Y2=Y2+Y5 st Y0 |

R2.L=R1-R7 (RND20) || W[P1++P4]=R5.L || R7=[I3++]; // Y5=Y2-Y5 st Y7 |

R5.H=R0-R3 (RND20) || W[P0++P3]=R2.H || R1.L=W[I1++]; // Y1=Y6-Y1 st Y2 |

R5.L=R0+R3 (RND20) || W[P1++P4]=R2.L || R0.H=W[I0++]; // Y6=Y6+Y1 st Y5 |

R3.H=R4-R6 (RND20) || W[P0++P3]=R5.H || R0.L=W[I2++]; // Y3=Y3-Y4 st Y1 |

R3.L=R4+R6 (RND20) || W[P1++P4]=R5.L || R1.H=W[I0++]; // Y4=Y3+Y4 st Y6 |

296 |
297 |
298 |
300 |
301 |
302 |
303 |
304 |
306 |