## ffmpeg / libavcodec / bfin / idct_bfin.S @ 2912e87a

History | View | Annotate | Download (11.2 KB)

1 |
/* |
---|---|

2 |
* idct BlackFin |

3 |
* |

4 |
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com> |

5 |
* |

6 |
* This file is part of Libav. |

7 |
* |

8 |
* Libav is free software; you can redistribute it and/or |

9 |
* modify it under the terms of the GNU Lesser General Public |

10 |
* License as published by the Free Software Foundation; either |

11 |
* version 2.1 of the License, or (at your option) any later version. |

12 |
* |

13 |
* Libav is distributed in the hope that it will be useful, |

14 |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |

15 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |

16 |
* Lesser General Public License for more details. |

17 |
* |

18 |
* You should have received a copy of the GNU Lesser General Public |

19 |
* License along with Libav; if not, write to the Free Software |

20 |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |

21 |
*/ |

22 |
/* |

23 |
This blackfin DSP code implements an 8x8 inverse type II DCT. |

24 | |

25 |
Prototype : void ff_bfin_idct(DCTELEM *in) |

26 | |

27 |
Registers Used : A0, A1, R0-R7, I0-I3, B0, B2, B3, M1, M3, L0-L3, P0-P5, LC0. |

28 | |

29 |
Performance : |

30 |
Code Size : 498 Bytes. |

31 |
Cycle Count : 417 Cycles |

32 | |

33 | |

34 |
----------------------------------------------------------- |

35 |
FFMPEG conformance testing results |

36 |
----------------------------------------------------------- |

37 | |

38 |
dct-test: modified with the following |

39 |
dct_error("BFINidct", 1, ff_bfin_idct, idct, test); |

40 |
produces the following output |

41 | |

42 |
root:/u/ffmpeg/bhead/libavcodec> ./dct-test -i |

43 |
ffmpeg DCT/IDCT test |

44 | |

45 |
8 15 -2 21 24 17 0 10 |

46 |
2 -10 -5 -5 -3 7 -14 -3 |

47 |
2 -13 -10 -19 18 -6 6 -2 |

48 |
9 4 16 -3 9 12 10 15 |

49 |
15 -9 -2 10 1 16 0 -15 |

50 |
-15 5 7 3 13 0 13 20 |

51 |
-6 -15 24 9 -18 1 9 -22 |

52 |
-8 25 23 2 -7 0 30 13 |

53 |
IDCT BFINidct: err_inf=1 err2=0.01002344 syserr=0.00150000 maxout=266 blockSumErr=64 |

54 |
IDCT BFINidct: 88.3 kdct/s |

55 | |

56 |
*/ |

57 | |

58 |
#include "config.h" |

59 |
#include "config_bfin.h" |

60 | |

61 |
// Static data used by idct.
// With __FDPIC__ and CONFIG_SRAM the coefficient table is placed in
// .l1.data.B and the scratch buffer (further down) in .l1.data.A;
// otherwise both live in plain .data.
// NOTE(review): presumably the A/B split lets coefficient and data fetches
// hit different L1 banks -- confirm against the target memory map.
#if defined(__FDPIC__) && CONFIG_SRAM |

62 |
.section .l1.data.B,"aw",@progbits |

63 |
#else |

64 |
.data |

65 |
#endif |

66 | |

67 |
.align 4; |

68 |
// coefs: 10 16-bit cosine constants scaled by 2^15
// (e.g. 0x5a82 = 23170 ~= 0.7071 * 32768).
// idct fetches them two at a time with 32-bit loads through I3, first entry
// of each pair in the low half, and treats the table as a 20-byte circular
// buffer (L3 = 20). The table must therefore stay exactly 10 entries long
// and in this order.
coefs: |

69 |
.short 0x5a82; // cos(pi/4) C4 |

70 |
.short 0x5a82; // cos(pi/4) C4 |

71 |
.short 0x30FC; //cos(3pi/8) C6 |

72 |
.short 0x7642; //cos(pi/8) C2 |

73 |
.short 0x18F9; //cos(7pi/16) C7 |

74 |
.short 0x7D8A; //cos(pi/16) C1 |

75 |
.short 0x471D; //cos(5pi/16) C5 |

76 |
.short 0x6A6E; //cos(3pi/16) C3 |

77 |
.short 0x18F9; //cos(7pi/16) C7 (same pair as above, fetched a second time per iteration) |

78 |
.short 0x7D8A; //cos(pi/16) C1 |

79 | |

80 |
#if defined(__FDPIC__) && CONFIG_SRAM |

81 |
.section .l1.data.A,"aw",@progbits |

82 |
#endif |

83 | |

84 |
// vtmp: scratch buffer holding the intermediate matrix between the two
// passes of idct (idct keeps its address in B2).
vtmp: .space 256 |

85 | |

86 |
// Frame-relative 32-bit scratch slots; idct reserves the space with "link 16".
#define TMP0 FP-8 |

87 |
#define TMP1 FP-12 |

88 |
#define TMP2 FP-16 |

89 | |

90 | |

91 |
.text |

92 |
/*
 * idct: in-place 8x8 inverse DCT on a matrix of 16-bit elements.
 *
 * In : R0 = address of the matrix (kept in B0 for the whole function).
 * Out: the matrix at that address is overwritten with the result.
 *
 * Pass 1 (hardware loop .0 .. .1, 8 iterations) reads the caller's matrix
 * and stores its results, rounded with RND12, into vtmp (address in B2).
 * Pass 2 (hardware loop .2 .. .3, 8 iterations) reads vtmp and stores its
 * results, rounded with RND20, back into the caller's matrix.
 *
 * Pointer stepping:
 * - Per iteration the read pointers I0/I1/I2 advance by M1 = 16 bytes,
 *   i.e. 8 elements.
 * - P0 stores at +0, +32, +64, +96 and then steps by -94; P1 stores at
 *   +112, +80, +48, +16 and then steps by +98. Each iteration therefore
 *   fills one 16-bit column and moves both pointers 2 bytes to the right
 *   (the transpose).
 *
 * Both loops are software pipelined: the two instructions in front of each
 * LSETUP, and the last two instructions of each loop body, start the
 * even-part computation for the following iteration.
 *
 * I3 walks coefs as a 20-byte circular buffer (base B3, length L3 = 20),
 * five 32-bit loads per iteration; L3 is set back to 0 before returning.
 * R7:4 and P5:3 are saved and restored; the 16-byte frame holds TMP0..TMP2.
 */
DEFUN(idct,mL1, |

93 |
(DCTELEM *block)): |

94 | |

95 |
/********************** Function Prologue *********************************/ |

96 |
// Reserve the 16-byte frame that backs TMP0/TMP1/TMP2.
link 16; |

97 |
[--SP] = (R7:4, P5:3); // Push the registers onto the stack. |

98 |
B0 = R0; // Pointer to Input matrix |

99 |
RELOC(R1, P3, coefs); // Pointer to Coefficients |

100 |
RELOC(R2, P3, vtmp); // Pointer to Temporary matrix |

101 |
B3 = R1; |

102 |
B2 = R2; |

103 |
L3 = 20; // L3 is used for making the coefficient array |

104 |
// circular (20 bytes = the 10 .short entries of coefs). |

105 |
// MUST BE RESTORED TO ZERO at function exit. |

106 |
M1 = 16 (X); // M1 = 16 bytes: step applied to I0/I1/I2 once per iteration. |

107 |
M3 = 8(X); // M3 = 8 bytes: offset of element 4, used to set up I2. |

108 | |

109 |
I0 = B0; // I0 points to Input Element (0, 0). |

110 |
I2 = B0; // I2 points to Input Element (0, 0). |

111 |
I2 += M3 || R0.H = W[I0]; |

112 |
// Element 0 is read into R0.H; I2 now points to Element (0, 4). |

113 |
I1 = I2; // I1 starts at Element (0, 4); the += 4 below moves it to Element (0, 6). |

114 |
I1 += 4 || R0.L = W[I2++]; |

115 |
// I2 pointed at input Element (0, 4); the post-increment leaves it on Element (0, 5). |

116 |
// Element 4 is read into R0.L. |

117 |
// P2 = iteration count consumed by LSETUP below (reloaded with -94 afterwards).
P2 = 8 (X); |

118 |
// P3/P4 = +32/-32 byte steps between the stores through P0/P1.
P3 = 32 (X); |

119 |
P4 = -32 (X); |

120 |
// P5 = +98: final P1 step of an iteration (3 * -32 + 98 = +2 bytes net).
P5 = 98 (X); |

121 |
// 0x8000 is parked in TMP2 four instructions below.
// NOTE(review): nothing visible in this function reads TMP2 back -- confirm it is unused.
R7 = 0x8000(Z); |

122 |
I3 = B3; // I3 points to Coefficients |

123 |
P0 = B2; // P0 points to array Element (0, 0) of temp |

124 |
P1 = B2; |

125 |
R7 = [I3++] || [TMP2]=R7; // Coefficient C4 is read into R7.H and R7.L. |

126 |
MNOP; |

127 |
NOP; |

128 | |

129 |
/* |

130 |
* A1 = Y0 * cos(pi/4) |

131 |
* A0 = Y0 * cos(pi/4) |

132 |
* A1 = A1 + Y4 * cos(pi/4) |

133 |
* A0 = A0 - Y4 * cos(pi/4) |

134 |
* load: |

135 |
* R1=(Y2,Y6) |

136 |
* R7=(C2,C6) |

137 |
* res: |

138 |
* R3=Y0, R2=Y4 |

139 |
*/ |

140 |
A1=R7.H*R0.H, A0=R7.H*R0.H (IS) || I0+= 4 || R1.L=W[I1++]; |

141 |
R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || R1.H=W[I0--] || R7=[I3++]; |

142 | |

143 |
LSETUP (.0, .1) LC0 = P2; // perform 8 1d idcts |

144 | |

145 |
P2 = 112 (X); |

146 |
P1 = P1 + P2; // P1 points to element (7, 0) of temp buffer. |

147 |
// P2 = -94: final P0 step of an iteration (3 * 32 - 94 = +2 bytes net).
P2 = -94(X); |

148 | |

149 |
.0: |

150 |
/* |

151 |
* A1 = Y2 * cos(3pi/8) |

152 |
* A0 = Y2 * cos(pi/8) |

153 |
* A1 = A1 - Y6 * cos(pi/8) |

154 |
* A0 = A0 + Y6 * cos(3pi/8) |

155 |
* R5 = (Y1,Y7) |

156 |
* R7 = (C1,C7) |

157 |
* res: |

158 |
* R1=Y2, R0=Y6 |

159 |
*/ |

160 |
A1=R7.L*R1.H, A0=R7.H*R1.H (IS) || I0+=4 || R5.H=W[I0]; |

161 |
R1=(A1-=R7.H*R1.L), R0=(A0+=R7.L*R1.L) (IS) || R5.L=W[I1--] || R7=[I3++]; |

162 |
/* |

163 |
* Y0 = Y0 + Y6. |

164 |
* Y4 = Y4 + Y2. |

165 |
* Y2 = Y4 - Y2. |

166 |
* Y6 = Y0 - Y6. |

167 |
* R3 is saved |

168 |
* R6.l=Y3 |

169 |
* note: R3: Y0, R2: Y4, R1: Y2, R0: Y6 |

170 |
*/ |

171 |
R3=R3+R0, R0=R3-R0; |

172 |
R2=R2+R1, R1=R2-R1 || [TMP0]=R3 || R6.L=W[I0--]; |

173 |
/* |

174 |
* Compute the odd portion (1,3,5,7) even is done. |

175 |
* |

176 |
* Y1 = C7 * Y1 - C1 * Y7 + C3 * Y5 - C5 * Y3. |

177 |
* Y7 = C1 * Y1 + C7 * Y7 + C5 * Y5 + C3 * Y3. |

178 |
* Y5 = C5 * Y1 + C3 * Y7 + C7 * Y5 - C1 * Y3. |

179 |
* Y3 = C3 * Y1 - C5 * Y7 - C1 * Y5 - C7 * Y3. |

180 |
*/ |

181 |
// R5=(Y1,Y7) R6=(Y5,Y3) // R7=(C1,C7) |

182 |
A1 =R7.L*R5.H, A0 =R7.H*R5.H (IS) || [TMP1]=R2 || R6.H=W[I2--]; |

183 |
A1-=R7.H*R5.L, A0+=R7.L*R5.L (IS) || I0-=4 || R7=[I3++]; |

184 |
A1+=R7.H*R6.H, A0+=R7.L*R6.H (IS) || I0+=M1; // R7=(C3,C5); I0 steps to the next 8 elements |

185 |
R3 =(A1-=R7.L*R6.L), R2 =(A0+=R7.H*R6.L) (IS); |

186 |
A1 =R7.L*R5.H, A0 =R7.H*R5.H (IS) || R4=[TMP0]; |

187 |
A1+=R7.H*R5.L, A0-=R7.L*R5.L (IS) || I1+=M1 || R7=[I3++]; // R7=(C1,C7) |

188 |
A1+=R7.L*R6.H, A0-=R7.H*R6.H (IS); |

189 |
R7 =(A1-=R7.H*R6.L), R6 =(A0-=R7.L*R6.L) (IS) || I2+=M1; |

190 |
// R3=Y1, R2=Y7, R7=Y5, R6=Y3; R4 = even-part Y0 reloaded from TMP0 |

191 | |

192 |
// Pass-1 outputs are rounded with RND12. The parallel loads fetch the next
// iteration's inputs, and the R7 load wraps I3 back to (C4,C4).
// NOTE(review): on the 8th iteration these look-ahead loads read the
// elements that follow the 64-element matrix; their values are never
// stored -- confirm that this over-read is acceptable to callers.
/* Transpose write column. */ |

193 |
R5.H=R4+R2 (RND12); // Y0=Y0+Y7 |

194 |
R5.L=R4-R2 (RND12) || R4 = [TMP1]; // Y7=Y0-Y7; R4 = even-part Y4 from TMP1 |

195 |
R2.H=R1+R7 (RND12) || W[P0++P3]=R5.H; // Y2=Y2+Y5 st Y0 |

196 |
R2.L=R1-R7 (RND12) || W[P1++P4]=R5.L || R7=[I3++]; // Y5=Y2-Y5 st Y7 |

197 |
R5.H=R0-R3 (RND12) || W[P0++P3]=R2.H || R1.L=W[I1++]; // Y1=Y6-Y1 st Y2 |

198 |
R5.L=R0+R3 (RND12) || W[P1++P4]=R2.L || R0.H=W[I0++]; // Y6=Y6+Y1 st Y5 |

199 |
R3.H=R4-R6 (RND12) || W[P0++P3]=R5.H || R0.L=W[I2++]; // Y3=Y4-Y3 st Y1 |

200 |
R3.L=R4+R6 (RND12) || W[P1++P4]=R5.L || R1.H=W[I0++]; // Y4=Y3+Y4 st Y6 |

201 | |

202 |
// NOTE(review): the R1.H value loaded by the instruction above is replaced
// by the R1.H load below; the earlier load appears to serve only to step
// I0 -- confirm.
/* pipeline loop start, + drain Y3, Y4 */ |

203 |
A1=R7.H*R0.H, A0=R7.H*R0.H (IS) || W[P0++P2]= R3.H || R1.H = W[I0--]; |

204 |
.1: R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || W[P1++P5]= R3.L || R7 = [I3++]; |

205 | |

206 | |

207 | |

208 |
// ---- Pass 2: same computation, reading vtmp (B2) and writing the caller's matrix (B0). ----
I0 = B2; // I0 points to temp-buffer Element (0, 0), the pass-2 input |

209 |
I2 = B2; // I2 points to temp-buffer Element (0, 0) |

210 |
I2 += M3 || R0.H = W[I0]; |

211 |
// Y0 is read in R0.H; I2 now points to Element (0, 4) |

212 |
I1 = I2; // I1 starts at Element (0, 4); the += 4 below moves it to Element (0, 6) |

213 |
I1 += 4 || R0.L = W[I2++]; |

214 |
// I2 pointed at Element (0, 4); the post-increment leaves it on Element (0, 5) |

215 |
// Y4 is read in R0.L |

216 |
// P2 = iteration count for the second LSETUP (reloaded with -94 afterwards).
P2 = 8 (X); |

217 |
I3 = B3; // I3 points to Coefficients |

218 |
P0 = B0; // P0 points to array Element (0, 0) for writing |

219 |
// output |

220 |
P1 = B0; |

221 |
R7 = [I3++]; // R7.H = C4 and R7.L = C4 |

222 |
NOP; |

223 | |

224 |
/* |

225 |
* A1 = Y0 * cos(pi/4) |

226 |
* A0 = Y0 * cos(pi/4) |

227 |
* A1 = A1 + Y4 * cos(pi/4) |

228 |
* A0 = A0 - Y4 * cos(pi/4) |

229 |
* load: |

230 |
* R1=(Y2,Y6) |

231 |
* R7=(C2,C6) |

232 |
* res: |

233 |
* R3=Y0, R2=Y4 |

234 |
*/ |

235 |
A1=R7.H*R0.H, A0=R7.H*R0.H (IS) || I0+=4 || R1.L=W[I1++]; |

236 |
R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || R1.H=W[I0--] || R7=[I3++]; |

237 | |

238 |
LSETUP (.2, .3) LC0 = P2; // perform 8 1d idcts |

239 |
P2 = 112 (X); |

240 |
// P1 points to element (7, 0) of the output matrix.
P1 = P1 + P2; |

241 |
// P2 = -94: final P0 step of an iteration (3 * 32 - 94 = +2 bytes net).
P2 = -94(X); |

242 | |

243 |
.2: |

244 |
/* |

245 |
* A1 = Y2 * cos(3pi/8) |

246 |
* A0 = Y2 * cos(pi/8) |

247 |
* A1 = A1 - Y6 * cos(pi/8) |

248 |
* A0 = A0 + Y6 * cos(3pi/8) |

249 |
* R5 = (Y1,Y7) |

250 |
* R7 = (C1,C7) |

251 |
* res: |

252 |
* R1=Y2, R0=Y6 |

253 |
*/ |

254 |
A1=R7.L*R1.H, A0=R7.H*R1.H (IS) || I0+=4 || R5.H=W[I0]; |

255 |
R1=(A1-=R7.H*R1.L), R0=(A0+=R7.L*R1.L) (IS) || R5.L=W[I1--] || R7=[I3++]; |

256 |
/* |

257 |
* Y0 = Y0 + Y6. |

258 |
* Y4 = Y4 + Y2. |

259 |
* Y2 = Y4 - Y2. |

260 |
* Y6 = Y0 - Y6. |

261 |
* R3 is saved |

262 |
* R6.l=Y3 |

263 |
* note: R3: Y0, R2: Y4, R1: Y2, R0: Y6 |

264 |
*/ |

265 |
R3=R3+R0, R0=R3-R0; |

266 |
R2=R2+R1, R1=R2-R1 || [TMP0]=R3 || R6.L=W[I0--]; |

267 |
/* |

268 |
* Compute the odd portion (1,3,5,7) even is done. |

269 |
* |

270 |
* Y1 = C7 * Y1 - C1 * Y7 + C3 * Y5 - C5 * Y3. |

271 |
* Y7 = C1 * Y1 + C7 * Y7 + C5 * Y5 + C3 * Y3. |

272 |
* Y5 = C5 * Y1 + C3 * Y7 + C7 * Y5 - C1 * Y3. |

273 |
* Y3 = C3 * Y1 - C5 * Y7 - C1 * Y5 - C7 * Y3. |

274 |
*/ |

275 |
// R5=(Y1,Y7) R6=(Y5,Y3) // R7=(C1,C7) |

276 |
A1 =R7.L*R5.H, A0 =R7.H*R5.H (IS) || [TMP1]=R2 || R6.H=W[I2--]; |

277 |
A1-=R7.H*R5.L, A0+=R7.L*R5.L (IS) || I0-=4 || R7=[I3++]; |

278 |
A1+=R7.H*R6.H, A0+=R7.L*R6.H (IS) || I0+=M1; // R7=(C3,C5); I0 steps to the next 8 elements |

279 |
R3 =(A1-=R7.L*R6.L), R2 =(A0+=R7.H*R6.L) (IS); |

280 |
A1 =R7.L*R5.H, A0 =R7.H*R5.H (IS) || R4=[TMP0]; |

281 |
A1+=R7.H*R5.L, A0-=R7.L*R5.L (IS) || I1+=M1 || R7=[I3++]; // R7=(C1,C7) |

282 |
A1+=R7.L*R6.H, A0-=R7.H*R6.H (IS); |

283 |
R7 =(A1-=R7.H*R6.L), R6 =(A0-=R7.L*R6.L) (IS) || I2+=M1; |

284 |
// R3=Y1, R2=Y7, R7=Y5, R6=Y3; R4 = even-part Y0 reloaded from TMP0 |

285 | |

286 |
// Pass-2 outputs are rounded with RND20 (pass 1 used RND12). The parallel
// loads fetch the next iteration's inputs from the temp buffer.
/* Transpose write column. */ |

287 |
R5.H=R4+R2 (RND20); // Y0=Y0+Y7 |

288 |
R5.L=R4-R2 (RND20) || R4 = [TMP1]; // Y7=Y0-Y7; R4 = even-part Y4 from TMP1 |

289 |
R2.H=R1+R7 (RND20) || W[P0++P3]=R5.H; // Y2=Y2+Y5 st Y0 |

290 |
R2.L=R1-R7 (RND20) || W[P1++P4]=R5.L || R7=[I3++]; // Y5=Y2-Y5 st Y7 |

291 |
R5.H=R0-R3 (RND20) || W[P0++P3]=R2.H || R1.L=W[I1++]; // Y1=Y6-Y1 st Y2 |

292 |
R5.L=R0+R3 (RND20) || W[P1++P4]=R2.L || R0.H=W[I0++]; // Y6=Y6+Y1 st Y5 |

293 |
R3.H=R4-R6 (RND20) || W[P0++P3]=R5.H || R0.L=W[I2++]; // Y3=Y4-Y3 st Y1 |

294 |
R3.L=R4+R6 (RND20) || W[P1++P4]=R5.L || R1.H=W[I0++]; // Y4=Y3+Y4 st Y6 |

295 | |

296 |
/* pipeline loop start, + drain Y3, Y4 */ |

297 |
A1=R7.H*R0.H, A0=R7.H*R0.H (IS) || W[P0++P2]= R3.H || R1.H = W[I0--]; |

298 |
.3: R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || W[P1++P5]= R3.L || R7 = [I3++]; |

299 | |

300 |
/********************** Function Epilogue *********************************/
// Turn circular addressing on I3 back off, as required by the note in the prologue.
L3 = 0; |

301 |
// Restore the registers pushed in the prologue.
(R7:4,P5:3)=[SP++]; |

302 |
// Release the 16-byte frame.
unlink; |

303 |
RTS; |

304 |
DEFUN_END(idct) |

305 | |

306 |