## ffmpeg / libavcodec / sh4 / idct_sh4.c @ b6204677

History | View | Annotate | Download (9.73 KB)

1 | 0c6bd2ea | BERO | ```
/*
``` |
---|---|---|---|

2 | ```
* idct for sh4
``` |
||

3 | ```
*
``` |
||

4 | ```
* Copyright (c) 2001-2003 BERO <bero@geocities.co.jp>
``` |
||

5 | ```
*
``` |
||

6 | ```
* This library is free software; you can redistribute it and/or
``` |
||

7 | ```
* modify it under the terms of the GNU Lesser General Public
``` |
||

8 | ```
* License as published by the Free Software Foundation; either
``` |
||

9 | ```
* version 2 of the License, or (at your option) any later version.
``` |
||

10 | ```
*
``` |
||

11 | ```
* This library is distributed in the hope that it will be useful,
``` |
||

12 | ```
* but WITHOUT ANY WARRANTY; without even the implied warranty of
``` |
||

13 | ```
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
``` |
||

14 | ```
* Lesser General Public License for more details.
``` |
||

15 | ```
*
``` |
||

16 | ```
* You should have received a copy of the GNU Lesser General Public
``` |
||

17 | ```
* License along with this library; if not, write to the Free Software
``` |
||

18 | ```
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
``` |
||

19 | ```
*/
``` |
||

20 | |||

21 | #include "../dsputil.h" |
||

/*
 * Cosine constants used to build the two 4x4 transform matrices below.
 * They are only needed for the initialisers and are #undef'd afterwards.
 */
#define c1      1.38703984532214752434  /* sqrt(2)*cos(1*pi/16) */
#define c2      1.30656296487637657577  /* sqrt(2)*cos(2*pi/16) */
#define c3      1.17587560241935884520  /* sqrt(2)*cos(3*pi/16) */
#define c4      1.00000000000000000000  /* sqrt(2)*cos(4*pi/16) */
#define c5      0.78569495838710234903  /* sqrt(2)*cos(5*pi/16) */
#define c6      0.54119610014619712324  /* sqrt(2)*cos(6*pi/16) */
#define c7      0.27589937928294311353  /* sqrt(2)*cos(7*pi/16) */

/*
 * Matrix applied to the even-indexed inputs (0, 2, 4, 6) of a row/column.
 * Sixteen floats as consumed by load_matrix(); each group of four
 * consecutive entries holds the weights one ftrv() input contributes to
 * the four outputs.
 * The 8-byte alignment is presumably required by the 64-bit loads in the
 * SH4 load_matrix() -- confirm against the SH-4 manual before relaxing it.
 */
static const float even_table[16] __attribute__ ((aligned(8))) = {
         c4, c4, c4, c4,
         c2, c6,-c6,-c2,
         c4,-c4,-c4, c4,
         c6,-c2, c2,-c6
};

/*
 * Matrix applied to the odd-indexed inputs (1, 3, 5, 7) of a row/column.
 * Same layout and alignment as even_table.
 */
static const float odd_table[16] __attribute__ ((aligned(8))) = {
         c1, c3, c5, c7,
         c3,-c7,-c1,-c5,
         c5,-c1, c7, c3,
         c7,-c5, c3,-c1
};

#undef  c1
#undef  c2
#undef  c3
#undef  c4
#undef  c5
#undef  c6
#undef  c7

51 | 0c6bd2ea | BERO | |

52 | ```
#if defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__)
``` |
||

53 | |||

/*
 * Load the 16 floats starting at `table` into the register pairs
 * xd0..xd14, i.e. the matrix that "ftrv xmtrx,fv0" multiplies by.
 * The loads are bracketed by fschg (per the SH-4 ISA this toggles the
 * FPU transfer size so that each fmov moves a 64-bit pair).
 *
 * Every fmov post-increments the address register, so the pointer is
 * modified by the asm.  It is therefore copied into a private variable
 * and passed as a read-write ("+r") operand; declaring it as a plain
 * input would let the compiler assume the register still holds `table`
 * afterwards.
 */
#define load_matrix(table) \
    do { \
        const float *load_matrix_src_ = (table); \
        __asm__ volatile( \
        " fschg\n" \
        " fmov @%0+,xd0\n" \
        " fmov @%0+,xd2\n" \
        " fmov @%0+,xd4\n" \
        " fmov @%0+,xd6\n" \
        " fmov @%0+,xd8\n" \
        " fmov @%0+,xd10\n" \
        " fmov @%0+,xd12\n" \
        " fmov @%0+,xd14\n" \
        " fschg\n" \
        : "+r"(load_matrix_src_) \
        ); \
    } while (0)

70 | |||

/*
 * Multiply the vector held in fr0..fr3 by the matrix previously loaded
 * with load_matrix(), in place, using the SH4 "ftrv" instruction.
 * fr0..fr3 are both inputs and outputs (tied operands).
 * The expansion deliberately has no trailing ';' so that "ftrv();"
 * is a single statement, matching the generic C definition of ftrv().
 */
#define ftrv() \
    __asm__ volatile("ftrv xmtrx,fv0" \
        : "=f"(fr0),"=f"(fr1),"=f"(fr2),"=f"(fr3) \
        :  "0"(fr0), "1"(fr1), "2"(fr2), "3"(fr3) )

/*
 * Declare the four working floats as variables bound to the hardware
 * registers fr0..fr3, which is where ftrv() expects its operand.
 * Use as "DEFREG;" at the top of a function.
 */
#define DEFREG \
    register float fr0 __asm__("fr0"); \
    register float fr1 __asm__("fr1"); \
    register float fr2 __asm__("fr2"); \
    register float fr3 __asm__("fr3")

81 | 0c6bd2ea | BERO | |

82 | ```
#else
``` |
||

83 | |||

84 | ```
/* generic C code for check */
``` |
||

85 | |||

/*
 * Portable C equivalent of ftrv(): replace the 4-element vector fv by the
 * product of the 4x4 matrix xf with fv.  The matrix element that weights
 * input c for output r is stored at xf[r + 4*c].
 *
 * The inputs are copied out first because fv is overwritten while the
 * results are being produced.
 */
static void ftrv_(const float xf[],float fv[])
{
    const float in0 = fv[0];
    const float in1 = fv[1];
    const float in2 = fv[2];
    const float in3 = fv[3];
    int r;

    for (r = 0; r < 4; r++)
        fv[r] = xf[r]*in0 + xf[r + 4]*in1 + xf[r + 8]*in2 + xf[r + 12]*in3;
}

98 | |||

/*
 * Portable C equivalent of load_matrix(): copy the 16 floats of `table`
 * into the working matrix `xf`.
 */
static void load_matrix_(float xf[],const float table[])
{
    const float *src = table;
    const float *const end = table + 16;
    float *dst = xf;

    while (src != end)
        *dst++ = *src++;
}

104 | |||

/*
 * Generic C replacements for the SH4 register/asm macros.
 *
 * DEFREG declares the working vector fv[4] and the working matrix xf[16]
 * as ordinary locals; use it as "DEFREG;" at the top of a function.
 */
#define DEFREG  float fv[4], xf[16]

/* fr0..fr3 name the four elements of the working vector. */
#define fr0     fv[0]
#define fr1     fv[1]
#define fr2     fv[2]
#define fr3     fv[3]

/* Same call syntax as the asm versions, routed to the C helpers. */
#define load_matrix(table)      load_matrix_(xf,table)
#define ftrv()                  ftrv_(xf,fv)

115 | 0c6bd2ea | BERO | |

116 | ```
#endif
``` |
||

117 | |||

/*
 * DESCALE(x, n): scale x down by 2^n.
 * The active variant multiplies by the float constant 1/2^n; the disabled
 * variant converts to int and shifts right with rounding.
 * Both expansions are fully parenthesised so the macro can be used safely
 * inside larger expressions.
 */
#if 1
#define DESCALE(x,n)    ((x)*(1.0f/(1<<(n))))
#else
#define DESCALE(x,n)    (((int)(x)+(1<<((n)-1)))>>(n))
#endif

123 | |||

124 | ```
/* This code works worse with GCC from CVS; GCC 3.2.3 works fine. */
``` |
||

125 | |||

126 | |||

127 | #if 1 |
||

128 | 115329f1 | Diego Biurrun | ```
//optimized
``` |

129 | 0c6bd2ea | BERO | |

130 | ```
void idct_sh4(DCTELEM *block)
``` |
||

131 | { |
||

132 | bb270c08 | Diego Biurrun | DEFREG; |

133 | 0c6bd2ea | BERO | |

134 | bb270c08 | Diego Biurrun | ```
int i;
``` |

135 | float tblock[8*8],*fblock; |
||

136 | ```
int ofs1,ofs2,ofs3;
``` |
||

137 | 0c6bd2ea | BERO | |

138 | ```
#if defined(__SH4__)
``` |
||

139 | bb270c08 | Diego Biurrun | #error "FIXME!! change to single float" |

140 | 0c6bd2ea | BERO | ```
#endif
``` |

141 | |||

142 | bb270c08 | Diego Biurrun | ```
/* row */
``` |

143 | |||

144 | ```
/* even part */
``` |
||

145 | load_matrix(even_table); |
||

146 | |||

147 | ```
fblock = tblock+4;
``` |
||

148 | ```
i = 8;
``` |
||

149 | ```
do {
``` |
||

150 | ```
fr0 = block[0];
``` |
||

151 | ```
fr1 = block[2];
``` |
||

152 | ```
fr2 = block[4];
``` |
||

153 | ```
fr3 = block[6];
``` |
||

154 | ```
block+=8;
``` |
||

155 | ftrv(); |
||

156 | *--fblock = fr3; |
||

157 | *--fblock = fr2; |
||

158 | *--fblock = fr1; |
||

159 | *--fblock = fr0; |
||

160 | fblock+=8+4; |
||

161 | ```
} while(--i);
``` |
||

162 | block-=8*8; |
||

163 | fblock-=8*8+4; |
||

164 | |||

165 | load_matrix(odd_table); |
||

166 | |||

167 | ```
i = 8;
``` |
||

168 | |||

169 | ```
// ofs1 = sizeof(float)*1;
``` |
||

170 | ```
// ofs2 = sizeof(float)*2;
``` |
||

171 | ```
// ofs3 = sizeof(float)*3;
``` |
||

172 | |||

173 | ```
do {
``` |
||

174 | ```
float t0,t1,t2,t3;
``` |
||

175 | ```
fr0 = block[1];
``` |
||

176 | ```
fr1 = block[3];
``` |
||

177 | ```
fr2 = block[5];
``` |
||

178 | ```
fr3 = block[7];
``` |
||

179 | ```
block+=8;
``` |
||

180 | ftrv(); |
||

181 | t0 = *fblock++; |
||

182 | t1 = *fblock++; |
||

183 | t2 = *fblock++; |
||

184 | t3 = *fblock++; |
||

185 | ```
fblock+=4;
``` |
||

186 | *--fblock = t0 - fr0; |
||

187 | *--fblock = t1 - fr1; |
||

188 | *--fblock = t2 - fr2; |
||

189 | *--fblock = t3 - fr3; |
||

190 | *--fblock = t3 + fr3; |
||

191 | *--fblock = t2 + fr2; |
||

192 | *--fblock = t1 + fr1; |
||

193 | *--fblock = t0 + fr0; |
||

194 | ```
fblock+=8;
``` |
||

195 | ```
} while(--i);
``` |
||

196 | block-=8*8; |
||

197 | fblock-=8*8; |
||

198 | |||

199 | ```
/* col */
``` |
||

200 | |||

201 | ```
/* even part */
``` |
||

202 | load_matrix(even_table); |
||

203 | |||

204 | ofs1 = sizeof(float)*2*8; |
||

205 | ofs2 = sizeof(float)*4*8; |
||

206 | ofs3 = sizeof(float)*6*8; |
||

207 | |||

208 | ```
i = 8;
``` |
||

209 | |||

210 | #define OA(fblock,ofs) *(float*)((char*)fblock + ofs) |
||

211 | |||

212 | ```
do {
``` |
||

213 | ```
fr0 = OA(fblock, 0);
``` |
||

214 | fr1 = OA(fblock,ofs1); |
||

215 | fr2 = OA(fblock,ofs2); |
||

216 | fr3 = OA(fblock,ofs3); |
||

217 | ftrv(); |
||

218 | ```
OA(fblock,0 ) = fr0;
``` |
||

219 | OA(fblock,ofs1) = fr1; |
||

220 | OA(fblock,ofs2) = fr2; |
||

221 | OA(fblock,ofs3) = fr3; |
||

222 | fblock++; |
||

223 | ```
} while(--i);
``` |
||

224 | ```
fblock-=8;
``` |
||

225 | |||

226 | load_matrix(odd_table); |
||

227 | |||

228 | ```
i=8;
``` |
||

229 | ```
do {
``` |
||

230 | ```
float t0,t1,t2,t3;
``` |
||

231 | t0 = OA(fblock, 0); /* [8*0] */ |
||

232 | ```
t1 = OA(fblock,ofs1); /* [8*2] */
``` |
||

233 | ```
t2 = OA(fblock,ofs2); /* [8*4] */
``` |
||

234 | ```
t3 = OA(fblock,ofs3); /* [8*6] */
``` |
||

235 | ```
fblock+=8;
``` |
||

236 | fr0 = OA(fblock, 0); /* [8*1] */ |
||

237 | ```
fr1 = OA(fblock,ofs1); /* [8*3] */
``` |
||

238 | ```
fr2 = OA(fblock,ofs2); /* [8*5] */
``` |
||

239 | ```
fr3 = OA(fblock,ofs3); /* [8*7] */
``` |
||

240 | fblock+=-8+1; |
||

241 | ftrv(); |
||

242 | block[8*0] = DESCALE(t0 + fr0,3); |
||

243 | block[8*7] = DESCALE(t0 - fr0,3); |
||

244 | block[8*1] = DESCALE(t1 + fr1,3); |
||

245 | block[8*6] = DESCALE(t1 - fr1,3); |
||

246 | block[8*2] = DESCALE(t2 + fr2,3); |
||

247 | block[8*5] = DESCALE(t2 - fr2,3); |
||

248 | block[8*3] = DESCALE(t3 + fr3,3); |
||

249 | block[8*4] = DESCALE(t3 - fr3,3); |
||

250 | block++; |
||

251 | ```
} while(--i);
``` |
||

252 | 0c6bd2ea | BERO | |

253 | ```
#if defined(__SH4__)
``` |
||

254 | bb270c08 | Diego Biurrun | #error "FIXME!! change to double" |

255 | 0c6bd2ea | BERO | ```
#endif
``` |

256 | } |
||

257 | ```
#else
``` |
||

258 | ```
void idct_sh4(DCTELEM *block)
``` |
||

259 | { |
||

260 | bb270c08 | Diego Biurrun | DEFREG; |

261 | |||

262 | ```
int i;
``` |
||

263 | float tblock[8*8],*fblock; |
||

264 | |||

265 | ```
/* row */
``` |
||

266 | |||

267 | ```
/* even part */
``` |
||

268 | load_matrix(even_table); |
||

269 | |||

270 | fblock = tblock; |
||

271 | ```
i = 8;
``` |
||

272 | ```
do {
``` |
||

273 | ```
fr0 = block[0];
``` |
||

274 | ```
fr1 = block[2];
``` |
||

275 | ```
fr2 = block[4];
``` |
||

276 | ```
fr3 = block[6];
``` |
||

277 | ```
block+=8;
``` |
||

278 | ftrv(); |
||

279 | ```
fblock[0] = fr0;
``` |
||

280 | ```
fblock[2] = fr1;
``` |
||

281 | ```
fblock[4] = fr2;
``` |
||

282 | ```
fblock[6] = fr3;
``` |
||

283 | ```
fblock+=8;
``` |
||

284 | ```
} while(--i);
``` |
||

285 | block-=8*8; |
||

286 | fblock-=8*8; |
||

287 | |||

288 | load_matrix(odd_table); |
||

289 | |||

290 | ```
i = 8;
``` |
||

291 | |||

292 | ```
do {
``` |
||

293 | ```
float t0,t1,t2,t3;
``` |
||

294 | ```
fr0 = block[1];
``` |
||

295 | ```
fr1 = block[3];
``` |
||

296 | ```
fr2 = block[5];
``` |
||

297 | ```
fr3 = block[7];
``` |
||

298 | ```
block+=8;
``` |
||

299 | ftrv(); |
||

300 | ```
t0 = fblock[0];
``` |
||

301 | ```
t1 = fblock[2];
``` |
||

302 | ```
t2 = fblock[4];
``` |
||

303 | ```
t3 = fblock[6];
``` |
||

304 | ```
fblock[0] = t0 + fr0;
``` |
||

305 | ```
fblock[7] = t0 - fr0;
``` |
||

306 | ```
fblock[1] = t1 + fr1;
``` |
||

307 | ```
fblock[6] = t1 - fr1;
``` |
||

308 | ```
fblock[2] = t2 + fr2;
``` |
||

309 | ```
fblock[5] = t2 - fr2;
``` |
||

310 | ```
fblock[3] = t3 + fr3;
``` |
||

311 | ```
fblock[4] = t3 - fr3;
``` |
||

312 | ```
fblock+=8;
``` |
||

313 | ```
} while(--i);
``` |
||

314 | block-=8*8; |
||

315 | fblock-=8*8; |
||

316 | |||

317 | ```
/* col */
``` |
||

318 | |||

319 | ```
/* even part */
``` |
||

320 | load_matrix(even_table); |
||

321 | |||

322 | ```
i = 8;
``` |
||

323 | |||

324 | ```
do {
``` |
||

325 | fr0 = fblock[8*0]; |
||

326 | fr1 = fblock[8*2]; |
||

327 | fr2 = fblock[8*4]; |
||

328 | fr3 = fblock[8*6]; |
||

329 | ftrv(); |
||

330 | fblock[8*0] = fr0; |
||

331 | fblock[8*2] = fr1; |
||

332 | fblock[8*4] = fr2; |
||

333 | fblock[8*6] = fr3; |
||

334 | fblock++; |
||

335 | ```
} while(--i);
``` |
||

336 | ```
fblock-=8;
``` |
||

337 | |||

338 | load_matrix(odd_table); |
||

339 | |||

340 | ```
i=8;
``` |
||

341 | ```
do {
``` |
||

342 | ```
float t0,t1,t2,t3;
``` |
||

343 | fr0 = fblock[8*1]; |
||

344 | fr1 = fblock[8*3]; |
||

345 | fr2 = fblock[8*5]; |
||

346 | fr3 = fblock[8*7]; |
||

347 | ftrv(); |
||

348 | t0 = fblock[8*0]; |
||

349 | t1 = fblock[8*2]; |
||

350 | t2 = fblock[8*4]; |
||

351 | t3 = fblock[8*6]; |
||

352 | fblock++; |
||

353 | block[8*0] = DESCALE(t0 + fr0,3); |
||

354 | block[8*7] = DESCALE(t0 - fr0,3); |
||

355 | block[8*1] = DESCALE(t1 + fr1,3); |
||

356 | block[8*6] = DESCALE(t1 - fr1,3); |
||

357 | block[8*2] = DESCALE(t2 + fr2,3); |
||

358 | block[8*5] = DESCALE(t2 - fr2,3); |
||

359 | block[8*3] = DESCALE(t3 + fr3,3); |
||

360 | block[8*4] = DESCALE(t3 - fr3,3); |
||

361 | block++; |
||

362 | ```
} while(--i);
``` |
||

363 | 0c6bd2ea | BERO | } |

364 | `#endif` |