ffmpeg / libavcodec / ppc / dsputil_altivec.c @ 8dbcc9f2

1 | 05c4072b | Michael Niedermayer | ```
/*
``` |
---|---|---|---|

2 | ```
* Copyright (c) 2002 Brian Foley
``` |
||

3 | ```
* Copyright (c) 2002 Dieter Shirley
``` |
||

4 | fe50f385 | Romain Dolbeau | ```
* Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
``` |

5 | 05c4072b | Michael Niedermayer | ```
*
``` |

6 | ```
* This library is free software; you can redistribute it and/or
``` |
||

7 | ```
* modify it under the terms of the GNU Lesser General Public
``` |
||

8 | ```
* License as published by the Free Software Foundation; either
``` |
||

9 | ```
* version 2 of the License, or (at your option) any later version.
``` |
||

10 | ```
*
``` |
||

11 | ```
* This library is distributed in the hope that it will be useful,
``` |
||

12 | ```
* but WITHOUT ANY WARRANTY; without even the implied warranty of
``` |
||

13 | ```
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
``` |
||

14 | ```
* Lesser General Public License for more details.
``` |
||

15 | ```
*
``` |
||

16 | ```
* You should have received a copy of the GNU Lesser General Public
``` |
||

17 | ```
* License along with this library; if not, write to the Free Software
``` |
||

18 | ```
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
``` |
||

19 | ```
*/
``` |
||

20 | |||

21 | 59925ef2 | Brian Foley | #include "../dsputil.h" |

22 | a9a07762 | Michael Niedermayer | |

23 | #include "gcc_fixes.h" |
||

24 | |||

25 | 05c4072b | Michael Niedermayer | #include "dsputil_altivec.h" |

26 | 59925ef2 | Brian Foley | |

27 | 3b991c54 | Romain Dolbeau | ```
#ifdef CONFIG_DARWIN
``` |

28 | 59925ef2 | Brian Foley | #include <sys/sysctl.h> |

29 | 3b991c54 | Romain Dolbeau | #else /* CONFIG_DARWIN */ |

30 | #include <signal.h> |
||

31 | #include <setjmp.h> |
||

32 | |||

33 | ```
static sigjmp_buf jmpbuf;
``` |
||

34 | static volatile sig_atomic_t canjump = 0; |
||

35 | |||

36 | static void sigill_handler (int sig) |
||

37 | { |
||

38 | ```
if (!canjump) {
``` |
||

39 | signal (sig, SIG_DFL); |
||

40 | raise (sig); |
||

41 | } |
||

42 | |||

43 | ```
canjump = 0;
``` |
||

44 | ```
siglongjmp (jmpbuf, 1);
``` |
||

45 | } |
||

46 | #endif /* CONFIG_DARWIN */ |
||

47 | 59925ef2 | Brian Foley | |

48 | f2677d6b | Brian Foley | int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) |

49 | { |
||

50 | 4013fcf4 | Fabrice Bellard | ```
int i;
``` |

51 | int s __attribute__((aligned(16))); |
||

52 | 3b991c54 | Romain Dolbeau | const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); |

53 | 4013fcf4 | Fabrice Bellard | vector unsigned char *tv; |

54 | f2677d6b | Brian Foley | vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; |

55 | vector unsigned int sad; |
||

56 | vector signed int sumdiffs; |
||

57 | |||

58 | ```
s = 0;
``` |
||

59 | 3b991c54 | Romain Dolbeau | sad = (vector unsigned int)vec_splat_u32(0); |

60 | f2677d6b | Brian Foley | for(i=0;i<16;i++) { |

61 | ```
/*
``` |
||

62 | ```
Read unaligned pixels into our vectors. The vectors are as follows:
``` |
||

63 | ```
pix1v: pix1[0]-pix1[15]
``` |
||

64 | ```
pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
``` |
||

65 | ```
*/
``` |
||

66 | tv = (vector unsigned char *) pix1; |
||

67 | pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); |
||

68 | |||

69 | tv = (vector unsigned char *) &pix2[0]; |
||

70 | pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); |
||

71 | |||

72 | tv = (vector unsigned char *) &pix2[1]; |
||

73 | pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); |
||

74 | |||

75 | ```
/* Calculate the average vector */
``` |
||

76 | avgv = vec_avg(pix2v, pix2iv); |
||

77 | |||

78 | ```
/* Calculate a sum of abs differences vector */
``` |
||

79 | t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); |
||

80 | |||

81 | ```
/* Add each 4 pixel group together and put 4 results into sad */
``` |
||

82 | sad = vec_sum4s(t5, sad); |
||

83 | |||

84 | pix1 += line_size; |
||

85 | pix2 += line_size; |
||

86 | } |
||

87 | ```
/* Sum up the four partial sums, and put the result into s */
``` |
||

88 | sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
||

89 | ```
sumdiffs = vec_splat(sumdiffs, 3);
``` |
||

90 | ```
vec_ste(sumdiffs, 0, &s);
``` |
||

91 | |||

92 | ```
return s;
``` |
||

93 | } |
||

94 | |||

95 | int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) |
||

96 | { |
||

97 | 4013fcf4 | Fabrice Bellard | ```
int i;
``` |

98 | int s __attribute__((aligned(16))); |
||

99 | 3b991c54 | Romain Dolbeau | const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); |

100 | 4013fcf4 | Fabrice Bellard | vector unsigned char *tv; |

101 | f2677d6b | Brian Foley | vector unsigned char pix1v, pix2v, pix3v, avgv, t5; |

102 | vector unsigned int sad; |
||

103 | vector signed int sumdiffs; |
||

104 | uint8_t *pix3 = pix2 + line_size; |
||

105 | |||

106 | ```
s = 0;
``` |
||

107 | 3b991c54 | Romain Dolbeau | sad = (vector unsigned int)vec_splat_u32(0); |

108 | f2677d6b | Brian Foley | |

109 | ```
/*
``` |
||

110 | ```
Due to the fact that pix3 = pix2 + line_size, the pix3 of one
``` |
||

111 | ```
iteration becomes pix2 in the next iteration. We can use this
``` |
||

112 | ```
fact to avoid a potentially expensive unaligned read, each
``` |
||

113 | ```
time around the loop.
``` |
||

114 | ```
Read unaligned pixels into our vectors. The vectors are as follows:
``` |
||

115 | ```
pix2v: pix2[0]-pix2[15]
``` |
||

116 | ```
Split the pixel vectors into shorts
``` |
||

117 | ```
*/
``` |
||

118 | tv = (vector unsigned char *) &pix2[0]; |
||

119 | pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); |
||

120 | |||

121 | for(i=0;i<16;i++) { |
||

122 | ```
/*
``` |
||

123 | ```
Read unaligned pixels into our vectors. The vectors are as follows:
``` |
||

124 | ```
pix1v: pix1[0]-pix1[15]
``` |
||

125 | ```
pix3v: pix3[0]-pix3[15]
``` |
||

126 | ```
*/
``` |
||

127 | tv = (vector unsigned char *) pix1; |
||

128 | pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); |
||

129 | |||

130 | tv = (vector unsigned char *) &pix3[0]; |
||

131 | pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); |
||

132 | |||

133 | ```
/* Calculate the average vector */
``` |
||

134 | avgv = vec_avg(pix2v, pix3v); |
||

135 | |||

136 | ```
/* Calculate a sum of abs differences vector */
``` |
||

137 | t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); |
||

138 | |||

139 | ```
/* Add each 4 pixel group together and put 4 results into sad */
``` |
||

140 | sad = vec_sum4s(t5, sad); |
||

141 | |||

142 | pix1 += line_size; |
||

143 | pix2v = pix3v; |
||

144 | pix3 += line_size; |
||

145 | |||

146 | } |
||

147 | |||

148 | ```
/* Sum up the four partial sums, and put the result into s */
``` |
||

149 | sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
||

150 | ```
sumdiffs = vec_splat(sumdiffs, 3);
``` |
||

151 | ```
vec_ste(sumdiffs, 0, &s);
``` |
||

152 | ```
return s;
``` |
||

153 | } |
||

154 | |||

155 | int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) |
||

156 | { |
||

157 | 4013fcf4 | Fabrice Bellard | ```
int i;
``` |

158 | int s __attribute__((aligned(16))); |
||

159 | f2677d6b | Brian Foley | uint8_t *pix3 = pix2 + line_size; |

160 | 3b991c54 | Romain Dolbeau | const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); |

161 | const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2); |
||

162 | 4013fcf4 | Fabrice Bellard | vector unsigned char *tv, avgv, t5; |

163 | f2677d6b | Brian Foley | vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv; |

164 | vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv; |
||

165 | vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; |
||

166 | 4013fcf4 | Fabrice Bellard | vector unsigned short avghv, avglv; |

167 | f2677d6b | Brian Foley | vector unsigned short t1, t2, t3, t4; |

168 | vector unsigned int sad; |
||

169 | vector signed int sumdiffs; |
||

170 | |||

171 | 3b991c54 | Romain Dolbeau | sad = (vector unsigned int)vec_splat_u32(0); |

172 | f2677d6b | Brian Foley | |

173 | ```
s = 0;
``` |
||

174 | |||

175 | ```
/*
``` |
||

176 | ```
Due to the fact that pix3 = pix2 + line_size, the pix3 of one
``` |
||

177 | ```
iteration becomes pix2 in the next iteration. We can use this
``` |
||

178 | ```
fact to avoid a potentially expensive unaligned read, as well
``` |
||

179 | ```
as some splitting, and vector addition each time around the loop.
``` |
||

180 | ```
Read unaligned pixels into our vectors. The vectors are as follows:
``` |
||

181 | ```
pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
``` |
||

182 | ```
Split the pixel vectors into shorts
``` |
||

183 | ```
*/
``` |
||

184 | tv = (vector unsigned char *) &pix2[0]; |
||

185 | pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); |
||

186 | |||

187 | tv = (vector unsigned char *) &pix2[1]; |
||

188 | pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); |
||

189 | |||

190 | pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v); |
||

191 | pix2lv = (vector unsigned short) vec_mergel(zero, pix2v); |
||

192 | pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv); |
||

193 | pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv); |
||

194 | t1 = vec_add(pix2hv, pix2ihv); |
||

195 | t2 = vec_add(pix2lv, pix2ilv); |
||

196 | |||

197 | for(i=0;i<16;i++) { |
||

198 | ```
/*
``` |
||

199 | ```
Read unaligned pixels into our vectors. The vectors are as follows:
``` |
||

200 | ```
pix1v: pix1[0]-pix1[15]
``` |
||

201 | ```
pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16]
``` |
||

202 | ```
*/
``` |
||

203 | tv = (vector unsigned char *) pix1; |
||

204 | pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); |
||

205 | |||

206 | tv = (vector unsigned char *) &pix3[0]; |
||

207 | pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); |
||

208 | |||

209 | tv = (vector unsigned char *) &pix3[1]; |
||

210 | pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1])); |
||

211 | |||

212 | ```
/*
``` |
||

213 | ```
Note that Altivec does have vec_avg, but this works on vector pairs
``` |
||

214 | ```
and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
``` |
||

215 | ```
would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
``` |
||

216 | ```
Instead, we have to split the pixel vectors into vectors of shorts,
``` |
||

217 | ```
and do the averaging by hand.
``` |
||

218 | ```
*/
``` |
||

219 | |||

220 | ```
/* Split the pixel vectors into shorts */
``` |
||

221 | pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v); |
||

222 | pix3lv = (vector unsigned short) vec_mergel(zero, pix3v); |
||

223 | pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv); |
||

224 | pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv); |
||

225 | |||

226 | ```
/* Do the averaging on them */
``` |
||

227 | t3 = vec_add(pix3hv, pix3ihv); |
||

228 | t4 = vec_add(pix3lv, pix3ilv); |
||

229 | |||

230 | 9c76bd48 | Brian Foley | avghv = vec_sr(vec_add(vec_add(t1, t3), two), two); |

231 | avglv = vec_sr(vec_add(vec_add(t2, t4), two), two); |
||

232 | f2677d6b | Brian Foley | |

233 | ```
/* Pack the shorts back into a result */
``` |
||

234 | avgv = vec_pack(avghv, avglv); |
||

235 | |||

236 | ```
/* Calculate a sum of abs differences vector */
``` |
||

237 | t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); |
||

238 | |||

239 | ```
/* Add each 4 pixel group together and put 4 results into sad */
``` |
||

240 | sad = vec_sum4s(t5, sad); |
||

241 | |||

242 | pix1 += line_size; |
||

243 | pix3 += line_size; |
||

244 | ```
/* Transfer the calculated values for pix3 into pix2 */
``` |
||

245 | t1 = t3; |
||

246 | t2 = t4; |
||

247 | } |
||

248 | ```
/* Sum up the four partial sums, and put the result into s */
``` |
||

249 | sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
||

250 | ```
sumdiffs = vec_splat(sumdiffs, 3);
``` |
||

251 | ```
vec_ste(sumdiffs, 0, &s);
``` |
||

252 | |||

253 | ```
return s;
``` |
||

254 | } |
||

255 | |||

256 | 59925ef2 | Brian Foley | int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) |

257 | { |
||

258 | 4013fcf4 | Fabrice Bellard | ```
int i;
``` |

259 | int s __attribute__((aligned(16))); |
||

260 | 3b991c54 | Romain Dolbeau | const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); |

261 | 59925ef2 | Brian Foley | vector unsigned char perm1, perm2, *pix1v, *pix2v; |

262 | vector unsigned char t1, t2, t3,t4, t5; |
||

263 | 4013fcf4 | Fabrice Bellard | vector unsigned int sad; |

264 | 59925ef2 | Brian Foley | vector signed int sumdiffs; |

265 | |||

266 | 3b991c54 | Romain Dolbeau | sad = (vector unsigned int)vec_splat_u32(0); |

267 | 59925ef2 | Brian Foley | |

268 | |||

269 | for(i=0;i<16;i++) { |
||

270 | ```
/* Read potentially unaligned pixels into t1 and t2 */
``` |
||

271 | ```
perm1 = vec_lvsl(0, pix1);
``` |
||

272 | pix1v = (vector unsigned char *) pix1; |
||

273 | ```
perm2 = vec_lvsl(0, pix2);
``` |
||

274 | pix2v = (vector unsigned char *) pix2; |
||

275 | t1 = vec_perm(pix1v[0], pix1v[1], perm1); |
||

276 | t2 = vec_perm(pix2v[0], pix2v[1], perm2); |
||

277 | |||

278 | ```
/* Calculate a sum of abs differences vector */
``` |
||

279 | t3 = vec_max(t1, t2); |
||

280 | t4 = vec_min(t1, t2); |
||

281 | t5 = vec_sub(t3, t4); |
||

282 | |||

283 | ```
/* Add each 4 pixel group together and put 4 results into sad */
``` |
||

284 | sad = vec_sum4s(t5, sad); |
||

285 | |||

286 | pix1 += line_size; |
||

287 | pix2 += line_size; |
||

288 | } |
||

289 | |||

290 | ```
/* Sum up the four partial sums, and put the result into s */
``` |
||

291 | sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
||

292 | ```
sumdiffs = vec_splat(sumdiffs, 3);
``` |
||

293 | ```
vec_ste(sumdiffs, 0, &s);
``` |
||

294 | |||

295 | ```
return s;
``` |
||

296 | } |
||

297 | |||

298 | int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) |
||

299 | { |
||

300 | 4013fcf4 | Fabrice Bellard | ```
int i;
``` |

301 | int s __attribute__((aligned(16))); |
||

302 | 3b991c54 | Romain Dolbeau | const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); |

303 | 59925ef2 | Brian Foley | vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; |

304 | vector unsigned char t1, t2, t3,t4, t5; |
||

305 | 4013fcf4 | Fabrice Bellard | vector unsigned int sad; |

306 | 59925ef2 | Brian Foley | vector signed int sumdiffs; |

307 | |||

308 | 3b991c54 | Romain Dolbeau | sad = (vector unsigned int)vec_splat_u32(0); |

309 | a9a07762 | Michael Niedermayer | |

310 | permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); |
||

311 | 59925ef2 | Brian Foley | |

312 | for(i=0;i<8;i++) { |
||

313 | ```
/* Read potentially unaligned pixels into t1 and t2
``` |
||

314 | ```
Since we're reading 16 pixels, and actually only want 8,
``` |
||

315 | ```
mask out the last 8 pixels. The 0s don't change the sum. */
``` |
||

316 | ```
perm1 = vec_lvsl(0, pix1);
``` |
||

317 | pix1v = (vector unsigned char *) pix1; |
||

318 | ```
perm2 = vec_lvsl(0, pix2);
``` |
||

319 | pix2v = (vector unsigned char *) pix2; |
||

320 | t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); |
||

321 | t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); |
||

322 | |||

323 | ```
/* Calculate a sum of abs differences vector */
``` |
||

324 | t3 = vec_max(t1, t2); |
||

325 | t4 = vec_min(t1, t2); |
||

326 | t5 = vec_sub(t3, t4); |
||

327 | |||

328 | ```
/* Add each 4 pixel group together and put 4 results into sad */
``` |
||

329 | sad = vec_sum4s(t5, sad); |
||

330 | |||

331 | pix1 += line_size; |
||

332 | pix2 += line_size; |
||

333 | } |
||

334 | |||

335 | ```
/* Sum up the four partial sums, and put the result into s */
``` |
||

336 | sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
||

337 | ```
sumdiffs = vec_splat(sumdiffs, 3);
``` |
||

338 | ```
vec_ste(sumdiffs, 0, &s);
``` |
||

339 | |||

340 | ```
return s;
``` |
||

341 | } |
||

342 | |||

343 | f2677d6b | Brian Foley | int pix_norm1_altivec(uint8_t *pix, int line_size) |

344 | { |
||

345 | 4013fcf4 | Fabrice Bellard | ```
int i;
``` |

346 | int s __attribute__((aligned(16))); |
||

347 | 3b991c54 | Romain Dolbeau | const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); |

348 | 4013fcf4 | Fabrice Bellard | vector unsigned char *tv; |

349 | f2677d6b | Brian Foley | vector unsigned char pixv; |

350 | vector unsigned int sv; |
||

351 | vector signed int sum; |
||

352 | 4013fcf4 | Fabrice Bellard | |

353 | 3b991c54 | Romain Dolbeau | sv = (vector unsigned int)vec_splat_u32(0); |

354 | f2677d6b | Brian Foley | |

355 | ```
s = 0;
``` |
||

356 | for (i = 0; i < 16; i++) { |
||

357 | ```
/* Read in the potentially unaligned pixels */
``` |
||

358 | tv = (vector unsigned char *) pix; |
||

359 | pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix)); |
||

360 | |||

361 | 9c76bd48 | Brian Foley | ```
/* Square the values, and add them to our sum */
``` |

362 | sv = vec_msum(pixv, pixv, sv); |
||

363 | f2677d6b | Brian Foley | |

364 | pix += line_size; |
||

365 | } |
||

366 | ```
/* Sum up the four partial sums, and put the result into s */
``` |
||

367 | sum = vec_sums((vector signed int) sv, (vector signed int) zero); |
||

368 | ```
sum = vec_splat(sum, 3);
``` |
||

369 | ```
vec_ste(sum, 0, &s);
``` |
||

370 | |||

371 | ```
return s;
``` |
||

372 | } |
||

373 | |||

374 | 4013fcf4 | Fabrice Bellard | ```
/**
``` |

375 | ```
* Sum of Squared Errors for a 8x8 block.
``` |
||

376 | ```
* AltiVec-enhanced.
``` |
||

377 | ```
* It's the pix_abs8x8_altivec code above w/ squaring added.
``` |
||

378 | ```
*/
``` |
||

379 | int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size) |
||

380 | { |
||

381 | ```
int i;
``` |
||

382 | int s __attribute__((aligned(16))); |
||

383 | 3b991c54 | Romain Dolbeau | const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); |

384 | 4013fcf4 | Fabrice Bellard | vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; |

385 | vector unsigned char t1, t2, t3,t4, t5; |
||

386 | vector unsigned int sum; |
||

387 | vector signed int sumsqr; |
||

388 | |||

389 | 3b991c54 | Romain Dolbeau | sum = (vector unsigned int)vec_splat_u32(0); |

390 | a9a07762 | Michael Niedermayer | |

391 | permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); |
||

392 | |||

393 | 4013fcf4 | Fabrice Bellard | |

394 | for(i=0;i<8;i++) { |
||

395 | ```
/* Read potentially unaligned pixels into t1 and t2
``` |
||

396 | ```
Since we're reading 16 pixels, and actually only want 8,
``` |
||

397 | ```
mask out the last 8 pixels. The 0s don't change the sum. */
``` |
||

398 | ```
perm1 = vec_lvsl(0, pix1);
``` |
||

399 | pix1v = (vector unsigned char *) pix1; |
||

400 | ```
perm2 = vec_lvsl(0, pix2);
``` |
||

401 | pix2v = (vector unsigned char *) pix2; |
||

402 | t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); |
||

403 | t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); |
||

404 | |||

405 | ```
/*
``` |
||

406 | ```
Since we want to use unsigned chars, we can take advantage
``` |
||

407 | ```
of the fact that abs(a-b)^2 = (a-b)^2.
``` |
||

408 | ```
*/
``` |
||

409 | |||

410 | ```
/* Calculate abs differences vector */
``` |
||

411 | t3 = vec_max(t1, t2); |
||

412 | t4 = vec_min(t1, t2); |
||

413 | t5 = vec_sub(t3, t4); |
||

414 | |||

415 | ```
/* Square the values and add them to our sum */
``` |
||

416 | sum = vec_msum(t5, t5, sum); |
||

417 | |||

418 | pix1 += line_size; |
||

419 | pix2 += line_size; |
||

420 | } |
||

421 | |||

422 | ```
/* Sum up the four partial sums, and put the result into s */
``` |
||

423 | sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); |
||

424 | ```
sumsqr = vec_splat(sumsqr, 3);
``` |
||

425 | ```
vec_ste(sumsqr, 0, &s);
``` |
||

426 | |||

427 | ```
return s;
``` |
||

428 | } |
||

429 | |||

430 | ```
/**
``` |
||

431 | ```
* Sum of Squared Errors for a 16x16 block.
``` |
||

432 | ```
* AltiVec-enhanced.
``` |
||

433 | ```
* It's the pix_abs16x16_altivec code above w/ squaring added.
``` |
||

434 | ```
*/
``` |
||

435 | int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size) |
||

436 | 59925ef2 | Brian Foley | { |

437 | 4013fcf4 | Fabrice Bellard | ```
int i;
``` |

438 | int s __attribute__((aligned(16))); |
||

439 | 3b991c54 | Romain Dolbeau | const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); |

440 | 4013fcf4 | Fabrice Bellard | vector unsigned char perm1, perm2, *pix1v, *pix2v; |

441 | vector unsigned char t1, t2, t3,t4, t5; |
||

442 | vector unsigned int sum; |
||

443 | vector signed int sumsqr; |
||

444 | |||

445 | 3b991c54 | Romain Dolbeau | sum = (vector unsigned int)vec_splat_u32(0); |

446 | 4013fcf4 | Fabrice Bellard | |

447 | for(i=0;i<16;i++) { |
||

448 | ```
/* Read potentially unaligned pixels into t1 and t2 */
``` |
||

449 | ```
perm1 = vec_lvsl(0, pix1);
``` |
||

450 | pix1v = (vector unsigned char *) pix1; |
||

451 | ```
perm2 = vec_lvsl(0, pix2);
``` |
||

452 | pix2v = (vector unsigned char *) pix2; |
||

453 | t1 = vec_perm(pix1v[0], pix1v[1], perm1); |
||

454 | t2 = vec_perm(pix2v[0], pix2v[1], perm2); |
||

455 | |||

456 | ```
/*
``` |
||

457 | ```
Since we want to use unsigned chars, we can take advantage
``` |
||

458 | ```
of the fact that abs(a-b)^2 = (a-b)^2.
``` |
||

459 | ```
*/
``` |
||

460 | |||

461 | ```
/* Calculate abs differences vector */
``` |
||

462 | t3 = vec_max(t1, t2); |
||

463 | t4 = vec_min(t1, t2); |
||

464 | t5 = vec_sub(t3, t4); |
||

465 | |||

466 | ```
/* Square the values and add them to our sum */
``` |
||

467 | sum = vec_msum(t5, t5, sum); |
||

468 | |||

469 | pix1 += line_size; |
||

470 | pix2 += line_size; |
||

471 | } |
||

472 | |||

473 | ```
/* Sum up the four partial sums, and put the result into s */
``` |
||

474 | sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); |
||

475 | ```
sumsqr = vec_splat(sumsqr, 3);
``` |
||

476 | ```
vec_ste(sumsqr, 0, &s);
``` |
||

477 | |||

478 | ```
return s;
``` |
||

479 | } |
||

480 | 59925ef2 | Brian Foley | |

481 | 0c1a9eda | Zdenek Kabelac | int pix_sum_altivec(uint8_t * pix, int line_size) |

482 | 4013fcf4 | Fabrice Bellard | { |

483 | 3b991c54 | Romain Dolbeau | const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); |

484 | 59925ef2 | Brian Foley | vector unsigned char perm, *pixv; |

485 | vector unsigned char t1; |
||

486 | 4013fcf4 | Fabrice Bellard | vector unsigned int sad; |

487 | 59925ef2 | Brian Foley | vector signed int sumdiffs; |

488 | |||

489 | 4013fcf4 | Fabrice Bellard | ```
int i;
``` |

490 | int s __attribute__((aligned(16))); |
||

491 | |||

492 | 3b991c54 | Romain Dolbeau | sad = (vector unsigned int)vec_splat_u32(0); |

493 | 59925ef2 | Brian Foley | |

494 | for (i = 0; i < 16; i++) { |
||

495 | ```
/* Read the potentially unaligned 16 pixels into t1 */
``` |
||

496 | ```
perm = vec_lvsl(0, pix);
``` |
||

497 | pixv = (vector unsigned char *) pix; |
||

498 | t1 = vec_perm(pixv[0], pixv[1], perm); |
||

499 | |||

500 | ```
/* Add each 4 pixel group together and put 4 results into sad */
``` |
||

501 | sad = vec_sum4s(t1, sad); |
||

502 | |||

503 | pix += line_size; |
||

504 | } |
||

505 | |||

506 | ```
/* Sum up the four partial sums, and put the result into s */
``` |
||

507 | sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
||

508 | ```
sumdiffs = vec_splat(sumdiffs, 3);
``` |
||

509 | ```
vec_ste(sumdiffs, 0, &s);
``` |
||

510 | |||

511 | ```
return s;
``` |
||

512 | } |
||

513 | |||

514 | 0c1a9eda | Zdenek Kabelac | void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size) |

515 | 05c4072b | Michael Niedermayer | { |

516 | ```
int i;
``` |
||

517 | vector unsigned char perm, bytes, *pixv; |
||

518 | 3b991c54 | Romain Dolbeau | const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); |

519 | 05c4072b | Michael Niedermayer | vector signed short shorts; |

520 | |||

521 | for(i=0;i<8;i++) |
||

522 | { |
||

523 | ```
// Read potentially unaligned pixels.
``` |
||

524 | ```
// We're reading 16 pixels, and actually only want 8,
``` |
||

525 | ```
// but we simply ignore the extras.
``` |
||

526 | ```
perm = vec_lvsl(0, pixels);
``` |
||

527 | pixv = (vector unsigned char *) pixels; |
||

528 | bytes = vec_perm(pixv[0], pixv[1], perm); |
||

529 | |||

530 | ```
// convert the bytes into shorts
``` |
||

531 | shorts = (vector signed short)vec_mergeh(zero, bytes); |
||

532 | |||

533 | ```
// save the data to the block, we assume the block is 16-byte aligned
``` |
||

534 | vec_st(shorts, i*16, (vector signed short*)block); |
||

535 | |||

536 | pixels += line_size; |
||

537 | } |
||

538 | } |
||

539 | |||

540 | 0c1a9eda | Zdenek Kabelac | void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1, |

541 | const uint8_t *s2, int stride) |
||

542 | 05c4072b | Michael Niedermayer | { |

543 | ```
int i;
``` |
||

544 | vector unsigned char perm, bytes, *pixv; |
||

545 | 3b991c54 | Romain Dolbeau | const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); |

546 | 05c4072b | Michael Niedermayer | vector signed short shorts1, shorts2; |

547 | |||

548 | for(i=0;i<4;i++) |
||

549 | { |
||

550 | ```
// Read potentially unaligned pixels
``` |
||

551 | ```
// We're reading 16 pixels, and actually only want 8,
``` |
||

552 | ```
// but we simply ignore the extras.
``` |
||

553 | ```
perm = vec_lvsl(0, s1);
``` |
||

554 | pixv = (vector unsigned char *) s1; |
||

555 | bytes = vec_perm(pixv[0], pixv[1], perm); |
||

556 | |||

557 | ```
// convert the bytes into shorts
``` |
||

558 | shorts1 = (vector signed short)vec_mergeh(zero, bytes); |
||

559 | |||

560 | ```
// Do the same for the second block of pixels
``` |
||

561 | ```
perm = vec_lvsl(0, s2);
``` |
||

562 | pixv = (vector unsigned char *) s2; |
||

563 | bytes = vec_perm(pixv[0], pixv[1], perm); |
||

564 | |||

565 | ```
// convert the bytes into shorts
``` |
||

566 | shorts2 = (vector signed short)vec_mergeh(zero, bytes); |
||

567 | |||

568 | ```
// Do the subtraction
``` |
||

569 | shorts1 = vec_sub(shorts1, shorts2); |
||

570 | |||

571 | ```
// save the data to the block, we assume the block is 16-byte aligned
``` |
||

572 | vec_st(shorts1, 0, (vector signed short*)block); |
||

573 | |||

574 | s1 += stride; |
||

575 | s2 += stride; |
||

576 | ```
block += 8;
``` |
||

577 | |||

578 | |||

579 | ```
// The code below is a copy of the code above... This is a manual
``` |
||

580 | ```
// unroll.
``` |
||

581 | |||

582 | ```
// Read potentially unaligned pixels
``` |
||

583 | ```
// We're reading 16 pixels, and actually only want 8,
``` |
||

584 | ```
// but we simply ignore the extras.
``` |
||

585 | ```
perm = vec_lvsl(0, s1);
``` |
||

586 | pixv = (vector unsigned char *) s1; |
||

587 | bytes = vec_perm(pixv[0], pixv[1], perm); |
||

588 | |||

589 | ```
// convert the bytes into shorts
``` |
||

590 | shorts1 = (vector signed short)vec_mergeh(zero, bytes); |
||

591 | |||

592 | ```
// Do the same for the second block of pixels
``` |
||

593 | ```
perm = vec_lvsl(0, s2);
``` |
||

594 | pixv = (vector unsigned char *) s2; |
||

595 | bytes = vec_perm(pixv[0], pixv[1], perm); |
||

596 | |||

597 | ```
// convert the bytes into shorts
``` |
||

598 | shorts2 = (vector signed short)vec_mergeh(zero, bytes); |
||

599 | |||

600 | ```
// Do the subtraction
``` |
||

601 | shorts1 = vec_sub(shorts1, shorts2); |
||

602 | |||

603 | ```
// save the data to the block, we assume the block is 16-byte aligned
``` |
||

604 | vec_st(shorts1, 0, (vector signed short*)block); |
||

605 | |||

606 | s1 += stride; |
||

607 | s2 += stride; |
||

608 | ```
block += 8;
``` |
||

609 | } |
||

610 | } |
||

611 | |||

/**
 * 16x16 sum of absolute differences; forwards to pix_abs16x16_altivec.
 *
 * @param s      not used by this function
 * @param a      first block
 * @param b      second block
 * @param stride distance in bytes between two consecutive rows
 * @return the value returned by pix_abs16x16_altivec(a, b, stride)
 */
int sad16x16_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
    const int sad = pix_abs16x16_altivec(a, b, stride);

    return sad;
}

615 | |||

/**
 * 8x8 sum of absolute differences; forwards to pix_abs8x8_altivec.
 *
 * @param s      not used by this function
 * @param a      first block
 * @param b      second block
 * @param stride distance in bytes between two consecutive rows
 * @return the value returned by pix_abs8x8_altivec(a, b, stride)
 */
int sad8x8_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
    const int sad = pix_abs8x8_altivec(a, b, stride);

    return sad;
}

619 | |||

/**
 * Add src to dst byte by byte: dst[i] += src[i] for 0 <= i < w, with the
 * usual wrap-around of unsigned 8-bit arithmetic.
 *
 * @param dst destination buffer, updated in place; the AltiVec path
 *            relies on it being 16-byte aligned
 * @param src source buffer; the AltiVec path relies on it being
 *            16-byte aligned
 * @param w   number of bytes to process
 */
void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;
    /* 8 bytes are handled per iteration, so i has to advance by 8;
       stepping by 1 would add most bytes several times */
    for(i=0; i+7<w; i+=8){
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    /* remaining 0..7 bytes */
    for(; i<w; i++)
        dst[i+0] += src[i+0];
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16 bytes-aligned (guaranteed) */
    /* i is a byte index: it advances by one vector (16 bytes) per pass
       and is used directly as the byte offset of vec_ld / vec_st */
    for(i = 0 ; (i + 15) < w ; i += 16)
    {
        vdst = vec_ld(i, (unsigned char*)dst);
        vsrc = vec_ld(i, (unsigned char*)src);
        vdst = vec_add(vsrc, vdst);
        vec_st(vdst, i, (unsigned char*)dst);
    }
    /* if w is not a multiple of 16: add (not copy) the remaining bytes */
    for (; (i < w) ; i++)
    {
        dst[i] += src[i];
    }
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

654 | |||

655 | fe50f385 | Romain Dolbeau | ```
/* next one assumes that ((line_size % 16) == 0) */
``` |

656 | db40a39a | Michael Niedermayer | void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) |

657 | { |
||

658 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
``` |

659 | db40a39a | Michael Niedermayer | ```
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
``` |

660 | ```
int i;
``` |
||

661 | |||

662 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
``` |

663 | db40a39a | Michael Niedermayer | |

664 | for(i=0; i<h; i++) { |
||

665 | *((uint32_t*)(block )) = (((const struct unaligned_32 *) (pixels))->l); |
||

666 | *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l); |
||

667 | *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l); |
||

668 | *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l); |
||

669 | pixels+=line_size; |
||

670 | block +=line_size; |
||

671 | } |
||

672 | |||

673 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
``` |

674 | db40a39a | Michael Niedermayer | |

675 | #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
||

676 | register vector unsigned char pixelsv1, pixelsv2; |
||

677 | e45a2872 | Romain Dolbeau | register vector unsigned char pixelsv1B, pixelsv2B; |

678 | register vector unsigned char pixelsv1C, pixelsv2C; |
||

679 | register vector unsigned char pixelsv1D, pixelsv2D; |
||

680 | |||

681 | fe50f385 | Romain Dolbeau | register vector unsigned char perm = vec_lvsl(0, pixels); |

682 | db40a39a | Michael Niedermayer | ```
int i;
``` |

683 | e45a2872 | Romain Dolbeau | register int line_size_2 = line_size << 1; |

684 | register int line_size_3 = line_size + line_size_2; |
||

685 | register int line_size_4 = line_size << 2; |
||

686 | |||

687 | ```
POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
``` |
||

688 | ```
// hand-unrolling the loop by 4 gains about 15%
``` |
||

689 | ```
// mininum execution time goes from 74 to 60 cycles
``` |
||

690 | ```
// it's faster than -funroll-loops, but using
``` |
||

691 | ```
// -funroll-loops w/ this is bad - 74 cycles again.
``` |
||

692 | ```
// all this is on a 7450, tuning for the 7450
``` |
||

693 | ```
#if 0
``` |
||

694 | db40a39a | Michael Niedermayer | ```
for(i=0; i<h; i++) {
``` |

695 | ```
pixelsv1 = vec_ld(0, (unsigned char*)pixels);
``` |
||

696 | ```
pixelsv2 = vec_ld(16, (unsigned char*)pixels);
``` |
||

697 | fe50f385 | Romain Dolbeau | ```
vec_st(vec_perm(pixelsv1, pixelsv2, perm),
``` |

698 | 35e5fb06 | Romain Dolbeau | ```
0, (unsigned char*)block);
``` |

699 | db40a39a | Michael Niedermayer | ```
pixels+=line_size;
``` |

700 | ```
block +=line_size;
``` |
||

701 | ```
}
``` |
||

702 | e45a2872 | Romain Dolbeau | ```
#else
``` |

703 | for(i=0; i<h; i+=4) { |
||

704 | pixelsv1 = vec_ld(0, (unsigned char*)pixels); |
||

705 | pixelsv2 = vec_ld(16, (unsigned char*)pixels); |
||

706 | pixelsv1B = vec_ld(line_size, (unsigned char*)pixels); |
||

707 | pixelsv2B = vec_ld(16 + line_size, (unsigned char*)pixels); |
||

708 | pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels); |
||

709 | pixelsv2C = vec_ld(16 + line_size_2, (unsigned char*)pixels); |
||

710 | pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels); |
||

711 | pixelsv2D = vec_ld(16 + line_size_3, (unsigned char*)pixels); |
||

712 | vec_st(vec_perm(pixelsv1, pixelsv2, perm), |
||

713 | 0, (unsigned char*)block); |
||

714 | vec_st(vec_perm(pixelsv1B, pixelsv2B, perm), |
||

715 | line_size, (unsigned char*)block); |
||

716 | vec_st(vec_perm(pixelsv1C, pixelsv2C, perm), |
||

717 | line_size_2, (unsigned char*)block); |
||

718 | vec_st(vec_perm(pixelsv1D, pixelsv2D, perm), |
||

719 | line_size_3, (unsigned char*)block); |
||

720 | pixels+=line_size_4; |
||

721 | block +=line_size_4; |
||

722 | } |
||

723 | ```
#endif
``` |
||

724 | ```
POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
``` |
||

725 | db40a39a | Michael Niedermayer | |

726 | #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |
||

727 | } |
||

728 | |||

729 | fe50f385 | Romain Dolbeau | ```
/* next one assumes that ((line_size % 16) == 0) */
``` |

730 | db40a39a | Michael Niedermayer | #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) |

731 | void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
||

732 | { |
||

733 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
``` |

734 | db40a39a | Michael Niedermayer | ```
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
``` |

735 | ```
int i;
``` |
||

736 | |||

737 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
``` |

738 | db40a39a | Michael Niedermayer | |

739 | for(i=0; i<h; i++) { |
||

740 | op_avg(*((uint32_t*)(block)),(((const struct unaligned_32 *)(pixels))->l)); |
||

741 | op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l)); |
||

742 | op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l)); |
||

743 | op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l)); |
||

744 | pixels+=line_size; |
||

745 | block +=line_size; |
||

746 | } |
||

747 | |||

748 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
``` |

749 | db40a39a | Michael Niedermayer | |

750 | #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
||

751 | register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; |
||

752 | fe50f385 | Romain Dolbeau | register vector unsigned char perm = vec_lvsl(0, pixels); |

753 | db40a39a | Michael Niedermayer | ```
int i;
``` |

754 | |||

755 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
``` |

756 | db40a39a | Michael Niedermayer | |

757 | for(i=0; i<h; i++) { |
||

758 | pixelsv1 = vec_ld(0, (unsigned char*)pixels); |
||

759 | pixelsv2 = vec_ld(16, (unsigned char*)pixels); |
||

760 | ```
blockv = vec_ld(0, block);
``` |
||

761 | fe50f385 | Romain Dolbeau | pixelsv = vec_perm(pixelsv1, pixelsv2, perm); |

762 | db40a39a | Michael Niedermayer | blockv = vec_avg(blockv,pixelsv); |

763 | vec_st(blockv, 0, (unsigned char*)block); |
||

764 | pixels+=line_size; |
||

765 | block +=line_size; |
||

766 | } |
||

767 | |||

768 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
``` |

769 | db40a39a | Michael Niedermayer | |

770 | #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |
||

771 | e629ab68 | Romain Dolbeau | } |

772 | 05c4072b | Michael Niedermayer | |

773 | fe50f385 | Romain Dolbeau | ```
/* next one assumes that ((line_size % 8) == 0) */
``` |

774 | void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) |
||

775 | 35e5fb06 | Romain Dolbeau | { |

776 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
``` |

777 | 35e5fb06 | Romain Dolbeau | ```
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
``` |

778 | ```
int i;
``` |
||

779 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
``` |

780 | 35e5fb06 | Romain Dolbeau | for (i = 0; i < h; i++) { |

781 | *((uint32_t *) (block)) = |
||

782 | (((*((uint32_t *) (block))) | |
||

783 | ((((const struct unaligned_32 *) (pixels))->l))) - |
||

784 | ((((*((uint32_t *) (block))) ^ |
||

785 | ((((const struct unaligned_32 *) (pixels))-> |
||

786 | l))) & 0xFEFEFEFEUL) >> 1)); |
||

787 | ```
*((uint32_t *) (block + 4)) =
``` |
||

788 | ```
(((*((uint32_t *) (block + 4))) |
``` |
||

789 | ((((const struct unaligned_32 *) (pixels + 4))->l))) - |
||

790 | ```
((((*((uint32_t *) (block + 4))) ^
``` |
||

791 | ((((const struct unaligned_32 *) (pixels + |
||

792 | ```
4))->
``` |
||

793 | l))) & 0xFEFEFEFEUL) >> 1)); |
||

794 | pixels += line_size; |
||

795 | block += line_size; |
||

796 | } |
||

797 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
``` |

798 | 35e5fb06 | Romain Dolbeau | |

799 | #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
||

800 | register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; |
||

801 | ```
int i;
``` |
||

802 | |||

803 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
``` |

804 | 35e5fb06 | Romain Dolbeau | |

805 | for (i = 0; i < h; i++) { |
||

806 | ```
/*
``` |
||

807 | ```
block is 8 bytes-aligned, so we're either in the
``` |
||

808 | ```
left block (16 bytes-aligned) or in the right block (not)
``` |
||

809 | ```
*/
``` |
||

810 | int rightside = ((unsigned long)block & 0x0000000F); |
||

811 | |||

812 | ```
blockv = vec_ld(0, block);
``` |
||

813 | pixelsv1 = vec_ld(0, (unsigned char*)pixels); |
||

814 | pixelsv2 = vec_ld(16, (unsigned char*)pixels); |
||

815 | ```
pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
``` |
||

816 | |||

817 | ```
if (rightside)
``` |
||

818 | { |
||

819 | pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); |
||

820 | } |
||

821 | ```
else
``` |
||

822 | { |
||

823 | pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); |
||

824 | } |
||

825 | |||

826 | blockv = vec_avg(blockv, pixelsv); |
||

827 | |||

828 | ```
vec_st(blockv, 0, block);
``` |
||

829 | |||

830 | pixels += line_size; |
||

831 | block += line_size; |
||

832 | } |
||

833 | |||

834 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
``` |

835 | 35e5fb06 | Romain Dolbeau | |

836 | #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |
||

837 | } |
||

838 | |||

839 | fe50f385 | Romain Dolbeau | ```
/* next one assumes that ((line_size % 8) == 0) */
``` |

840 | 35e5fb06 | Romain Dolbeau | void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) |

841 | { |
||

842 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
``` |

843 | 35e5fb06 | Romain Dolbeau | ```
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
``` |

844 | ```
int j;
``` |
||

845 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
``` |

846 | 35e5fb06 | Romain Dolbeau | for (j = 0; j < 2; j++) { |

847 | ```
int i;
``` |
||

848 | const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); |
||

849 | ```
const uint32_t b =
``` |
||

850 | (((const struct unaligned_32 *) (pixels + 1))->l); |
||

851 | uint32_t l0 = |
||

852 | (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; |
||

853 | uint32_t h0 = |
||

854 | ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
||

855 | uint32_t l1, h1; |
||

856 | pixels += line_size; |
||

857 | for (i = 0; i < h; i += 2) { |
||

858 | uint32_t a = (((const struct unaligned_32 *) (pixels))->l); |
||

859 | uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); |
||

860 | l1 = (a & 0x03030303UL) + (b & 0x03030303UL); |
||

861 | h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
||

862 | *((uint32_t *) block) = |
||

863 | h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); |
||

864 | pixels += line_size; |
||

865 | block += line_size; |
||

866 | a = (((const struct unaligned_32 *) (pixels))->l); |
||

867 | b = (((const struct unaligned_32 *) (pixels + 1))->l); |
||

868 | l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; |
||

869 | h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
||

870 | *((uint32_t *) block) = |
||

871 | h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); |
||

872 | pixels += line_size; |
||

873 | block += line_size; |
||

874 | } pixels += 4 - line_size * (h + 1); |
||

875 | ```
block += 4 - line_size * h;
``` |
||

876 | } |
||

877 | |||

878 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
``` |

879 | 35e5fb06 | Romain Dolbeau | |

880 | #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
||

881 | register int i; |
||

882 | register vector unsigned char |
||

883 | pixelsv1, pixelsv2, |
||

884 | pixelsavg; |
||

885 | register vector unsigned char |
||

886 | blockv, temp1, temp2; |
||

887 | register vector unsigned short |
||

888 | pixelssum1, pixelssum2, temp3; |
||

889 | 3b991c54 | Romain Dolbeau | register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); |

890 | register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); |
||

891 | 35e5fb06 | Romain Dolbeau | |

892 | ```
temp1 = vec_ld(0, pixels);
``` |
||

893 | ```
temp2 = vec_ld(16, pixels);
``` |
||

894 | ```
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
``` |
||

895 | if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) |
||

896 | { |
||

897 | pixelsv2 = temp2; |
||

898 | } |
||

899 | ```
else
``` |
||

900 | { |
||

901 | ```
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
``` |
||

902 | } |
||

903 | pixelsv1 = vec_mergeh(vczero, pixelsv1); |
||

904 | pixelsv2 = vec_mergeh(vczero, pixelsv2); |
||

905 | pixelssum1 = vec_add((vector unsigned short)pixelsv1, |
||

906 | (vector unsigned short)pixelsv2); |
||

907 | pixelssum1 = vec_add(pixelssum1, vctwo); |
||

908 | |||

909 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
``` |

910 | 35e5fb06 | Romain Dolbeau | for (i = 0; i < h ; i++) { |

911 | int rightside = ((unsigned long)block & 0x0000000F); |
||

912 | ```
blockv = vec_ld(0, block);
``` |
||

913 | |||

914 | temp1 = vec_ld(line_size, pixels); |
||

915 | ```
temp2 = vec_ld(line_size + 16, pixels);
``` |
||

916 | pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); |
||

917 | if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) |
||

918 | { |
||

919 | pixelsv2 = temp2; |
||

920 | } |
||

921 | ```
else
``` |
||

922 | { |
||

923 | ```
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
``` |
||

924 | } |
||

925 | |||

926 | pixelsv1 = vec_mergeh(vczero, pixelsv1); |
||

927 | pixelsv2 = vec_mergeh(vczero, pixelsv2); |
||

928 | pixelssum2 = vec_add((vector unsigned short)pixelsv1, |
||

929 | (vector unsigned short)pixelsv2); |
||

930 | temp3 = vec_add(pixelssum1, pixelssum2); |
||

931 | temp3 = vec_sra(temp3, vctwo); |
||

932 | pixelssum1 = vec_add(pixelssum2, vctwo); |
||

933 | pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); |
||

934 | |||

935 | ```
if (rightside)
``` |
||

936 | { |
||

937 | blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); |
||

938 | } |
||

939 | ```
else
``` |
||

940 | { |
||

941 | blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); |
||

942 | } |
||

943 | |||

944 | ```
vec_st(blockv, 0, block);
``` |
||

945 | |||

946 | block += line_size; |
||

947 | pixels += line_size; |
||

948 | } |
||

949 | |||

950 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
``` |

951 | 35e5fb06 | Romain Dolbeau | #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |

952 | } |
||

953 | |||

954 | fe50f385 | Romain Dolbeau | ```
/* next one assumes that ((line_size % 8) == 0) */
``` |

955 | void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
||

956 | { |
||

957 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
``` |

958 | fe50f385 | Romain Dolbeau | ```
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
``` |

959 | ```
int j;
``` |
||

960 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
``` |

961 | fe50f385 | Romain Dolbeau | for (j = 0; j < 2; j++) { |

962 | ```
int i;
``` |
||

963 | const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); |
||

964 | ```
const uint32_t b =
``` |
||

965 | (((const struct unaligned_32 *) (pixels + 1))->l); |
||

966 | uint32_t l0 = |
||

967 | (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; |
||

968 | uint32_t h0 = |
||

969 | ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
||

970 | uint32_t l1, h1; |
||

971 | pixels += line_size; |
||

972 | for (i = 0; i < h; i += 2) { |
||

973 | uint32_t a = (((const struct unaligned_32 *) (pixels))->l); |
||

974 | uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); |
||

975 | l1 = (a & 0x03030303UL) + (b & 0x03030303UL); |
||

976 | h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
||

977 | *((uint32_t *) block) = |
||

978 | h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); |
||

979 | pixels += line_size; |
||

980 | block += line_size; |
||

981 | a = (((const struct unaligned_32 *) (pixels))->l); |
||

982 | b = (((const struct unaligned_32 *) (pixels + 1))->l); |
||

983 | l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; |
||

984 | h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
||

985 | *((uint32_t *) block) = |
||

986 | h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); |
||

987 | pixels += line_size; |
||

988 | block += line_size; |
||

989 | } pixels += 4 - line_size * (h + 1); |
||

990 | ```
block += 4 - line_size * h;
``` |
||

991 | } |
||

992 | |||

993 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
``` |

994 | fe50f385 | Romain Dolbeau | |

995 | #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
||

996 | register int i; |
||

997 | register vector unsigned char |
||

998 | pixelsv1, pixelsv2, |
||

999 | pixelsavg; |
||

1000 | register vector unsigned char |
||

1001 | blockv, temp1, temp2; |
||

1002 | register vector unsigned short |
||

1003 | pixelssum1, pixelssum2, temp3; |
||

1004 | 3b991c54 | Romain Dolbeau | register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); |

1005 | register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); |
||

1006 | register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); |
||

1007 | fe50f385 | Romain Dolbeau | |

1008 | ```
temp1 = vec_ld(0, pixels);
``` |
||

1009 | ```
temp2 = vec_ld(16, pixels);
``` |
||

1010 | ```
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
``` |
||

1011 | if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) |
||

1012 | { |
||

1013 | pixelsv2 = temp2; |
||

1014 | } |
||

1015 | ```
else
``` |
||

1016 | { |
||

1017 | ```
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
``` |
||

1018 | } |
||

1019 | pixelsv1 = vec_mergeh(vczero, pixelsv1); |
||

1020 | pixelsv2 = vec_mergeh(vczero, pixelsv2); |
||

1021 | pixelssum1 = vec_add((vector unsigned short)pixelsv1, |
||

1022 | (vector unsigned short)pixelsv2); |
||

1023 | pixelssum1 = vec_add(pixelssum1, vcone); |
||

1024 | |||

1025 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
``` |

1026 | fe50f385 | Romain Dolbeau | for (i = 0; i < h ; i++) { |

1027 | int rightside = ((unsigned long)block & 0x0000000F); |
||

1028 | ```
blockv = vec_ld(0, block);
``` |
||

1029 | |||

1030 | temp1 = vec_ld(line_size, pixels); |
||

1031 | ```
temp2 = vec_ld(line_size + 16, pixels);
``` |
||

1032 | pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); |
||

1033 | if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) |
||

1034 | { |
||

1035 | pixelsv2 = temp2; |
||

1036 | } |
||

1037 | ```
else
``` |
||

1038 | { |
||

1039 | ```
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
``` |
||

1040 | } |
||

1041 | |||

1042 | pixelsv1 = vec_mergeh(vczero, pixelsv1); |
||

1043 | pixelsv2 = vec_mergeh(vczero, pixelsv2); |
||

1044 | pixelssum2 = vec_add((vector unsigned short)pixelsv1, |
||

1045 | (vector unsigned short)pixelsv2); |
||

1046 | temp3 = vec_add(pixelssum1, pixelssum2); |
||

1047 | temp3 = vec_sra(temp3, vctwo); |
||

1048 | pixelssum1 = vec_add(pixelssum2, vcone); |
||

1049 | pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); |
||

1050 | |||

1051 | ```
if (rightside)
``` |
||

1052 | { |
||

1053 | blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); |
||

1054 | } |
||

1055 | ```
else
``` |
||

1056 | { |
||

1057 | blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); |
||

1058 | } |
||

1059 | |||

1060 | ```
vec_st(blockv, 0, block);
``` |
||

1061 | |||

1062 | block += line_size; |
||

1063 | pixels += line_size; |
||

1064 | } |
||

1065 | |||

1066 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
``` |

1067 | fe50f385 | Romain Dolbeau | #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |

1068 | } |
||

1069 | |||

1070 | ```
/* next one assumes that ((line_size % 16) == 0) */
``` |
||

1071 | void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) |
||

1072 | { |
||

1073 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
``` |

1074 | fe50f385 | Romain Dolbeau | ```
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
``` |

1075 | ```
int j;
``` |
||

1076 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
``` |

1077 | fe50f385 | Romain Dolbeau | for (j = 0; j < 4; j++) { |

1078 | ```
int i;
``` |
||

1079 | const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); |
||

1080 | ```
const uint32_t b =
``` |
||

1081 | (((const struct unaligned_32 *) (pixels + 1))->l); |
||

1082 | uint32_t l0 = |
||

1083 | (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; |
||

1084 | uint32_t h0 = |
||

1085 | ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
||

1086 | uint32_t l1, h1; |
||

1087 | pixels += line_size; |
||

1088 | for (i = 0; i < h; i += 2) { |
||

1089 | uint32_t a = (((const struct unaligned_32 *) (pixels))->l); |
||

1090 | uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); |
||

1091 | l1 = (a & 0x03030303UL) + (b & 0x03030303UL); |
||

1092 | h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
||

1093 | *((uint32_t *) block) = |
||

1094 | h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); |
||

1095 | pixels += line_size; |
||

1096 | block += line_size; |
||

1097 | a = (((const struct unaligned_32 *) (pixels))->l); |
||

1098 | b = (((const struct unaligned_32 *) (pixels + 1))->l); |
||

1099 | l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; |
||

1100 | h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
||

1101 | *((uint32_t *) block) = |
||

1102 | h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); |
||

1103 | pixels += line_size; |
||

1104 | block += line_size; |
||

1105 | } pixels += 4 - line_size * (h + 1); |
||

1106 | ```
block += 4 - line_size * h;
``` |
||

1107 | } |
||

1108 | |||

1109 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
``` |

1110 | fe50f385 | Romain Dolbeau | |

1111 | #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
||

1112 | register int i; |
||

1113 | register vector unsigned char |
||

1114 | pixelsv1, pixelsv2, pixelsv3, pixelsv4; |
||

1115 | register vector unsigned char |
||

1116 | blockv, temp1, temp2; |
||

1117 | register vector unsigned short |
||

1118 | pixelssum1, pixelssum2, temp3, |
||

1119 | pixelssum3, pixelssum4, temp4; |
||

1120 | 3b991c54 | Romain Dolbeau | register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); |

1121 | register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); |
||

1122 | 3efd4952 | Romain Dolbeau | |

1123 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
``` |

1124 | 3efd4952 | Romain Dolbeau | |

1125 | fe50f385 | Romain Dolbeau | ```
temp1 = vec_ld(0, pixels);
``` |

1126 | ```
temp2 = vec_ld(16, pixels);
``` |
||

1127 | ```
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
``` |
||

1128 | if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) |
||

1129 | { |
||

1130 | pixelsv2 = temp2; |
||

1131 | } |
||

1132 | ```
else
``` |
||

1133 | { |
||

1134 | ```
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
``` |
||

1135 | } |
||

1136 | pixelsv3 = vec_mergel(vczero, pixelsv1); |
||

1137 | pixelsv4 = vec_mergel(vczero, pixelsv2); |
||

1138 | pixelsv1 = vec_mergeh(vczero, pixelsv1); |
||

1139 | pixelsv2 = vec_mergeh(vczero, pixelsv2); |
||

1140 | pixelssum3 = vec_add((vector unsigned short)pixelsv3, |
||

1141 | (vector unsigned short)pixelsv4); |
||

1142 | pixelssum3 = vec_add(pixelssum3, vctwo); |
||

1143 | pixelssum1 = vec_add((vector unsigned short)pixelsv1, |
||

1144 | (vector unsigned short)pixelsv2); |
||

1145 | pixelssum1 = vec_add(pixelssum1, vctwo); |
||

1146 | |||

1147 | for (i = 0; i < h ; i++) { |
||

1148 | ```
blockv = vec_ld(0, block);
``` |
||

1149 | |||

1150 | temp1 = vec_ld(line_size, pixels); |
||

1151 | ```
temp2 = vec_ld(line_size + 16, pixels);
``` |
||

1152 | pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); |
||

1153 | if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) |
||

1154 | { |
||

1155 | pixelsv2 = temp2; |
||

1156 | } |
||

1157 | ```
else
``` |
||

1158 | { |
||

1159 | ```
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
``` |
||

1160 | } |
||

1161 | |||

1162 | pixelsv3 = vec_mergel(vczero, pixelsv1); |
||

1163 | pixelsv4 = vec_mergel(vczero, pixelsv2); |
||

1164 | pixelsv1 = vec_mergeh(vczero, pixelsv1); |
||

1165 | pixelsv2 = vec_mergeh(vczero, pixelsv2); |
||

1166 | |||

1167 | pixelssum4 = vec_add((vector unsigned short)pixelsv3, |
||

1168 | (vector unsigned short)pixelsv4); |
||

1169 | pixelssum2 = vec_add((vector unsigned short)pixelsv1, |
||

1170 | (vector unsigned short)pixelsv2); |
||

1171 | temp4 = vec_add(pixelssum3, pixelssum4); |
||

1172 | temp4 = vec_sra(temp4, vctwo); |
||

1173 | temp3 = vec_add(pixelssum1, pixelssum2); |
||

1174 | temp3 = vec_sra(temp3, vctwo); |
||

1175 | |||

1176 | pixelssum3 = vec_add(pixelssum4, vctwo); |
||

1177 | pixelssum1 = vec_add(pixelssum2, vctwo); |
||

1178 | |||

1179 | blockv = vec_packsu(temp3, temp4); |
||

1180 | |||

1181 | ```
vec_st(blockv, 0, block);
``` |
||

1182 | |||

1183 | block += line_size; |
||

1184 | pixels += line_size; |
||

1185 | } |
||

1186 | |||

1187 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
``` |

1188 | fe50f385 | Romain Dolbeau | #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |

1189 | } |
||

1190 | |||

1191 | ```
/* next one assumes that ((line_size % 16) == 0) */
``` |
||

1192 | void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) |
||

1193 | { |
||

1194 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
``` |

1195 | fe50f385 | Romain Dolbeau | ```
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
``` |

1196 | ```
int j;
``` |
||

1197 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
``` |

1198 | fe50f385 | Romain Dolbeau | for (j = 0; j < 4; j++) { |

1199 | ```
int i;
``` |
||

1200 | const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); |
||

1201 | ```
const uint32_t b =
``` |
||

1202 | (((const struct unaligned_32 *) (pixels + 1))->l); |
||

1203 | uint32_t l0 = |
||

1204 | (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; |
||

1205 | uint32_t h0 = |
||

1206 | ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
||

1207 | uint32_t l1, h1; |
||

1208 | pixels += line_size; |
||

1209 | for (i = 0; i < h; i += 2) { |
||

1210 | uint32_t a = (((const struct unaligned_32 *) (pixels))->l); |
||

1211 | uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); |
||

1212 | l1 = (a & 0x03030303UL) + (b & 0x03030303UL); |
||

1213 | h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
||

1214 | *((uint32_t *) block) = |
||

1215 | h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); |
||

1216 | pixels += line_size; |
||

1217 | block += line_size; |
||

1218 | a = (((const struct unaligned_32 *) (pixels))->l); |
||

1219 | b = (((const struct unaligned_32 *) (pixels + 1))->l); |
||

1220 | l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; |
||

1221 | h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
||

1222 | *((uint32_t *) block) = |
||

1223 | h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); |
||

1224 | pixels += line_size; |
||

1225 | block += line_size; |
||

1226 | } pixels += 4 - line_size * (h + 1); |
||

1227 | ```
block += 4 - line_size * h;
``` |
||

1228 | } |
||

1229 | |||

1230 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
``` |

1231 | fe50f385 | Romain Dolbeau | |

1232 | #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
||

1233 | register int i; |
||

1234 | register vector unsigned char |
||

1235 | pixelsv1, pixelsv2, pixelsv3, pixelsv4; |
||

1236 | register vector unsigned char |
||

1237 | blockv, temp1, temp2; |
||

1238 | register vector unsigned short |
||

1239 | pixelssum1, pixelssum2, temp3, |
||

1240 | pixelssum3, pixelssum4, temp4; |
||

1241 | 3b991c54 | Romain Dolbeau | register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); |

1242 | register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); |
||

1243 | register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); |
||

1244 | 3efd4952 | Romain Dolbeau | |

1245 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
``` |

1246 | 3efd4952 | Romain Dolbeau | |

1247 | fe50f385 | Romain Dolbeau | ```
temp1 = vec_ld(0, pixels);
``` |

1248 | ```
temp2 = vec_ld(16, pixels);
``` |
||

1249 | ```
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
``` |
||

1250 | if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) |
||

1251 | { |
||

1252 | pixelsv2 = temp2; |
||

1253 | } |
||

1254 | ```
else
``` |
||

1255 | { |
||

1256 | ```
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
``` |
||

1257 | } |
||

1258 | pixelsv3 = vec_mergel(vczero, pixelsv1); |
||

1259 | pixelsv4 = vec_mergel(vczero, pixelsv2); |
||

1260 | pixelsv1 = vec_mergeh(vczero, pixelsv1); |
||

1261 | pixelsv2 = vec_mergeh(vczero, pixelsv2); |
||

1262 | pixelssum3 = vec_add((vector unsigned short)pixelsv3, |
||

1263 | (vector unsigned short)pixelsv4); |
||

1264 | pixelssum3 = vec_add(pixelssum3, vcone); |
||

1265 | pixelssum1 = vec_add((vector unsigned short)pixelsv1, |
||

1266 | (vector unsigned short)pixelsv2); |
||

1267 | pixelssum1 = vec_add(pixelssum1, vcone); |
||

1268 | |||

1269 | for (i = 0; i < h ; i++) { |
||

1270 | ```
blockv = vec_ld(0, block);
``` |
||

1271 | |||

1272 | temp1 = vec_ld(line_size, pixels); |
||

1273 | ```
temp2 = vec_ld(line_size + 16, pixels);
``` |
||

1274 | pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); |
||

1275 | if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) |
||

1276 | { |
||

1277 | pixelsv2 = temp2; |
||

1278 | } |
||

1279 | ```
else
``` |
||

1280 | { |
||

1281 | ```
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
``` |
||

1282 | } |
||

1283 | |||

1284 | pixelsv3 = vec_mergel(vczero, pixelsv1); |
||

1285 | pixelsv4 = vec_mergel(vczero, pixelsv2); |
||

1286 | pixelsv1 = vec_mergeh(vczero, pixelsv1); |
||

1287 | pixelsv2 = vec_mergeh(vczero, pixelsv2); |
||

1288 | |||

1289 | pixelssum4 = vec_add((vector unsigned short)pixelsv3, |
||

1290 | (vector unsigned short)pixelsv4); |
||

1291 | pixelssum2 = vec_add((vector unsigned short)pixelsv1, |
||

1292 | (vector unsigned short)pixelsv2); |
||

1293 | temp4 = vec_add(pixelssum3, pixelssum4); |
||

1294 | temp4 = vec_sra(temp4, vctwo); |
||

1295 | temp3 = vec_add(pixelssum1, pixelssum2); |
||

1296 | temp3 = vec_sra(temp3, vctwo); |
||

1297 | |||

1298 | pixelssum3 = vec_add(pixelssum4, vcone); |
||

1299 | pixelssum1 = vec_add(pixelssum2, vcone); |
||

1300 | |||

1301 | blockv = vec_packsu(temp3, temp4); |
||

1302 | |||

1303 | ```
vec_st(blockv, 0, block);
``` |
||

1304 | |||

1305 | block += line_size; |
||

1306 | pixels += line_size; |
||

1307 | } |
||

1308 | |||

1309 | e45a2872 | Romain Dolbeau | ```
POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
``` |

1310 | fe50f385 | Romain Dolbeau | #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |

1311 | } |
||

1312 | |||

1313 | 59925ef2 | Brian Foley | int has_altivec(void) |

1314 | { |
||

1315 | 3b991c54 | Romain Dolbeau | ```
#ifdef CONFIG_DARWIN
``` |

1316 | 59925ef2 | Brian Foley | int sels[2] = {CTL_HW, HW_VECTORUNIT}; |

1317 | int has_vu = 0; |
||

1318 | ```
size_t len = sizeof(has_vu);
``` |
||

1319 | ```
int err;
``` |
||

1320 | |||

1321 | err = sysctl(sels, 2, &has_vu, &len, NULL, 0); |
||

1322 | |||

1323 | if (err == 0) return (has_vu != 0); |
||

1324 | 3b991c54 | Romain Dolbeau | #else /* CONFIG_DARWIN */ |

1325 | ```
/* no Darwin, do it the brute-force way */
``` |
||

1326 | ```
/* this is borrowed from the libmpeg2 library */
``` |
||

1327 | { |
||

1328 | signal (SIGILL, sigill_handler); |
||

1329 | if (sigsetjmp (jmpbuf, 1)) { |
||

1330 | signal (SIGILL, SIG_DFL); |
||

1331 | ```
} else {
``` |
||

1332 | ```
canjump = 1;
``` |
||

1333 | |||

1334 | asm volatile ("mtspr 256, %0\n\t" |
||

1335 | ```
"vand %%v0, %%v0, %%v0"
``` |
||

1336 | : |
||

1337 | : "r" (-1)); |
||

1338 | |||

1339 | signal (SIGILL, SIG_DFL); |
||

1340 | return 1; |
||

1341 | } |
||

1342 | } |
||

1343 | #endif /* CONFIG_DARWIN */ |
||

1344 | 59925ef2 | Brian Foley | return 0; |

1345 | } |