## ffmpeg / postproc / yuv2rgb_altivec.c @ 20b02bc6

History | View | Annotate | Download (28.3 KB)

1 | a31de956 | Michael Niedermayer | ```
/*
``` |
---|---|---|---|

2 | ```
marc.hoffman@analog.com March 8, 2004
``` |
||

3 | |||

4 | ```
Altivec Acceleration for Color Space Conversion revision 0.2
``` |
||

5 | |||

6 | ```
convert I420 YV12 to RGB in various formats,
``` |
||

7 | ```
it rejects images that are not in 420 formats
``` |
||

8 | ```
it rejects images that don't have widths of multiples of 16
``` |
||

9 | ```
it rejects images that don't have heights of multiples of 2
``` |
||

10 | ```
reject defers to C simulation codes.
``` |
||

11 | |||

12 | ```
lots of optimizations to be done here
``` |
||

13 | |||

14 | ```
1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
``` |
||

15 | ```
so we currently use max min to clip
``` |
||

16 | |||

17 | ```
2. the inefficient use of chroma loading needs a bit of brushing up
``` |
||

18 | |||

19 | ```
3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
``` |
||

20 | |||

21 | |||

22 | ```
MODIFIED to calculate coeffs from currently selected color space.
``` |
||

23 | ```
MODIFIED core to be a macro which you spec the output format.
``` |
||

24 | ```
ADDED UYVY conversion which is never called due to some thing in SWSCALE.
``` |
||

25 | ```
CORRECTED algorithm selection to be strict on input formats.
``` |
||

26 | ```
ADDED runtime detection of altivec.
``` |
||

27 | |||

28 | ```
ADDED altivec_yuv2packedX vertical scl + RGB converter
``` |
||

29 | |||

30 | ```
March 27,2004
``` |
||

31 | ```
PERFORMANCE ANALYSIS
``` |
||

32 | |||

33 | ```
The C version uses 25% of the processor or ~250Mips for D1 video rawvideo used as test
``` |
||

34 | ```
The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
``` |
||

35 | |||

36 | ```
720*480*30 ~10MPS
``` |
||

37 | |||

38 | ```
so we have roughly 10clocks per pixel this is too high something has to be wrong.
``` |
||

39 | |||

40 | ```
OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
``` |
||

41 | |||

42 | ```
OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
``` |
||

43 | ```
guaranteed to have the input video frame it was just decompressed so
``` |
||

44 | ```
it probably resides in L1 caches. However we are creating the
``` |
||

45 | ```
output video stream this needs to use the DSTST instruction to
``` |
||

46 | ```
optimize for the cache. We couple this with the fact that we are
``` |
||

47 | ```
not going to be visiting the input buffer again so we mark it Least
``` |
||

48 | ```
Recently Used. This shaves 25% of the processor cycles off.
``` |
||

49 | |||

50 | ```
Now MEMCPY is the largest mips consumer in the system, probably due
``` |
||

51 | ```
to the inefficient X11 stuff.
``` |
||

52 | |||

53 | ```
GL libraries seem to be very slow on this machine 1.33Ghz PB running
``` |
||

54 | ```
Jaguar, this is not the case for my 1Ghz PB. I thought it might be
``` |
||

55 | ```
a versioning issues, however i have libGL.1.2.dylib for both
``` |
||

56 | ```
machines. ((We need to figure this out now))
``` |
||

57 | |||

58 | ```
GL2 libraries work now with patch for RGB32
``` |
||

59 | |||

60 | ```
NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
``` |
||

61 | |||

62 | ```
Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
``` |
||

63 | |||

64 | ```
*/
``` |
||

65 | #include <stdio.h> |
||

66 | #include <stdlib.h> |
||

67 | 84fdd642 | Alex Beregszaszi | #include <string.h> |

68 | a31de956 | Michael Niedermayer | #include <inttypes.h> |

69 | #include <assert.h> |
||

70 | #include "config.h" |
||

71 | #include "rgb2rgb.h" |
||

72 | #include "swscale.h" |
||

73 | #include "swscale_internal.h" |
||

74 | #include "../mangle.h" |
||

75 | #include "../libvo/img_format.h" //FIXME try to reduce dependency of such stuff |
||

76 | |||

77 | ```
#undef PROFILE_THE_BEAST
``` |
||

78 | ```
#undef INC_SCALING
``` |
||

79 | |||

/* Short aliases for the 8-bit sample types used throughout this file. */
typedef unsigned char ubyte;    /* unsigned 8-bit sample */
typedef signed char   sbyte;    /* signed 8-bit sample   */

82 | |||

83 | |||

84 | ```
/* RGB interleaver, 16 planar pels 8-bit samples per channel in
``` |
||

85 | ```
homogeneous vector registers x0,x1,x2 are interleaved with the
``` |
||

86 | ```
following technique:
``` |
||

87 | |||

88 | ```
o0 = vec_mergeh (x0,x1);
``` |
||

89 | ```
o1 = vec_perm (o0, x2, perm_rgb_0);
``` |
||

90 | ```
o2 = vec_perm (o0, x2, perm_rgb_1);
``` |
||

91 | ```
o3 = vec_mergel (x0,x1);
``` |
||

92 | ```
o4 = vec_perm (o3,o2,perm_rgb_2);
``` |
||

93 | ```
o5 = vec_perm (o3,o2,perm_rgb_3);
``` |
||

94 | |||

95 | ```
perm_rgb_0: o0(RG).h v1(B) --> o1*
``` |
||

96 | ```
0 1 2 3 4
``` |
||

97 | ```
rgbr|gbrg|brgb|rgbr
``` |
||

98 | ```
0010 0100 1001 0010
``` |
||

99 | ```
0102 3145 2673 894A
``` |
||

100 | |||

101 | ```
perm_rgb_1: o0(RG).h v1(B) --> o2
``` |
||

102 | ```
0 1 2 3 4
``` |
||

103 | ```
gbrg|brgb|bbbb|bbbb
``` |
||

104 | ```
0100 1001 1111 1111
``` |
||

105 | ```
B5CD 6EF7 89AB CDEF
``` |
||

106 | |||

107 | ```
perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
``` |
||

108 | ```
0 1 2 3 4
``` |
||

109 | ```
gbrg|brgb|rgbr|gbrg
``` |
||

110 | ```
1111 1111 0010 0100
``` |
||

111 | ```
89AB CDEF 0182 3945
``` |
||

112 | |||

113 | ```
perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5*
``` |
||

114 | ```
0 1 2 3 4
``` |
||

115 | ```
brgb|rgbr|gbrg|brgb
``` |
||

116 | ```
1001 0010 0100 1001
``` |
||

117 | ```
a67b 89cA BdCD eEFf
``` |
||

118 | |||

119 | ```
*/
``` |
||

120 | ```
static
``` |
||

121 | const vector unsigned char |
||

122 | 582552fb | Luca Barbato | perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05, |

123 | a31de956 | Michael Niedermayer | 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a), |

124 | 582552fb | Luca Barbato | perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17, |

125 | a31de956 | Michael Niedermayer | 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f), |

126 | 582552fb | Luca Barbato | perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, |

127 | a31de956 | Michael Niedermayer | 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05), |

128 | 582552fb | Luca Barbato | perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a, |

129 | a31de956 | Michael Niedermayer | 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f); |

130 | |||

/*
 * vec_merge3(x2,x1,x0, y0,y1,y2)
 *
 * Interleave three planar 16-byte vectors into 48 packed bytes, returned in
 * y0,y1,y2 (y0 holds the first 16 bytes).
 *
 * Mind the parameter order: the THIRD argument (x0) supplies the first byte
 * of every output triplet, the second argument (x1) the middle byte and the
 * FIRST argument (x2) the last byte.
 *
 * y0,y1,y2 must be assignable lvalues of the same vector type as x0.  The
 * inputs are expanded more than once, so pass plain variables only.
 */
#define vec_merge3(x2,x1,x0,y0,y1,y2)                   \
do {                                                    \
    typeof(x0) pair_hi, carry, pair_lo;                 \
    pair_hi = vec_mergeh (x0,x1);                       \
    y0      = vec_perm (pair_hi, x2, perm_rgb_0);       \
    carry   = vec_perm (pair_hi, x2, perm_rgb_1);       \
    pair_lo = vec_mergel (x0,x1);                       \
    y1      = vec_perm (pair_lo, carry, perm_rgb_2);    \
    y2      = vec_perm (pair_lo, carry, perm_rgb_3);    \
} while(0)

141 | |||

/*
 * vec_mstrgb24(x0,x1,x2,ptr)
 *
 * Store 16 pixels as packed 24-bit triplets whose bytes are, in memory
 * order, x0,x1,x2 (so (R,G,B) is written as R,G,B).  ptr must be a
 * modifiable pointer to vector; it is advanced by three vectors (48 bytes).
 *
 * vec_merge3 takes the channel that goes LAST in memory as its first
 * argument, hence the reversed argument list below.  Passing x0,x1,x2
 * straight through wrote B,G,R and made the rgb24 converter produce the
 * same bytes as the bgr24 one.
 *
 * No trailing ';' after while(0): the caller supplies it, which keeps the
 * macro usable as a single statement (e.g. in an unbraced if/else).
 */
#define vec_mstrgb24(x0,x1,x2,ptr)      \
do {                                    \
    typeof(x0) _0,_1,_2;                \
    vec_merge3 (x2,x1,x0,_0,_1,_2);     \
    vec_st (_0, 0, ptr++);              \
    vec_st (_1, 0, ptr++);              \
    vec_st (_2, 0, ptr++);              \
} while (0)

150 | |||

/*
 * vec_mstbgr24(x0,x1,x2,ptr)
 *
 * Store 16 pixels as packed 24-bit triplets whose bytes are, in memory
 * order, x0,x1,x2 (vec_merge3 takes the last-in-memory channel first, so
 * the arguments are handed over reversed).  The out_bgr24 wrapper calls this
 * with the channels already swapped to B,G,R.
 *
 * ptr must be a modifiable pointer to vector; it is advanced by three
 * vectors (48 bytes).
 *
 * No trailing ';' after while(0): the caller supplies it, which keeps the
 * macro usable as a single statement (e.g. in an unbraced if/else).
 */
#define vec_mstbgr24(x0,x1,x2,ptr)      \
do {                                    \
    typeof(x0) _0,_1,_2;                \
    vec_merge3 (x2,x1,x0,_0,_1,_2);     \
    vec_st (_0, 0, ptr++);              \
    vec_st (_1, 0, ptr++);              \
    vec_st (_2, 0, ptr++);              \
} while (0)

159 | |||

/* pack the pixels in rgb0 format
   msb R
   lsb 0

   vec_mstrgb32(T,x0,x1,x2,x3,ptr)

   Interleave four planar 16-byte vectors of type T into 16 four-byte pixels
   whose bytes are, in memory order, x0,x1,x2,x3, and store the 64 bytes at
   ptr.  ptr must be a modifiable pointer to vector; it is advanced by four
   vectors.

   No trailing ';' after while(0): the caller supplies it, which keeps the
   macro usable as a single statement (e.g. in an unbraced if/else).
*/
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
do {                                                                          \
    T _0,_1,_2,_3;                                                            \
    /* pixels 0..7: byte pairs (x0,x1) and (x2,x3), then merged as shorts */  \
    _0 = vec_mergeh (x0,x1);                                                  \
    _1 = vec_mergeh (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 0*16, (T *)ptr);                                              \
    vec_st (_3, 1*16, (T *)ptr);                                              \
    /* pixels 8..15 */                                                        \
    _0 = vec_mergel (x0,x1);                                                  \
    _1 = vec_mergel (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 2*16, (T *)ptr);                                              \
    vec_st (_3, 3*16, (T *)ptr);                                              \
    ptr += 4;                                                                 \
} while (0)

181 | |||

182 | ```
/*
``` |
||

183 | |||

184 | ```
| 1 0 1.4021 | | Y |
``` |
||

185 | ```
| 1 -0.3441 -0.7142 |x| Cb|
``` |
||

186 | ```
| 1 1.7718 0 | | Cr|
``` |
||

187 | |||

188 | |||

189 | ```
Y: [-128 127]
``` |
||

190 | ```
Cb/Cr : [-128 127]
``` |
||

191 | |||

192 | ```
typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
``` |
||

193 | |||

194 | ```
*/
``` |
||

195 | |||

196 | |||

197 | |||

198 | |||

/*
 * vec_unh(x) / vec_unl(x)
 *
 * Zero-extend the first (vec_unh) or last (vec_unl) eight bytes of the byte
 * vector x to eight 16-bit lanes.  Each output lane is built from selector
 * 0x10 (byte 0 of the all-zero second operand) followed by one byte of x.
 */
#define vec_unh(x) \
  ((vector signed short) \
    vec_perm((x),(typeof(x))AVV(0), \
             (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03, \
                                       0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07)))

#define vec_unl(x) \
  ((vector signed short) \
    vec_perm((x),(typeof(x))AVV(0), \
             (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B, \
                                       0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F)))

209 | |||

/* Clamp every lane of x to the range [16, 235]. */
#define vec_clip(x) \
  (vec_max (vec_min ((x), (typeof(x))AVV(235)), (typeof(x))AVV(16)))

/* Clamp x and y with vec_clip, then narrow both into one byte vector. */
#define vec_packclp_a(x,y) \
  ((vector unsigned char)vec_pack (vec_clip (x), vec_clip (y)))

/*
 * Narrow two signed-short vectors into one unsigned byte vector.
 * Negative lanes are first raised to 0 with vec_max; the result is then
 * reinterpreted as unsigned and packed with vec_packs, which takes care of
 * the upper bound, so no separate vec_min is needed.
 */
#define vec_packclp(x,y) \
  ((vector unsigned char)vec_packs \
      ((vector unsigned short)vec_max ((x),(vector signed short) AVV(0)), \
       (vector unsigned short)vec_max ((y),(vector signed short) AVV(0))))

220 | a31de956 | Michael Niedermayer | |

221 | 582552fb | Luca Barbato | ```
//#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr)
``` |

222 | a31de956 | Michael Niedermayer | |

223 | |||

224 | 84fdd642 | Alex Beregszaszi | static inline void cvtyuvtoRGB (SwsContext *c, |

225 | a31de956 | Michael Niedermayer | vector signed short Y, vector signed short U, vector signed short V, |

226 | vector signed short *R, vector signed short *G, vector signed short *B) |
||

227 | { |
||

228 | vector signed short vx,ux,uvx; |
||

229 | |||

230 | Y = vec_mradds (Y, c->CY, c->OY); |
||

231 | 582552fb | Luca Barbato | U = vec_sub (U,(vector signed short) |

232 | vec_splat((vector signed short)AVV(128),0)); |
||

233 | V = vec_sub (V,(vector signed short) |
||

234 | vec_splat((vector signed short)AVV(128),0)); |
||

235 | a31de956 | Michael Niedermayer | |

236 | ```
// ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
``` |
||

237 | ux = vec_sl (U, c->CSHIFT); |
||

238 | *B = vec_mradds (ux, c->CBU, Y); |
||

239 | |||

240 | ```
// vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
``` |
||

241 | vx = vec_sl (V, c->CSHIFT); |
||

242 | *R = vec_mradds (vx, c->CRV, Y); |
||

243 | |||

244 | ```
// uvx = ((CGU*u) + (CGV*v))>>15;
``` |
||

245 | uvx = vec_mradds (U, c->CGU, Y); |
||

246 | *G = vec_mradds (V, c->CGV, uvx); |
||

247 | } |
||

248 | |||

249 | |||

250 | ```
/*
``` |
||

251 | ```
------------------------------------------------------------------------------
``` |
||

252 | ```
CS converters
``` |
||

253 | ```
------------------------------------------------------------------------------
``` |
||

254 | ```
*/
``` |
||

255 | |||

256 | |||

/*
 * DEFCSP420_CVT(name, out_pixels)
 *
 * Defines  static int altivec_<name>(SwsContext *c, in, instrides,
 *                                    srcSliceY, srcSliceH, oplanes, outstrides)
 * which converts a slice of planar YUV with 2x2 subsampled chroma
 * (in[0]=Y, in[1]=U, in[2]=V) to packed RGB written to oplanes[0], starting
 * at output row srcSliceY.  Returns srcSliceH.
 *
 * out_pixels(R,G,B,ptr) is the store macro (out_rgb24, out_argb, ...) that
 * packs 16 pixels and advances ptr.
 *
 * Two luma rows share one chroma row, so rows are processed in pairs
 * ("e"ven / "o"dd output pointers); 16 pixels are handled per inner
 * iteration, so only w/16*16 columns and h/2*2 rows are converted.
 *
 * Stride handling: the second luma row is taken at in[0]+instrides[0] and
 * every row pair advances by 2*instrides[0]; the output pointers are
 * recomputed from outstrides[0] for every row pair.  The previous code used
 * in[0]+w and advanced by (row bytes + stride), which is only correct when
 * each stride equals the packed row size.
 *
 * NOTE(review): luma is loaded with vec_ldl and pixels are stored with
 * vec_st, so luma and output rows are presumably expected to be 16-byte
 * aligned -- confirm against the callers.
 */
#define DEFCSP420_CVT(name,out_pixels)                                     \
static int altivec_##name (SwsContext *c,                                  \
                           unsigned char **in, int *instrides,             \
                           int srcSliceY, int srcSliceH,                   \
                           unsigned char **oplanes, int *outstrides)       \
{                                                                          \
    int w = c->srcW;                                                       \
    int h = srcSliceH;                                                     \
    int i,j;                                                               \
    int instrides_scl[3];                                                  \
    vector unsigned char y0,y1;                                            \
                                                                           \
    vector signed char  u,v;                                               \
                                                                           \
    vector signed short Y0,Y1,Y2,Y3;                                       \
    vector signed short U,V;                                               \
    vector signed short vx,ux,uvx;                                         \
    vector signed short vx0,ux0,uvx0;                                      \
    vector signed short vx1,ux1,uvx1;                                      \
    vector signed short R0,G0,B0;                                          \
    vector signed short R1,G1,B1;                                          \
    vector unsigned char R,G,B;                                            \
                                                                           \
    vector unsigned char *uivP, *vivP;                                     \
    vector unsigned char align_perm;                                       \
                                                                           \
    /* local copies of the conversion coefficients */                      \
    vector signed short                                                    \
        lCY  = c->CY,                                                      \
        lOY  = c->OY,                                                      \
        lCRV = c->CRV,                                                     \
        lCBU = c->CBU,                                                     \
        lCGU = c->CGU,                                                     \
        lCGV = c->CGV;                                                     \
                                                                           \
    vector unsigned short lCSHIFT = c->CSHIFT;                             \
                                                                           \
    ubyte *y1i   = in[0];                                                  \
    ubyte *y2i   = in[0]+instrides[0];                                     \
    ubyte *ui    = in[1];                                                  \
    ubyte *vi    = in[2];                                                  \
                                                                           \
    vector unsigned char *oute;                                            \
    vector unsigned char *outo;                                            \
                                                                           \
    /* end-of-row-pair adjustments: the inner loop has already moved */    \
    /* y1i/y2i by w and ui/vi by w/2                                 */    \
    instrides_scl[0] = instrides[0]*2-w;                                   \
    instrides_scl[1] = instrides[1]-w/2;                                   \
    instrides_scl[2] = instrides[2]-w/2;                                   \
                                                                           \
    for (i=0;i<h/2;i++) {                                                  \
        oute = (vector unsigned char *)                                    \
            (oplanes[0]+(srcSliceY+2*i)*outstrides[0]);                    \
        outo = (vector unsigned char *)                                    \
            (oplanes[0]+(srcSliceY+2*i+1)*outstrides[0]);                  \
                                                                           \
        /* cache hints for the two output rows about to be written */      \
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);             \
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);             \
                                                                           \
        for (j=0;j<w/16;j++) {                                             \
                                                                           \
            y0 = vec_ldl (0,y1i);                                          \
            y1 = vec_ldl (0,y2i);                                          \
            uivP = (vector unsigned char *)ui;                             \
            vivP = (vector unsigned char *)vi;                             \
                                                                           \
            /* chroma advances 8 bytes per step: unaligned load */         \
            align_perm = vec_lvsl (0, ui);                                 \
            u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm); \
                                                                           \
            align_perm = vec_lvsl (0, vi);                                 \
            v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm); \
                                                                           \
            /* remove the +128 chroma bias */                              \
            u = (vector signed char)                                       \
                vec_sub (u,(vector signed char)                            \
                           vec_splat((vector signed char)AVV(128),0));     \
            v = (vector signed char)                                       \
                vec_sub (v,(vector signed char)                            \
                           vec_splat((vector signed char)AVV(128),0));     \
                                                                           \
            U = vec_unpackh (u);                                           \
            V = vec_unpackh (v);                                           \
                                                                           \
            Y0 = vec_unh (y0);                                             \
            Y1 = vec_unl (y0);                                             \
            Y2 = vec_unh (y1);                                             \
            Y3 = vec_unl (y1);                                             \
                                                                           \
            Y0 = vec_mradds (Y0, lCY, lOY);                                \
            Y1 = vec_mradds (Y1, lCY, lOY);                                \
            Y2 = vec_mradds (Y2, lCY, lOY);                                \
            Y3 = vec_mradds (Y3, lCY, lOY);                                \
                                                                           \
            /* ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                       \
            ux = vec_sl (U, lCSHIFT);                                      \
            ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));       \
            ux0  = vec_mergeh (ux,ux);                                     \
            ux1  = vec_mergel (ux,ux);                                     \
                                                                           \
            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15; */                      \
            vx = vec_sl (V, lCSHIFT);                                      \
            vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));       \
            vx0  = vec_mergeh (vx,vx);                                     \
            vx1  = vec_mergel (vx,vx);                                     \
                                                                           \
            /* uvx = ((CGU*u) + (CGV*v))>>15 */                            \
            uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));       \
            uvx = vec_mradds (V, lCGV, uvx);                               \
            uvx0 = vec_mergeh (uvx,uvx);                                   \
            uvx1 = vec_mergel (uvx,uvx);                                   \
                                                                           \
            /* even row */                                                 \
            R0 = vec_add (Y0,vx0);                                         \
            G0 = vec_add (Y0,uvx0);                                        \
            B0 = vec_add (Y0,ux0);                                         \
            R1 = vec_add (Y1,vx1);                                         \
            G1 = vec_add (Y1,uvx1);                                        \
            B1 = vec_add (Y1,ux1);                                         \
                                                                           \
            R  = vec_packclp (R0,R1);                                      \
            G  = vec_packclp (G0,G1);                                      \
            B  = vec_packclp (B0,B1);                                      \
                                                                           \
            out_pixels(R,G,B,oute);                                        \
                                                                           \
            /* odd row: same chroma terms, second luma row */              \
            R0 = vec_add (Y2,vx0);                                         \
            G0 = vec_add (Y2,uvx0);                                        \
            B0 = vec_add (Y2,ux0);                                         \
            R1 = vec_add (Y3,vx1);                                         \
            G1 = vec_add (Y3,uvx1);                                        \
            B1 = vec_add (Y3,ux1);                                         \
            R  = vec_packclp (R0,R1);                                      \
            G  = vec_packclp (G0,G1);                                      \
            B  = vec_packclp (B0,B1);                                      \
                                                                           \
            out_pixels(R,G,B,outo);                                        \
                                                                           \
            y1i  += 16;                                                    \
            y2i  += 16;                                                    \
            ui   += 8;                                                     \
            vi   += 8;                                                     \
                                                                           \
        }                                                                  \
                                                                           \
        ui    += instrides_scl[1];                                         \
        vi    += instrides_scl[2];                                         \
        y1i   += instrides_scl[0];                                         \
        y2i   += instrides_scl[0];                                         \
    }                                                                      \
    return srcSliceH;                                                      \
}

410 | |||

411 | |||

/*
 * Store macros handed to DEFCSP420_CVT as its out_pixels argument.
 * Each takes the three packed channel vectors (r,g,b) and an output pointer.
 *
 * 32-bit variants: the name spells the order in which the four bytes are
 * passed to vec_mstrgb32; the alpha/pad byte is always 0.
 * 24-bit variants: forwarded to vec_mstrgb24 / vec_mstbgr24; note that
 * out_bgr24 passes the channels reversed (b,g,r).
 */
#define out_abgr(r,g,b,ptr)  vec_mstrgb32(typeof(r),((typeof (r))AVV(0)),b,g,r,ptr)
#define out_bgra(r,g,b,ptr)  vec_mstrgb32(typeof(r),b,g,r,((typeof (r))AVV(0)),ptr)
#define out_rgba(r,g,b,ptr)  vec_mstrgb32(typeof(r),r,g,b,((typeof (r))AVV(0)),ptr)
#define out_argb(r,g,b,ptr)  vec_mstrgb32(typeof(r),((typeof (r))AVV(0)),r,g,b,ptr)
#define out_rgb24(r,g,b,ptr) vec_mstrgb24(r,g,b,ptr)
#define out_bgr24(r,g,b,ptr) vec_mstbgr24(b,g,r,ptr)

418 | a31de956 | Michael Niedermayer | |

419 | DEFCSP420_CVT (yuv2_abgr32, out_abgr) |
||

420 | 582552fb | Luca Barbato | #if 1 |

421 | a31de956 | Michael Niedermayer | DEFCSP420_CVT (yuv2_bgra32, out_argb) |

422 | 582552fb | Luca Barbato | ```
#else
``` |

423 | static int altivec_yuv2_bgra32 (SwsContext *c, |
||

424 | unsigned char **in, int *instrides, |
||

425 | int srcSliceY, int srcSliceH, |
||

426 | unsigned char **oplanes, int *outstrides) |
||

427 | { |
||

428 | ```
int w = c->srcW;
``` |
||

429 | ```
int h = srcSliceH;
``` |
||

430 | ```
int i,j;
``` |
||

431 | int instrides_scl[3]; |
||

432 | vector unsigned char y0,y1; |
||

433 | |||

434 | vector signed char u,v; |
||

435 | |||

436 | vector signed short Y0,Y1,Y2,Y3; |
||

437 | vector signed short U,V; |
||

438 | vector signed short vx,ux,uvx; |
||

439 | vector signed short vx0,ux0,uvx0; |
||

440 | vector signed short vx1,ux1,uvx1; |
||

441 | vector signed short R0,G0,B0; |
||

442 | vector signed short R1,G1,B1; |
||

443 | vector unsigned char R,G,B; |
||

444 | |||

445 | vector unsigned char *uivP, *vivP; |
||

446 | vector unsigned char align_perm; |
||

447 | |||

448 | vector signed short |
||

449 | lCY = c->CY, |
||

450 | lOY = c->OY, |
||

451 | lCRV = c->CRV, |
||

452 | lCBU = c->CBU, |
||

453 | lCGU = c->CGU, |
||

454 | lCGV = c->CGV; |
||

455 | |||

456 | vector unsigned short lCSHIFT = c->CSHIFT; |
||

457 | |||

458 | ```
ubyte *y1i = in[0];
``` |
||

459 | ```
ubyte *y2i = in[0]+w;
``` |
||

460 | ```
ubyte *ui = in[1];
``` |
||

461 | ```
ubyte *vi = in[2];
``` |
||

462 | |||

463 | vector unsigned char *oute |
||

464 | = (vector unsigned char *) |
||

465 | (oplanes[0]+srcSliceY*outstrides[0]); |
||

466 | vector unsigned char *outo |
||

467 | = (vector unsigned char *) |
||

468 | (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); |
||

469 | |||

470 | |||

471 | instrides_scl[0] = instrides[0]; |
||

472 | instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ |
||

473 | instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ |
||

474 | |||

475 | |||

476 | for (i=0;i<h/2;i++) { |
||

477 | vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); |
||

478 | vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); |
||

479 | |||

480 | for (j=0;j<w/16;j++) { |
||

481 | |||

482 | ```
y0 = vec_ldl (0,y1i);
``` |
||

483 | ```
y1 = vec_ldl (0,y2i);
``` |
||

484 | uivP = (vector unsigned char *)ui; |
||

485 | vivP = (vector unsigned char *)vi; |
||

486 | |||

487 | ```
align_perm = vec_lvsl (0, ui);
``` |
||

488 | u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm); |
||

489 | |||

490 | ```
align_perm = vec_lvsl (0, vi);
``` |
||

491 | v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm); |
||

492 | u = (vector signed char) |
||

493 | vec_sub (u,(vector signed char) |
||

494 | vec_splat((vector signed char)AVV(128),0)); |
||

495 | |||

496 | v = (vector signed char) |
||

497 | vec_sub (v, (vector signed char) |
||

498 | vec_splat((vector signed char)AVV(128),0)); |
||

499 | |||

500 | U = vec_unpackh (u); |
||

501 | V = vec_unpackh (v); |
||

502 | |||

503 | |||

504 | Y0 = vec_unh (y0); |
||

505 | Y1 = vec_unl (y0); |
||

506 | Y2 = vec_unh (y1); |
||

507 | Y3 = vec_unl (y1); |
||

508 | |||

509 | Y0 = vec_mradds (Y0, lCY, lOY); |
||

510 | Y1 = vec_mradds (Y1, lCY, lOY); |
||

511 | Y2 = vec_mradds (Y2, lCY, lOY); |
||

512 | Y3 = vec_mradds (Y3, lCY, lOY); |
||

513 | |||

514 | ```
/* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */
``` |
||

515 | ux = vec_sl (U, lCSHIFT); |
||

516 | ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0)); |
||

517 | ux0 = vec_mergeh (ux,ux); |
||

518 | ux1 = vec_mergel (ux,ux); |
||

519 | |||

520 | ```
/* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */
``` |
||

521 | vx = vec_sl (V, lCSHIFT); |
||

522 | vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0)); |
||

523 | vx0 = vec_mergeh (vx,vx); |
||

524 | vx1 = vec_mergel (vx,vx); |
||

525 | ```
/* uvx = ((CGU*u) + (CGV*v))>>15 */
``` |
||

526 | uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0)); |
||

527 | uvx = vec_mradds (V, lCGV, uvx); |
||

528 | uvx0 = vec_mergeh (uvx,uvx); |
||

529 | uvx1 = vec_mergel (uvx,uvx); |
||

530 | R0 = vec_add (Y0,vx0); |
||

531 | G0 = vec_add (Y0,uvx0); |
||

532 | B0 = vec_add (Y0,ux0); |
||

533 | R1 = vec_add (Y1,vx1); |
||

534 | G1 = vec_add (Y1,uvx1); |
||

535 | B1 = vec_add (Y1,ux1); |
||

536 | R = vec_packclp (R0,R1); |
||

537 | G = vec_packclp (G0,G1); |
||

538 | B = vec_packclp (B0,B1); |
||

539 | |||

540 | out_argb(R,G,B,oute); |
||

541 | R0 = vec_add (Y2,vx0); |
||

542 | G0 = vec_add (Y2,uvx0); |
||

543 | B0 = vec_add (Y2,ux0); |
||

544 | R1 = vec_add (Y3,vx1); |
||

545 | G1 = vec_add (Y3,uvx1); |
||

546 | B1 = vec_add (Y3,ux1); |
||

547 | R = vec_packclp (R0,R1); |
||

548 | G = vec_packclp (G0,G1); |
||

549 | B = vec_packclp (B0,B1); |
||

550 | |||

551 | out_argb(R,G,B,outo); |
||

552 | ```
y1i += 16;
``` |
||

553 | ```
y2i += 16;
``` |
||

554 | ```
ui += 8;
``` |
||

555 | ```
vi += 8;
``` |
||

556 | |||

557 | } |
||

558 | |||

559 | outo += (outstrides[0])>>4; |
||

560 | oute += (outstrides[0])>>4; |
||

561 | |||

562 | ```
ui += instrides_scl[1];
``` |
||

563 | ```
vi += instrides_scl[2];
``` |
||

564 | ```
y1i += instrides_scl[0];
``` |
||

565 | ```
y2i += instrides_scl[0];
``` |
||

566 | } |
||

567 | ```
return srcSliceH;
``` |
||

568 | } |
||

569 | |||

570 | ```
#endif
``` |
||

571 | |||

572 | |||

573 | a31de956 | Michael Niedermayer | DEFCSP420_CVT (yuv2_rgba32, out_rgba) |

574 | DEFCSP420_CVT (yuv2_argb32, out_argb) |
||

575 | DEFCSP420_CVT (yuv2_rgb24, out_rgb24) |
||

576 | DEFCSP420_CVT (yuv2_bgr24, out_bgr24) |
||

577 | |||

578 | |||

579 | ```
// uyvy|uyvy|uyvy|uyvy
``` |
||

580 | ```
// 0123 4567 89ab cdef
``` |
||

581 | ```
static
``` |
||

582 | const vector unsigned char |
||

583 | 582552fb | Luca Barbato | demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00, |

584 | a31de956 | Michael Niedermayer | 0x10,0x04,0x10,0x04, |

585 | 0x10,0x08,0x10,0x08, |
||

586 | 0x10,0x0c,0x10,0x0c), |
||

587 | 582552fb | Luca Barbato | demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02, |

588 | a31de956 | Michael Niedermayer | 0x10,0x06,0x10,0x06, |

589 | 0x10,0x0A,0x10,0x0A, |
||

590 | 0x10,0x0E,0x10,0x0E), |
||

591 | 582552fb | Luca Barbato | demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03, |

592 | a31de956 | Michael Niedermayer | 0x10,0x05,0x10,0x07, |

593 | 0x10,0x09,0x10,0x0B, |
||

594 | 0x10,0x0D,0x10,0x0F); |
||

595 | |||

596 | ```
/*
``` |
||

597 | ```
this is so I can play live CCIR raw video
``` |
||

598 | ```
*/
``` |
||

599 | static int altivec_uyvy_rgb32 (SwsContext *c, |
||

600 | unsigned char **in, int *instrides, |
||

601 | int srcSliceY, int srcSliceH, |
||

602 | unsigned char **oplanes, int *outstrides) |
||

603 | { |
||

604 | ```
int w = c->srcW;
``` |
||

605 | ```
int h = srcSliceH;
``` |
||

606 | ```
int i,j;
``` |
||

607 | vector unsigned char uyvy; |
||

608 | vector signed short Y,U,V; |
||

609 | vector signed short vx,ux,uvx; |
||

610 | vector signed short R0,G0,B0,R1,G1,B1; |
||

611 | vector unsigned char R,G,B; |
||

612 | vector unsigned char *out; |
||

613 | ubyte *img; |
||

614 | |||

615 | ```
img = in[0];
``` |
||

616 | out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]); |
||

617 | |||

618 | for (i=0;i<h;i++) { |
||

619 | for (j=0;j<w/16;j++) { |
||

620 | ```
uyvy = vec_ld (0, img);
``` |
||

621 | U = (vector signed short) |
||

622 | 582552fb | Luca Barbato | vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u); |

623 | a31de956 | Michael Niedermayer | |

624 | V = (vector signed short) |
||

625 | 582552fb | Luca Barbato | vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v); |

626 | a31de956 | Michael Niedermayer | |

627 | Y = (vector signed short) |
||

628 | 582552fb | Luca Barbato | vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y); |

629 | a31de956 | Michael Niedermayer | |

630 | cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0); |
||

631 | |||

632 | ```
uyvy = vec_ld (16, img);
``` |
||

633 | U = (vector signed short) |
||

634 | 582552fb | Luca Barbato | vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u); |

635 | a31de956 | Michael Niedermayer | |

636 | V = (vector signed short) |
||

637 | 582552fb | Luca Barbato | vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v); |

638 | a31de956 | Michael Niedermayer | |

639 | Y = (vector signed short) |
||

640 | 582552fb | Luca Barbato | vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y); |

641 | a31de956 | Michael Niedermayer | |

642 | cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1); |
||

643 | |||

644 | R = vec_packclp (R0,R1); |
||

645 | G = vec_packclp (G0,G1); |
||

646 | B = vec_packclp (B0,B1); |
||

647 | |||

648 | ```
// vec_mstbgr24 (R,G,B, out);
``` |
||

649 | out_rgba (R,G,B,out); |
||

650 | |||

651 | ```
img += 32;
``` |
||

652 | } |
||

653 | } |
||

654 | 84fdd642 | Alex Beregszaszi | ```
return srcSliceH;
``` |

655 | a31de956 | Michael Niedermayer | } |

656 | |||

657 | |||

658 | |||

659 | ```
/* Ok currently the acceleration routine only supports
``` |
||

660 | ```
inputs of widths a multiple of 16
``` |
||

661 | ```
and heights a multiple of 2
``` |
||

662 | |||

663 | ```
So we just fall back to the C codes for this.
``` |
||

664 | ```
*/
``` |
||

665 | SwsFunc yuv2rgb_init_altivec (SwsContext *c) |
||

666 | { |
||

667 | ```
if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
``` |
||

668 | return NULL; |
||

669 | |||

670 | ```
/*
``` |
||

671 | ```
and this seems not to matter too much I tried a bunch of
``` |
||

672 | ```
videos with abnormal widths and mplayer crashes else where.
``` |
||

673 | ```
mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
``` |
||

674 | ```
boom with X11 bad match.
``` |
||

675 | ```
``` |
||

676 | ```
*/
``` |
||

677 | if ((c->srcW & 0xf) != 0) return NULL; |
||

678 | |||

679 | ```
switch (c->srcFormat) {
``` |
||

680 | ```
case IMGFMT_YVU9:
``` |
||

681 | ```
case IMGFMT_IF09:
``` |
||

682 | ```
case IMGFMT_YV12:
``` |
||

683 | ```
case IMGFMT_I420:
``` |
||

684 | ```
case IMGFMT_IYUV:
``` |
||

685 | ```
case IMGFMT_CLPL:
``` |
||

686 | ```
case IMGFMT_Y800:
``` |
||

687 | ```
case IMGFMT_Y8:
``` |
||

688 | ```
case IMGFMT_NV12:
``` |
||

689 | ```
case IMGFMT_NV21:
``` |
||

690 | if ((c->srcH & 0x1) != 0) |
||

691 | return NULL; |
||

692 | |||

693 | ```
switch(c->dstFormat){
``` |
||

694 | ```
case IMGFMT_RGB24:
``` |
||

695 | ```
MSG_WARN("ALTIVEC: Color Space RGB24\n");
``` |
||

696 | ```
return altivec_yuv2_rgb24;
``` |
||

697 | ```
case IMGFMT_BGR24:
``` |
||

698 | ```
MSG_WARN("ALTIVEC: Color Space BGR24\n");
``` |
||

699 | ```
return altivec_yuv2_bgr24;
``` |
||

700 | ```
case IMGFMT_RGB32:
``` |
||

701 | ```
MSG_WARN("ALTIVEC: Color Space ARGB32\n");
``` |
||

702 | ```
return altivec_yuv2_argb32;
``` |
||

703 | ```
case IMGFMT_BGR32:
``` |
||

704 | ```
MSG_WARN("ALTIVEC: Color Space BGRA32\n");
``` |
||

705 | ```
// return profile_altivec_bgra32;
``` |
||

706 | |||

707 | ```
return altivec_yuv2_bgra32;
``` |
||

708 | default: return NULL; |
||

709 | } |
||

710 | ```
break;
``` |
||

711 | |||

712 | ```
case IMGFMT_UYVY:
``` |
||

713 | ```
switch(c->dstFormat){
``` |
||

714 | ```
case IMGFMT_RGB32:
``` |
||

715 | ```
MSG_WARN("ALTIVEC: Color Space UYVY -> RGB32\n");
``` |
||

716 | ```
return altivec_uyvy_rgb32;
``` |
||

717 | default: return NULL; |
||

718 | } |
||

719 | ```
break;
``` |
||

720 | |||

721 | } |
||

722 | return NULL; |
||

723 | } |
||

724 | |||

/*
 * Round a 48.16 fixed-point value to the nearest integer and saturate it to
 * the 16-bit signed range.  The result is returned as the raw 16-bit
 * pattern: negative values come back in two's complement, and anything
 * below -0x7FFF saturates to 0x8000.
 *
 * The rounded value is kept in 64 bits until after the range checks.
 * Narrowing it to int first (as this function used to do) let magnitudes of
 * 2^47 and above wrap around instead of saturating.
 */
static uint16_t roundToInt16(int64_t f){
    int64_t r = (f + (1<<15)) >> 16;

    if (r < -0x7FFF)
        return 0x8000;
    if (r > 0x7FFF)
        return 0x7FFF;
    return (uint16_t)r;
}

731 | 84fdd642 | Alex Beregszaszi | |

732 | 582552fb | Luca Barbato | void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation) |

733 | { |
||

734 | ```
union {
``` |
||

735 | signed short tmp[8] __attribute__ ((aligned(16))); |
||

736 | vector signed short vec; |
||

737 | } buf; |
||

738 | |||

739 | buf.tmp[0] = ( (0xffffLL) * contrast>>8 )>>9; //cy |
||

740 | buf.tmp[1] = -256*brightness; //oy |
||

741 | buf.tmp[2] = (inv_table[0]>>3) *(contrast>>16)*(saturation>>16); //crv |
||

742 | buf.tmp[3] = (inv_table[1]>>3) *(contrast>>16)*(saturation>>16); //cbu |
||

743 | buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16)); //cgu |
||

744 | buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16)); //cgv |
||

745 | |||

746 | |||

747 | c->CSHIFT = (vector unsigned short)vec_splat((vector unsigned short)AVV(2),0); |
||

748 | c->CY = vec_splat ((vector signed short)buf.vec, 0); |
||

749 | c->OY = vec_splat ((vector signed short)buf.vec, 1); |
||

750 | c->CRV = vec_splat ((vector signed short)buf.vec, 2); |
||

751 | c->CBU = vec_splat ((vector signed short)buf.vec, 3); |
||

752 | c->CGU = vec_splat ((vector signed short)buf.vec, 4); |
||

753 | c->CGV = vec_splat ((vector signed short)buf.vec, 5); |
||

754 | 84fdd642 | Alex Beregszaszi | ```
#if 0
``` |

755 | 582552fb | Luca Barbato | ```
{
``` |

756 | ```
int i;
``` |
||

757 | ```
char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
``` |
||

758 | ```
for (i=0; i<6;i++)
``` |
||

759 | ```
printf("%s %d ", v[i],buf.tmp[i] );
``` |
||

760 | ```
printf("\n");
``` |
||

761 | ```
}
``` |
||

762 | a31de956 | Michael Niedermayer | ```
#endif
``` |

763 | 84fdd642 | Alex Beregszaszi | ```
return;
``` |

764 | a31de956 | Michael Niedermayer | } |

765 | |||

766 | |||

767 | ```
void
``` |
||

768 | altivec_yuv2packedX (SwsContext *c, |
||

769 | ```
int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
``` |
||

770 | ```
int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
``` |
||

771 | uint8_t *dest, int dstW, int dstY) |
||

772 | { |
||

773 | ```
int i,j;
``` |
||

774 | short tmp __attribute__((aligned (16))); |
||

775 | 582552fb | Luca Barbato | int16_t *p; |

776 | a31de956 | Michael Niedermayer | ```
short *f;
``` |

777 | vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V; |
||

778 | vector signed short R0,G0,B0,R1,G1,B1; |
||

779 | |||

780 | vector unsigned char R,G,B,pels[3]; |
||

781 | vector unsigned char *out,*nout; |
||

782 | 582552fb | Luca Barbato | |

783 | vector signed short RND = vec_splat((vector signed short)AVV(1<<3),0); |
||

784 | vector unsigned short SCL = vec_splat((vector unsigned short)AVV(4),0); |
||

785 | a31de956 | Michael Niedermayer | unsigned long scratch[16] __attribute__ ((aligned (16))); |

786 | |||

787 | vector signed short *vYCoeffsBank, *vCCoeffsBank; |
||

788 | |||

789 | vector signed short *YCoeffs, *CCoeffs; |
||

790 | |||

791 | vYCoeffsBank = malloc (sizeof (vector signed short)*lumFilterSize*dstW); |
||

792 | vCCoeffsBank = malloc (sizeof (vector signed short)*chrFilterSize*dstW); |
||

793 | |||

794 | for (i=0;i<lumFilterSize*dstW;i++) { |
||

795 | tmp = c->vLumFilter[i]; |
||

796 | p = &vYCoeffsBank[i]; |
||

797 | for (j=0;j<8;j++) |
||

798 | p[j] = tmp; |
||

799 | } |
||

800 | |||

801 | for (i=0;i<chrFilterSize*dstW;i++) { |
||

802 | tmp = c->vChrFilter[i]; |
||

803 | p = &vCCoeffsBank[i]; |
||

804 | for (j=0;j<8;j++) |
||

805 | p[j] = tmp; |
||

806 | } |
||

807 | |||

808 | YCoeffs = vYCoeffsBank+dstY*lumFilterSize; |
||

809 | CCoeffs = vCCoeffsBank+dstY*chrFilterSize; |
||

810 | |||

811 | out = (vector unsigned char *)dest; |
||

812 | |||

813 | for(i=0; i<dstW; i+=16){ |
||

814 | Y0 = RND; |
||

815 | Y1 = RND; |
||

816 | ```
/* extract 16 coeffs from lumSrc */
``` |
||

817 | for(j=0; j<lumFilterSize; j++) { |
||

818 | ```
X0 = vec_ld (0, &lumSrc[j][i]);
``` |
||

819 | ```
X1 = vec_ld (16, &lumSrc[j][i]);
``` |
||

820 | Y0 = vec_mradds (X0, YCoeffs[j], Y0); |
||

821 | Y1 = vec_mradds (X1, YCoeffs[j], Y1); |
||

822 | } |
||

823 | |||

824 | U = RND; |
||

825 | V = RND; |
||

826 | ```
/* extract 8 coeffs from U,V */
``` |
||

827 | for(j=0; j<chrFilterSize; j++) { |
||

828 | X = vec_ld (0, &chrSrc[j][i/2]); |
||

829 | U = vec_mradds (X, CCoeffs[j], U); |
||

830 | X = vec_ld (0, &chrSrc[j][i/2+2048]); |
||

831 | V = vec_mradds (X, CCoeffs[j], V); |
||

832 | } |
||

833 | |||

834 | ```
/* scale and clip signals */
``` |
||

835 | Y0 = vec_sra (Y0, SCL); |
||

836 | Y1 = vec_sra (Y1, SCL); |
||

837 | U = vec_sra (U, SCL); |
||

838 | V = vec_sra (V, SCL); |
||

839 | |||

840 | Y0 = vec_clip (Y0); |
||

841 | Y1 = vec_clip (Y1); |
||

842 | U = vec_clip (U); |
||

843 | V = vec_clip (V); |
||

844 | |||

845 | ```
/* now we have
``` |
||

846 | ```
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
``` |
||

847 | ```
U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
``` |
||

848 | |||

849 | ```
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
``` |
||

850 | ```
U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
``` |
||

851 | ```
V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
``` |
||

852 | ```
*/
``` |
||

853 | |||

854 | U0 = vec_mergeh (U,U); |
||

855 | V0 = vec_mergeh (V,V); |
||

856 | |||

857 | U1 = vec_mergel (U,U); |
||

858 | V1 = vec_mergel (V,V); |
||

859 | |||

860 | cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0); |
||

861 | cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1); |
||

862 | |||

863 | R = vec_packclp (R0,R1); |
||

864 | G = vec_packclp (G0,G1); |
||

865 | B = vec_packclp (B0,B1); |
||

866 | |||

867 | out_rgba (R,G,B,out); |
||

868 | } |
||

869 | |||

870 | ```
if (i < dstW) {
``` |
||

871 | ```
i -= 16;
``` |
||

872 | |||

873 | Y0 = RND; |
||

874 | Y1 = RND; |
||

875 | ```
/* extract 16 coeffs from lumSrc */
``` |
||

876 | for(j=0; j<lumFilterSize; j++) { |
||

877 | ```
X0 = vec_ld (0, &lumSrc[j][i]);
``` |
||

878 | ```
X1 = vec_ld (16, &lumSrc[j][i]);
``` |
||

879 | Y0 = vec_mradds (X0, YCoeffs[j], Y0); |
||

880 | Y1 = vec_mradds (X1, YCoeffs[j], Y1); |
||

881 | } |
||

882 | |||

883 | U = RND; |
||

884 | V = RND; |
||

885 | ```
/* extract 8 coeffs from U,V */
``` |
||

886 | for(j=0; j<chrFilterSize; j++) { |
||

887 | X = vec_ld (0, &chrSrc[j][i/2]); |
||

888 | U = vec_mradds (X, CCoeffs[j], U); |
||

889 | X = vec_ld (0, &chrSrc[j][i/2+2048]); |
||

890 | V = vec_mradds (X, CCoeffs[j], V); |
||

891 | } |
||

892 | |||

893 | ```
/* scale and clip signals */
``` |
||

894 | Y0 = vec_sra (Y0, SCL); |
||

895 | Y1 = vec_sra (Y1, SCL); |
||

896 | U = vec_sra (U, SCL); |
||

897 | V = vec_sra (V, SCL); |
||

898 | |||

899 | Y0 = vec_clip (Y0); |
||

900 | Y1 = vec_clip (Y1); |
||

901 | U = vec_clip (U); |
||

902 | V = vec_clip (V); |
||

903 | |||

904 | ```
/* now we have
``` |
||

905 | ```
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
``` |
||

906 | ```
U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
``` |
||

907 | |||

908 | ```
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
``` |
||

909 | ```
U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
``` |
||

910 | ```
V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
``` |
||

911 | ```
*/
``` |
||

912 | |||

913 | U0 = vec_mergeh (U,U); |
||

914 | V0 = vec_mergeh (V,V); |
||

915 | |||

916 | U1 = vec_mergel (U,U); |
||

917 | V1 = vec_mergel (V,V); |
||

918 | |||

919 | cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0); |
||

920 | cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1); |
||

921 | |||

922 | R = vec_packclp (R0,R1); |
||

923 | G = vec_packclp (G0,G1); |
||

924 | B = vec_packclp (B0,B1); |
||

925 | |||

926 | nout = (vector unsigned char *)scratch; |
||

927 | out_rgba (R,G,B,nout); |
||

928 | |||

929 | ```
memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
``` |
||

930 | } |
||

931 | |||

932 | ```
if (vYCoeffsBank) free (vYCoeffsBank);
``` |
||

933 | ```
if (vCCoeffsBank) free (vCCoeffsBank);
``` |
||

934 | |||

935 | } |