## ffmpeg / libswscale / internal_bfin.S @ 3164d25e

History | View | Annotate | Download (19.5 KB)

1 | d3f3eea9 | Marc Hoffman | /* |
---|---|---|---|

2 | * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com> |
||

3 | * April 20, 2007 |
||

4 | * |
||

5 | 8a322796 | Diego Biurrun | * Blackfin video color space converter operations |

6 | * convert I420 YV12 to RGB in various formats |
||

7 | d3f3eea9 | Marc Hoffman | * |

8 | * This file is part of FFmpeg. |
||

9 | * |
||

10 | * FFmpeg is free software; you can redistribute it and/or |
||

11 | * modify it under the terms of the GNU Lesser General Public |
||

12 | * License as published by the Free Software Foundation; either |
||

13 | * version 2.1 of the License, or (at your option) any later version. |
||

14 | * |
||

15 | * FFmpeg is distributed in the hope that it will be useful, |
||

16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||

17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||

18 | * Lesser General Public License for more details. |
||

19 | * |
||

20 | * You should have received a copy of the GNU Lesser General Public |
||

21 | * License along with FFmpeg; if not, write to the Free Software |
||

22 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||

23 | */ |
||

24 | |||

25 | |||

26 | /* |
||

27 | 8a322796 | Diego Biurrun | YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock |

28 | and converts it to RGB565: R:5 bits, G:6 bits, B:5 bits, packed into shorts. |
||

29 | d3f3eea9 | Marc Hoffman | |

30 | |||

31 | 4bdc44c7 | Diego Biurrun | The following calculation is used for the conversion: |

32 | d3f3eea9 | Marc Hoffman | |

33 | 4bdc44c7 | Diego Biurrun | r = clipz((y-oy)*cy + crv*(v-128)) |

34 | g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128)) |
||

35 | b = clipz((y-oy)*cy + cbu*(u-128)) |
||

36 | d3f3eea9 | Marc Hoffman | |

37 | 8a322796 | Diego Biurrun | y,u,v are prescaled by a factor of 4 i.e. left-shifted to gain precision. |

38 | d3f3eea9 | Marc Hoffman | |

39 | |||

40 | 4bdc44c7 | Diego Biurrun | New factorization to eliminate the truncation error which was |

41 | 8a322796 | Diego Biurrun | occurring due to the byteop3p. |

42 | d3f3eea9 | Marc Hoffman | |

43 | |||

44 | 8a322796 | Diego Biurrun | 1) Use byteop16m to subtract quad bytes. It operates on U8 data, |

45 | 4bdc44c7 | Diego Biurrun | so the offsets need to be renormalized to 8 bits. |

46 | d3f3eea9 | Marc Hoffman | |

47 | 8a322796 | Diego Biurrun | 2) Scale operands up by a factor of 4 not 8 because Blackfin |

48 | 4bdc44c7 | Diego Biurrun | multiplies include a shift. |

49 | d3f3eea9 | Marc Hoffman | |

50 | 8a322796 | Diego Biurrun | 3) Compute into the accumulators cy*yx0, cy*yx1. |

51 | d3f3eea9 | Marc Hoffman | |

52 | 8a322796 | Diego Biurrun | 4) Compute each of the linear equations: |

53 | 4bdc44c7 | Diego Biurrun | r = clipz((y - oy) * cy + crv * (v - 128)) |

54 | d3f3eea9 | Marc Hoffman | |

55 | 4bdc44c7 | Diego Biurrun | g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128)) |

56 | d3f3eea9 | Marc Hoffman | |

57 | 4bdc44c7 | Diego Biurrun | b = clipz((y - oy) * cy + cbu * (u - 128)) |

58 | d3f3eea9 | Marc Hoffman | |

59 | 8a322796 | Diego Biurrun | Reuse of the accumulators requires that we actually multiply |

60 | twice: once with an addition and a second time with a subtraction. |
||

61 | d3f3eea9 | Marc Hoffman | |

62 | 8a322796 | Diego Biurrun | Because of this we need to compute the equations in the order R B |

63 | 4bdc44c7 | Diego Biurrun | then G saving the writes for B in the case of 24/32 bit color |

64 | formats. |
||

65 | d3f3eea9 | Marc Hoffman | |

66 | 8a322796 | Diego Biurrun | API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, |

67 | 4bdc44c7 | Diego Biurrun | int dW, uint32_t *coeffs); |

68 | d3f3eea9 | Marc Hoffman | |

69 | 4bdc44c7 | Diego Biurrun | A B |

70 | --- --- |
||

71 | i2 = cb i3 = cr |
||

72 | i1 = coeff i0 = y |
||

73 | d3f3eea9 | Marc Hoffman | |

74 | 4bdc44c7 | Diego Biurrun | Where coeffs have the following layout in memory. |

75 | d3f3eea9 | Marc Hoffman | |

76 | 4bdc44c7 | Diego Biurrun | uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask; |

77 | d3f3eea9 | Marc Hoffman | |

78 | 4bdc44c7 | Diego Biurrun | coeffs is a pointer to oy. |

79 | d3f3eea9 | Marc Hoffman | |

80 | 8a322796 | Diego Biurrun | The {rgb} masks are only utilized by the 565 packing algorithm. Note the data |

81 | replication is used to simplify the internal algorithms for the dual Mac |
||

82 | architecture of BlackFin. |
||

83 | d3f3eea9 | Marc Hoffman | |

84 | 8a322796 | Diego Biurrun | All routines are exported with _ff_bfin_ as a symbol prefix. |

85 | d3f3eea9 | Marc Hoffman | |

86 | 8a322796 | Diego Biurrun | Rough performance gain compared against -O3: |

87 | d3f3eea9 | Marc Hoffman | |

88 | 4bdc44c7 | Diego Biurrun | 2779809/1484290 187.28% |

89 | d3f3eea9 | Marc Hoffman | |

90 | 4bdc44c7 | Diego Biurrun | which translates to ~33c/pel to ~57c/pel for the reference vs 17.5 |

91 | c/pel for the optimized implementations. Not sure why there is such a |
||

92 | huge variation on the reference codes on Blackfin; I guess it must have |
||

93 | to do with the memory system. |
||

94 | d3f3eea9 | Marc Hoffman | */ |

95 | |||

/* Section names: mL3 is plain .text; mL1 is .l1.text for FDPIC builds and an
 * alias of mL3 otherwise (.l1.text is presumably on-chip L1 instruction
 * memory -- confirm against the linker script). MEM is the section used by
 * the yuv2rgb line converters. */
96 | #define mL3 .text |
||

97 | d2a4ecaf | Mike Frysinger | #ifdef __FDPIC__ |

98 | #define mL1 .l1.text |
||

99 | #else |
||

100 | #define mL1 mL3 |
||

101 | #endif |
||

102 | d3f3eea9 | Marc Hoffman | #define MEM mL1 |

103 | |||

/* DEFUN(fname, where, interface): switch to section `where`, then declare the
 * global function symbol _ff_bfin_<fname> and emit its label. `interface` is
 * the C prototype, kept for documentation only; it is not expanded. */
104 | #define DEFUN(fname,where,interface) \ |
||

105 | .section where; \ |
||

106 | .global _ff_bfin_ ## fname; \ |
||

107 | .type _ff_bfin_ ## fname, STT_FUNC; \ |
||

108 | .align 8; \ |
||

109 | _ff_bfin_ ## fname |
||

110 | |||

/* DEFUN_END(fname): record the size of _ff_bfin_<fname> in the symbol table. */
111 | #define DEFUN_END(fname) \ |
||

112 | .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname |
||

113 | |||

114 | |||

115 | .text |
||

116 | |||

/* COEFF_LEN: size in bytes of the coefficient table (11 32-bit words), used
 * as the circular-buffer length for i1.
 * COEFF_REL_CY_OFF: post-modify applied after the last table entry (gmask) so
 * that the circular pointer wraps onto cy, skipping oy/oc/zero. */
117 | #define COEFF_LEN 11*4 |
||

118 | #define COEFF_REL_CY_OFF 4*4 |
||

119 | |||

/* Offsets from fp (after `link 0`) of the 4th, 5th and 6th arguments of the
 * yuv2rgb*_line routines: out, dW and coeffs. The first three arguments are
 * taken from r0-r2. */
120 | #define ARG_OUT 20 |
||

121 | #define ARG_W 24 |
||

122 | #define ARG_COEFF 28 |
||

123 | |||

/*
 * yuv2rgb565_line: convert one line of planar YUV 4:2:0 samples into packed
 * 16-bit pixels with 5/6/5-bit fields.
 *
 * Y, U and V arrive in r0, r1, r2; out, dW and coeffs are read from the stack
 * through fp (ARG_OUT, ARG_W, ARG_COEFF).
 *
 * The hardware loop runs dW >> 2 times. Each iteration consumes 4 Y bytes,
 * 2 U bytes and 2 V bytes and stores two 32-bit words (two pixels each).
 * The inputs for the next iteration are loaded at the bottom of the loop, so
 * one extra group of Y/U/V bytes is read beyond the last group converted.
 *
 * i1 walks the coefficient table as a circular buffer (b1 = base,
 * l1 = COEFF_LEN); l1 is cleared again before returning. r4-r7 are saved and
 * restored.
 *
 * NOTE(review): the value computed under the "R" heading is shifted right by
 * 3 into the low field and the "B" value is shifted left by 8 into the high
 * field, which is the reverse of the bit diagram inside the loop. Presumably
 * the caller picks the plane pointers/coefficients to get the wanted channel
 * order -- confirm against the C caller.
 */
124 | DEFUN(yuv2rgb565_line,MEM, |
||

125 | (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)): |
||

126 | link 0; |
||

127 | [--sp] = (r7:4); |
||

128 | p1 = [fp+ARG_OUT]; |
||

129 | r3 = [fp+ARG_W]; |
||

130 | |||

/* Input plane pointers: i0 = Y, i2 = U, i3 = V. */
131 | i0 = r0; |
||

132 | i2 = r1; |
||

133 | i3 = r2; |
||

134 | |||

/* i1 = coefficient table, addressed circularly (base b1, length l1). */
135 | r0 = [fp+ARG_COEFF]; |
||

136 | i1 = r0; |
||

137 | b1 = i1; |
||

138 | l1 = COEFF_LEN; |
||

139 | m0 = COEFF_REL_CY_OFF; |
||

140 | p0 = r3; |
||

141 | |||

/* Pre-load the first group of inputs; the loop reloads them at its end. */
142 | r0 = [i0++]; // 4Y (one 32-bit load) |
||

143 | r1.l = w[i2++]; // 2u |
||

144 | r1.h = w[i3++]; // 2v |
||

/* Loop count: one iteration per 4 output pixels. */
145 | p0 = p0>>2; |
||

146 | |||

147 | lsetup (.L0565, .L1565) lc0 = p0; |
||

148 | |||

149 | /* |
||

150 | uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask |
||

151 | r0 -- used to load 4ys |
||

152 | r1 -- used to load 2us,2vs |
||

153 | r4 -- y3,y2 |
||

154 | r5 -- y1,y0 |
||

155 | r6 -- u1,u0 |
||

156 | r7 -- v1,v0 |
||

157 | */ |
||

158 | r2=[i1++]; // oy |
||

159 | .L0565: |
||

160 | /* |
||

161 | rrrrrrrr gggggggg bbbbbbbb |
||

162 | 5432109876543210 |
||

163 | bbbbb >>3 |
||

164 | gggggggg <<3 |
||

165 | rrrrrrrr <<8 |
||

166 | rrrrrggggggbbbbb |
||

167 | */ |
||

/* Subtract the offsets from the packed samples (byteop16m), then scale by 4. */
168 | (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc |
||

169 | (r7,r6) = byteop16m (r1:0, r3:2) (r); |
||

170 | r5 = r5 << 2 (v); // y1,y0 |
||

171 | r4 = r4 << 2 (v); // y3,y2 |
||

172 | r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero |
||

173 | r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy |
||

174 | /* Y' = y*cy */ |
||

175 | a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv |
||

176 | |||

177 | /* R = Y+ crv*(Cr-128) */ |
||

/* Add the chroma term to read the result, then subtract it again so that the
   accumulators go back to holding Y' for the next component. */
178 | r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); |
||

179 | a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask |
||

180 | r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu |
||

181 | r2 = r2 >> 3 (v); |
||

182 | r3 = r2 & r5; |
||

183 | |||

184 | /* B = Y+ cbu*(Cb-128) */ |
||

185 | r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l); |
||

186 | a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask |
||

187 | r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu |
||

188 | r2 = r2 << 8 (v); |
||

189 | r2 = r2 & r5; |
||

190 | r3 = r3 | r2; |
||

191 | |||

192 | /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ |
||

193 | a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv |
||

194 | r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); |
||

/* gmask is the last table entry: the m0 post-modify wraps i1 round to cy. */
195 | r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask |
||

196 | r2 = r2 << 3 (v); |
||

197 | r2 = r2 & r5; |
||

198 | r3 = r3 | r2; |
||

/* Store pixels 0 and 1 (built from y0 and y1) as one 32-bit word. */
199 | [p1++]=r3 || r1=[i1++]; // cy |
||

200 | |||

201 | /* Y' = y*cy */ |
||

202 | |||

/* Second pixel pair: same sequence with y3,y2 (r4) and u1/v1 (r6.h, r7.h). */
203 | a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv |
||

204 | |||

205 | /* R = Y+ crv*(Cr-128) */ |
||

206 | r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); |
||

207 | a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask |
||

208 | r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu |
||

209 | r2 = r2 >> 3 (v); |
||

210 | r3 = r2 & r5; |
||

211 | |||

212 | /* B = Y+ cbu*(Cb-128) */ |
||

213 | r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h); |
||

214 | a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask |
||

215 | r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu |
||

216 | r2 = r2 << 8 (v); |
||

217 | r2 = r2 & r5; |
||

218 | r3 = r3 | r2; |
||

219 | |||

220 | /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ |
||

221 | a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv |
||

222 | r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask |
||

/* The remaining slots pre-load the inputs for the next iteration. */
223 | r2 = byteop3p(r3:2, r1:0)(LO) || r0 = [i0++]; // 4Y |
||

224 | r2 = r2 << 3 (v) || r1.l = w[i2++]; // 2u |
||

225 | r2 = r2 & r5; |
||

226 | r3 = r3 | r2; |
||

/* Store pixels 2 and 3. */
227 | [p1++]=r3 || r1.h = w[i3++]; // 2v |
||

228 | .L1565: r2=[i1++]; // oy |
||

229 | |||

/* Switch circular addressing for i1 back off before returning. */
230 | l1 = 0; |
||

231 | |||

232 | (r7:4) = [sp++]; |
||

233 | unlink; |
||

234 | rts; |
||

235 | DEFUN_END(yuv2rgb565_line) |
||

236 | |||

/*
 * yuv2rgb555_line: convert one line of planar YUV 4:2:0 samples into packed
 * 16-bit pixels with 5/5/5-bit fields.
 *
 * Same structure as the 565 routine; only the shift amounts applied before
 * masking differ (>>3, <<7, <<2). Y, U and V arrive in r0, r1, r2; out, dW
 * and coeffs are read from the stack through fp.
 *
 * The hardware loop runs dW >> 2 times. Each iteration consumes 4 Y bytes,
 * 2 U bytes and 2 V bytes and stores two 32-bit words (two pixels each).
 * The next iteration's inputs are loaded at the bottom of the loop, so one
 * extra group of Y/U/V bytes is read beyond the last group converted.
 *
 * i1 walks the coefficient table as a circular buffer (b1 = base,
 * l1 = COEFF_LEN); l1 is cleared again before returning. r4-r7 are saved and
 * restored.
 *
 * NOTE(review): as written, the "R" result lands in the low field and the
 * "B" result in the high field, the reverse of the bit diagram inside the
 * loop. Presumably the caller arranges planes/coefficients for the wanted
 * channel order -- confirm against the C caller.
 */
237 | DEFUN(yuv2rgb555_line,MEM, |
||

238 | (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)): |
||

239 | link 0; |
||

240 | [--sp] = (r7:4); |
||

241 | p1 = [fp+ARG_OUT]; |
||

242 | r3 = [fp+ARG_W]; |
||

243 | |||

/* Input plane pointers: i0 = Y, i2 = U, i3 = V. */
244 | i0 = r0; |
||

245 | i2 = r1; |
||

246 | i3 = r2; |
||

247 | |||

/* i1 = coefficient table, addressed circularly (base b1, length l1). */
248 | r0 = [fp+ARG_COEFF]; |
||

249 | i1 = r0; |
||

250 | b1 = i1; |
||

251 | l1 = COEFF_LEN; |
||

252 | m0 = COEFF_REL_CY_OFF; |
||

253 | p0 = r3; |
||

254 | |||

/* Pre-load the first group of inputs; the loop reloads them at its end. */
255 | r0 = [i0++]; // 4Y (one 32-bit load) |
||

256 | r1.l = w[i2++]; // 2u |
||

257 | r1.h = w[i3++]; // 2v |
||

/* Loop count: one iteration per 4 output pixels. */
258 | p0 = p0>>2; |
||

259 | |||

260 | lsetup (.L0555, .L1555) lc0 = p0; |
||

261 | |||

262 | /* |
||

263 | uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask |
||

264 | r0 -- used to load 4ys |
||

265 | r1 -- used to load 2us,2vs |
||

266 | r4 -- y3,y2 |
||

267 | r5 -- y1,y0 |
||

268 | r6 -- u1,u0 |
||

269 | r7 -- v1,v0 |
||

270 | */ |
||

271 | r2=[i1++]; // oy |
||

272 | .L0555: |
||

273 | /* |
||

274 | rrrrrrrr gggggggg bbbbbbbb |
||

275 | 5432109876543210 |
||

276 | bbbbb >>3 |
||

277 | gggggggg <<2 |
||

278 | rrrrrrrr <<7 |
||

279 | xrrrrrgggggbbbbb |
||

280 | */ |
||

281 | |||

/* Subtract the offsets from the packed samples (byteop16m), then scale by 4. */
282 | (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc |
||

283 | (r7,r6) = byteop16m (r1:0, r3:2) (r); |
||

284 | r5 = r5 << 2 (v); // y1,y0 |
||

285 | r4 = r4 << 2 (v); // y3,y2 |
||

286 | r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero |
||

287 | r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy |
||

288 | /* Y' = y*cy */ |
||

289 | a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv |
||

290 | |||

291 | /* R = Y+ crv*(Cr-128) */ |
||

/* Add the chroma term to read the result, then subtract it again so that the
   accumulators go back to holding Y' for the next component. */
292 | r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); |
||

293 | a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask |
||

294 | r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu |
||

295 | r2 = r2 >> 3 (v); |
||

296 | r3 = r2 & r5; |
||

297 | |||

298 | /* B = Y+ cbu*(Cb-128) */ |
||

299 | r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l); |
||

300 | a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask |
||

301 | r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu |
||

302 | r2 = r2 << 7 (v); |
||

303 | r2 = r2 & r5; |
||

304 | r3 = r3 | r2; |
||

305 | |||

306 | /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ |
||

307 | a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv |
||

308 | r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); |
||

/* gmask is the last table entry: the m0 post-modify wraps i1 round to cy. */
309 | r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask |
||

310 | r2 = r2 << 2 (v); |
||

311 | r2 = r2 & r5; |
||

312 | r3 = r3 | r2; |
||

/* Store pixels 0 and 1 (built from y0 and y1) as one 32-bit word. */
313 | [p1++]=r3 || r1=[i1++]; // cy |
||

314 | |||

315 | /* Y' = y*cy */ |
||

316 | |||

/* Second pixel pair: same sequence with y3,y2 (r4) and u1/v1 (r6.h, r7.h). */
317 | a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv |
||

318 | |||

319 | /* R = Y+ crv*(Cr-128) */ |
||

320 | r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); |
||

321 | a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask |
||

322 | r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu |
||

323 | r2 = r2 >> 3 (v); |
||

324 | r3 = r2 & r5; |
||

325 | |||

326 | /* B = Y+ cbu*(Cb-128) */ |
||

327 | r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h); |
||

328 | a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask |
||

329 | r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu |
||

330 | r2 = r2 << 7 (v); |
||

331 | r2 = r2 & r5; |
||

332 | r3 = r3 | r2; |
||

333 | |||

334 | /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ |
||

335 | a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv |
||

336 | r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask |
||

/* The remaining slots pre-load the inputs for the next iteration. */
337 | r2 = byteop3p(r3:2, r1:0)(LO) || r0=[i0++]; // 4Y |
||

338 | r2 = r2 << 2 (v) || r1.l=w[i2++]; // 2u |
||

339 | r2 = r2 & r5; |
||

340 | r3 = r3 | r2; |
||

/* Store pixels 2 and 3. */
341 | [p1++]=r3 || r1.h=w[i3++]; // 2v |
||

342 | |||

343 | .L1555: r2=[i1++]; // oy |
||

344 | |||

/* Switch circular addressing for i1 back off before returning. */
345 | l1 = 0; |
||

346 | |||

347 | (r7:4) = [sp++]; |
||

348 | unlink; |
||

349 | rts; |
||

350 | DEFUN_END(yuv2rgb555_line) |
||

351 | |||

/*
 * yuv2rgb24_line: convert one line of planar YUV 4:2:0 samples into packed
 * 3-byte pixels.
 *
 * Y, U and V arrive in r0, r1, r2; out, dW and coeffs are read from the stack
 * through fp. Two byte pointers are used for output: p1 starts at out and p2
 * at out + 3. Each computed component holds two pixels (low and high half of
 * the register); the low half is stored through p1 and, after a >>16, the
 * high half through p2. After three bytes each, both pointers skip 3 bytes
 * to step over the pixel written by the other pointer.
 *
 * Components are stored in the order "R", "G", "B" as labelled in the code;
 * the "B" result is kept in r3 until "G" has been written.
 *
 * The hardware loop runs dW >> 2 times; each iteration consumes 4 Y, 2 U and
 * 2 V bytes and writes 12 output bytes. The next iteration's inputs are
 * loaded at the bottom of the loop, so one extra group of Y/U/V bytes is read
 * beyond the last group converted.
 *
 * i1 walks the coefficient table as a circular buffer (b1 = base,
 * l1 = COEFF_LEN); l1 is cleared again before returning. The mask entries
 * are loaded to keep i1 in step but their values are not used here.
 */
352 | DEFUN(yuv2rgb24_line,MEM, |
||

353 | (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)): |
||

354 | link 0; |
||

355 | [--sp] = (r7:4); |
||

356 | p1 = [fp+ARG_OUT]; |
||

357 | r3 = [fp+ARG_W]; |
||

/* p2 trails p1 by one 3-byte pixel. */
358 | p2 = p1; |
||

359 | p2 += 3; |
||

360 | |||

/* Input plane pointers: i0 = Y, i2 = U, i3 = V. */
361 | i0 = r0; |
||

362 | i2 = r1; |
||

363 | i3 = r2; |
||

364 | |||

365 | r0 = [fp+ARG_COEFF]; // coeff buffer |
||

366 | i1 = r0; |
||

367 | b1 = i1; |
||

368 | l1 = COEFF_LEN; |
||

369 | m0 = COEFF_REL_CY_OFF; |
||

370 | p0 = r3; |
||

371 | |||

/* Pre-load the first group of inputs; the loop reloads them at its end. */
372 | r0 = [i0++]; // 4Y (one 32-bit load) |
||

373 | r1.l = w[i2++]; // 2u |
||

374 | r1.h = w[i3++]; // 2v |
||

/* Loop count: one iteration per 4 output pixels. */
375 | p0 = p0>>2; |
||

376 | |||

377 | lsetup (.L0888, .L1888) lc0 = p0; |
||

378 | |||

379 | /* |
||

380 | uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask |
||

381 | r0 -- used to load 4ys |
||

382 | r1 -- used to load 2us,2vs |
||

383 | r4 -- y3,y2 |
||

384 | r5 -- y1,y0 |
||

385 | r6 -- u1,u0 |
||

386 | r7 -- v1,v0 |
||

387 | */ |
||

388 | r2=[i1++]; // oy |
||

389 | .L0888: |
||

/* Subtract the offsets from the packed samples (byteop16m), then scale by 4. */
390 | (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc |
||

391 | (r7,r6) = byteop16m (r1:0, r3:2) (r); |
||

392 | r5 = r5 << 2 (v); // y1,y0 |
||

393 | r4 = r4 << 2 (v); // y3,y2 |
||

394 | r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero |
||

395 | r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy |
||

396 | |||

397 | /* Y' = y*cy */ |
||

398 | a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv |
||

399 | |||

400 | /* R = Y+ crv*(Cr-128) */ |
||

401 | r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); |
||

402 | a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask |
||

403 | r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu |
||

/* Pixel 0 byte goes out through p1; the shifted-down pixel 1 byte through p2
   (the store issued in parallel is expected to see r2 before the shift). */
404 | r2=r2>>16 || B[p1++]=r2; |
||

405 | B[p2++]=r2; |
||

406 | |||

407 | /* B = Y+ cbu*(Cb-128) */ |
||

408 | r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l); |
||

409 | a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask |
||

410 | r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu |
||

411 | |||

412 | /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ |
||

413 | a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv |
||

414 | r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); |
||

415 | r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask, oy,cy,zero |
||

416 | |||

417 | r2=r2>>16 || B[p1++]=r2; |
||

418 | B[p2++]=r2; |
||

419 | |||

420 | r3=r3>>16 || B[p1++]=r3; |
||

421 | B[p2++]=r3 || r1=[i1++]; // cy |
||

422 | |||

/* Each pointer skips the 3-byte pixel written through the other one. */
423 | p1+=3; |
||

424 | p2+=3; |
||

425 | /* Y' = y*cy */ |
||

426 | a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv |
||

427 | |||

428 | /* R = Y+ crv*(Cr-128) */ |
||

429 | r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); |
||

430 | a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask |
||

431 | r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu |
||

432 | r2=r2>>16 || B[p1++]=r2; |
||

433 | B[p2++]=r2; |
||

434 | |||

435 | /* B = Y+ cbu*(Cb-128) */ |
||

436 | r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h); |
||

437 | a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask |
||

438 | r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu |
||

439 | |||

440 | /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ |
||

441 | a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv |
||

442 | r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); |
||

443 | r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++]; // gmask |
||

/* The final stores are paired with the pre-loads for the next iteration. */
444 | r2=r2>>16 || B[p1++]=r2 || r0 = [i0++]; // 4y |
||

445 | B[p2++]=r2 || r1.l = w[i2++]; // 2u |
||

446 | r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v |
||

447 | B[p2++]=r3 || r2=[i1++]; // oy |
||

448 | |||

449 | p1+=3; |
||

450 | .L1888: p2+=3; |
||

451 | |||

/* Switch circular addressing for i1 back off before returning. */
452 | l1 = 0; |
||

453 | |||

454 | (r7:4) = [sp++]; |
||

455 | unlink; |
||

456 | rts; |
||

457 | 22a11d57 | Marc Hoffman | DEFUN_END(yuv2rgb24_line) |

458 | bf4a90fc | Marc Hoffman | |

459 | |||

460 | |||

/* Offsets from fp (after `link 0`) of the arguments of uyvytoyv12 and
 * yuyvtoyv12 that are read from the stack; src, ydst and udst are taken
 * from r0, r1 and r2. */
461 | #define ARG_vdst 20 |
||

462 | #define ARG_width 24 |
||

463 | #define ARG_height 28 |
||

464 | #define ARG_lumStride 32 |
||

465 | #define ARG_chromStride 36 |
||

466 | #define ARG_srcStride 40 |
||

467 | |||

/*
 * uyvytoyv12: split packed UYVY rows into separate Y, U and V planes.
 *
 * Two source rows are handled per outer-loop pass (height >> 1 passes). The
 * inner loop runs width >> 2 times; each pass reads 8 bytes from both rows,
 * stores 4 luma bytes per row (p0 = upper row, p1 = lower row) and writes
 * 2 U bytes and 2 V bytes. The chroma written is the byteop1p result of the
 * two rows (byte-wise average per the Blackfin ISA).
 *
 * The first 8 bytes of each row pair are read before the inner loop, and the
 * inner loop reads the next 8 bytes at its end, hence the -8 correction
 * folded into m0. m1 = chromStride - width/2 moves the chroma pointers to
 * the next chroma row.
 *
 * NOTE(review): after the inner loop i0/i1 have advanced 2*width + 8 bytes
 * and are then bumped by srcStride - 8; p0/p1 have advanced width bytes and
 * are bumped by lumStride. That lands two rows further on only when
 * srcStride == 2*width and lumStride == width -- confirm that callers
 * guarantee this.
 *
 * r4-r7 and p4-p5 are saved and restored.
 */
468 | DEFUN(uyvytoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
||

469 | long width, long height, |
||

470 | long lumStride, long chromStride, long srcStride)): |
||

471 | link 0; |
||

472 | [--sp] = (r7:4,p5:4); |
||

473 | |||

474 | p0 = r1; // Y top even |
||

475 | |||

476 | i2 = r2; // *u |
||

477 | r2 = [fp + ARG_vdst]; |
||

478 | i3 = r2; // *v |
||

479 | |||

/* r2 = address of the second source row; m0 = srcStride - 8. */
480 | r1 = [fp + ARG_srcStride]; |
||

481 | r2 = r0 + r1; |
||

482 | 69a6db95 | Marc Hoffman | r1 += -8; // i0,i1 are read ahead by 8 bytes; correct for that |

483 | bf4a90fc | Marc Hoffman | m0 = r1; |

484 | |||

485 | i0 = r0; // uyvy_T even |
||

486 | i1 = r2; // uyvy_B odd |
||

487 | |||

488 | p2 = [fp + ARG_lumStride]; |
||

489 | p1 = p0 + p2; // Y bot odd |
||

490 | |||

/* Loop counts: p4 = height/2 (row pairs), p5 = width/4 (groups of 4 pixels). */
491 | p5 = [fp + ARG_width]; |
||

492 | p4 = [fp + ARG_height]; |
||

493 | 45eeae39 | Marc Hoffman | r0 = p5; |

494 | bf4a90fc | Marc Hoffman | p4 = p4 >> 1; |

495 | p5 = p5 >> 2; |
||

496 | |||

/* m1 = chromStride - width/2: gap between the end of one chroma row and the
   start of the next. */
497 | 45eeae39 | Marc Hoffman | r2 = [fp + ARG_chromStride]; |

498 | r0 = r0 >> 1; |
||

499 | r2 = r2 - r0; |
||

500 | m1 = r2; |
||

501 | |||

502 | bf4a90fc | Marc Hoffman | /* I0,I1 - src input line pointers |

503 | * p0,p1 - luma output line pointers |
||

504 | * I2 - dstU |
||

505 | * I3 - dstV |
||

506 | */ |
||

507 | |||

508 | e9d4375f | Marc Hoffman | lsetup (0f, 1f) lc1 = p4; // H/2 |

/* Read the first 8 bytes of both rows and combine them with byteop1p. */
509 | 0: r0 = [i0++] || r2 = [i1++]; |
||

510 | r1 = [i0++] || r3 = [i1++]; |
||

511 | r4 = byteop1p(r1:0, r3:2); |
||

512 | r5 = byteop1p(r1:0, r3:2) (r); |
||

513 | lsetup (2f, 3f) lc0 = p5; // W/4 |
||

/* Move the upper byte of every 16-bit half down, then pack those four bytes
   per row into one word of luma. */
514 | 2: r0 = r0 >> 8(v); |
||

515 | bf4a90fc | Marc Hoffman | r1 = r1 >> 8(v); |

516 | r2 = r2 >> 8(v); |
||

517 | r3 = r3 >> 8(v); |
||

518 | r0 = bytepack(r0, r1); |
||

519 | e9d4375f | Marc Hoffman | r2 = bytepack(r2, r3) || [p0++] = r0; // yyyy |

520 | r6 = pack(r5.l, r4.l) || [p1++] = r2; // yyyy |
||

/* Gather the chroma bytes from r4/r5 while reading the next 8 source bytes. */
521 | r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++]; |
||

522 | r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++]; |
||

523 | r4 = byteop1p(r1:0, r3:2) || w[i2++] = r6.l; // uu |
||

524 | 3: r5 = byteop1p(r1:0, r3:2) (r) || w[i3++] = r6.h; // vv |
||

525 | bf4a90fc | Marc Hoffman | |

/* Step all pointers on to the next pair of rows. */
526 | i0 += m0; |
||

527 | i1 += m0; |
||

528 | 45eeae39 | Marc Hoffman | i2 += m1; |

529 | i3 += m1; |
||

530 | bf4a90fc | Marc Hoffman | p0 = p0 + p2; |

531 | 1: p1 = p1 + p2; |
||

532 | |||

533 | (r7:4,p5:4) = [sp++]; |
||

534 | unlink; |
||

535 | rts; |
||

536 | DEFUN_END(uyvytoyv12) |
||

537 | 4055d271 | Marc Hoffman | |

/*
 * yuyvtoyv12: split packed YUYV rows into separate Y, U and V planes.
 *
 * Mirrors the UYVY routine, but here luma is taken from the low byte of
 * every 16-bit half (bytepack on the unshifted words) and chroma from the
 * high byte (after the >> 8 vector shift).
 *
 * Two source rows are handled per outer-loop pass (height >> 1 passes). The
 * inner loop runs width >> 2 times; each pass stores 4 luma bytes per row
 * (p0 = upper row, p1 = lower row), writes 2 U bytes and 2 V bytes, and reads
 * the next 8 bytes of both rows. The chroma written is the byteop1p result
 * of the two rows (byte-wise average per the Blackfin ISA).
 *
 * m0 = srcStride - 8 compensates for the 8-byte read-ahead;
 * m1 = chromStride - width/2 moves the chroma pointers to the next row.
 *
 * NOTE(review): after the inner loop i0/i1 have advanced 2*width + 8 bytes
 * and are then bumped by srcStride - 8; p0/p1 have advanced width bytes and
 * are bumped by lumStride. That lands two rows further on only when
 * srcStride == 2*width and lumStride == width -- confirm that callers
 * guarantee this.
 *
 * r4-r7 and p4-p5 are saved and restored.
 */
538 | DEFUN(yuyvtoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
||

539 | long width, long height, |
||

540 | long lumStride, long chromStride, long srcStride)): |
||

541 | link 0; |
||

542 | [--sp] = (r7:4,p5:4); |
||

543 | |||

544 | p0 = r1; // Y top even |
||

545 | |||

546 | i2 = r2; // *u |
||

547 | r2 = [fp + ARG_vdst]; |
||

548 | i3 = r2; // *v |
||

549 | |||

/* r2 = address of the second source row; m0 = srcStride - 8. */
550 | r1 = [fp + ARG_srcStride]; |
||

551 | r2 = r0 + r1; |
||

552 | r1 += -8; // i0,i1 are read ahead by 8 bytes; correct for that |
||

553 | m0 = r1; |
||

554 | |||

555 | i0 = r0; // yuyv_T even |
||

556 | i1 = r2; // yuyv_B odd |
||

557 | |||

558 | p2 = [fp + ARG_lumStride]; |
||

559 | p1 = p0 + p2; // Y bot odd |
||

560 | |||

/* Loop counts: p4 = height/2 (row pairs), p5 = width/4 (groups of 4 pixels). */
561 | p5 = [fp + ARG_width]; |
||

562 | p4 = [fp + ARG_height]; |
||

563 | r0 = p5; |
||

564 | p4 = p4 >> 1; |
||

565 | p5 = p5 >> 2; |
||

566 | |||

/* m1 = chromStride - width/2: gap between the end of one chroma row and the
   start of the next. */
567 | r2 = [fp + ARG_chromStride]; |
||

568 | r0 = r0 >> 1; |
||

569 | r2 = r2 - r0; |
||

570 | m1 = r2; |
||

571 | |||

572 | /* I0,I1 - src input line pointers |
||

573 | * p0,p1 - luma output line pointers |
||

574 | * I2 - dstU |
||

575 | * I3 - dstV |
||

576 | */ |
||

577 | |||

578 | lsetup (0f, 1f) lc1 = p4; // H/2 |
||

/* Read the first 8 bytes of both rows and pack their luma bytes. */
579 | 0: r0 = [i0++] || r2 = [i1++]; |
||

580 | r1 = [i0++] || r3 = [i1++]; |
||

581 | r4 = bytepack(r0, r1); |
||

582 | r5 = bytepack(r2, r3); |
||

583 | lsetup (2f, 3f) lc0 = p5; // W/4 |
||

/* Store the luma packed earlier while shifting the chroma bytes down. */
584 | 2: r0 = r0 >> 8(v) || [p0++] = r4; // yyyy-even |
||

585 | r1 = r1 >> 8(v) || [p1++] = r5; // yyyy-odd |
||

586 | r2 = r2 >> 8(v); |
||

587 | r3 = r3 >> 8(v); |
||

588 | r4 = byteop1p(r1:0, r3:2); |
||

589 | r5 = byteop1p(r1:0, r3:2) (r); |
||

590 | r6 = pack(r5.l, r4.l); |
||

/* Gather the chroma bytes while reading the next 8 source bytes. */
591 | r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++]; |
||

592 | r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++]; |
||

593 | r4 = bytepack(r0, r1) || w[i2++] = r6.l; // uu |
||

594 | 3: r5 = bytepack(r2, r3) || w[i3++] = r6.h; // vv |
||

595 | |||

/* Step all pointers on to the next pair of rows. */
596 | i0 += m0; |
||

597 | i1 += m0; |
||

598 | i2 += m1; |
||

599 | i3 += m1; |
||

600 | p0 = p0 + p2; |
||

601 | 1: p1 = p1 + p2; |
||

602 | |||

603 | (r7:4,p5:4) = [sp++]; |
||

604 | unlink; |
||

605 | rts; |
||

606 | DEFUN_END(yuyvtoyv12) |