## ffmpeg / libswscale / internal_bfin.S @ 4bdc44c7

History | View | Annotate | Download (19.5 KB)

1 |
/* |
---|---|

2 |
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com> |

3 |
* April 20, 2007 |

4 |
* |

5 |
* Blackfin Video Color Space Converters Operations |

6 |
* convert I420 YV12 to RGB in various formats, |

7 |
* |

8 |
* This file is part of FFmpeg. |

9 |
* |

10 |
* FFmpeg is free software; you can redistribute it and/or |

11 |
* modify it under the terms of the GNU Lesser General Public |

12 |
* License as published by the Free Software Foundation; either |

13 |
* version 2.1 of the License, or (at your option) any later version. |

14 |
* |

15 |
* FFmpeg is distributed in the hope that it will be useful, |

16 |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |

17 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |

18 |
* Lesser General Public License for more details. |

19 |
* |

20 |
* You should have received a copy of the GNU Lesser General Public |

21 |
* License along with FFmpeg; if not, write to the Free Software |

22 |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |

23 |
*/ |

24 | |

25 | |

26 |
/* |

27 |
YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock |

28 |
and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts |

29 | |

30 | |

31 |
The following calculation is used for the conversion: |

32 | |

33 |
r = clipz((y-oy)*cy + crv*(v-128)) |

34 |
g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128)) |

35 |
b = clipz((y-oy)*cy + cbu*(u-128)) |

36 | |

37 |
y,u,v are pre scaled by a factor of 4 i.e. left shifted to gain precision. |

38 | |

39 | |

40 |
New factorization to eliminate the truncation error which was |

41 |
occurring due to the byteop3p. |

42 | |

43 | |

44 |
1) use the byteop16m to subtract quad bytes we use this in U8 this |

45 |
then so the offsets need to be renormalized to 8bits. |

46 | |

47 |
2) scale operands up by a factor of 4 not 8 because Blackfin |

48 |
multiplies include a shift. |

49 | |

50 |
3) compute into the accumulators cy*yx0, cy*yx1 |

51 | |

52 |
4) compute each of the linear equations |

53 |
r = clipz((y - oy) * cy + crv * (v - 128)) |

54 | |

55 |
g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128)) |

56 | |

57 |
b = clipz((y - oy) * cy + cbu * (u - 128)) |

58 | |

59 |
reuse of the accumulators requires that we actually multiply |

60 |
twice once with addition and the second time with a subtraction. |

61 | |

62 |
because of this we need to compute the equations in the order R B |

63 |
then G saving the writes for B in the case of 24/32 bit color |

64 |
formats. |

65 | |

66 |
api: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, |

67 |
int dW, uint32_t *coeffs); |

68 | |

69 |
A B |

70 |
--- --- |

71 |
i2 = cb i3 = cr |

72 |
i1 = coeff i0 = y |

73 | |

74 |
Where coeffs have the following layout in memory. |

75 | |

76 |
uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask; |

77 | |

78 |
coeffs is a pointer to oy. |

79 | |

80 |
the {rgb} masks are only utilized by the 565 and 555 packing algorithms. Note the data |

81 |
replication is used to simplify the internal algorithms for the dual mac architecture |

82 |
of BlackFin. |

83 | |

84 |
All routines are exported with _ff_bfin_ as a symbol prefix |

85 | |

86 |
rough performance gain compared against -O3: |

87 | |

88 |
2779809/1484290 187.28% |

89 | |

90 |
which translates to ~33c/pel to ~57c/pel for the reference vs 17.5 |

91 |
c/pel for the optimized implementations. Not sure why there is such a |

92 |
huge variation on the reference codes on Blackfin I guess it must have |

93 |
to do with the memory system. |

94 |
*/ |

95 | |

96 |
/*
 * Section selection.
 *   mL3 - the ordinary .text section.
 *   mL1 - .l1.text when __FDPIC__ is defined, otherwise the same as mL3
 *         (presumably on-chip L1 instruction memory -- TODO confirm against
 *         the linker script).
 *   MEM - alias of mL1; this is the section passed to DEFUN by the
 *         yuv2rgb*_line routines in this file.
 */
#define mL3 .text |

97 |
#ifdef __FDPIC__ |

98 |
#define mL1 .l1.text |

99 |
#else |

100 |
#define mL1 mL3 |

101 |
#endif |

102 |
#define MEM mL1 |

103 | |

/*
 * DEFUN(fname, where, interface): open a function definition.  Switches to
 * section `where`, declares _ff_bfin_<fname> as a global symbol of function
 * type, aligns, and emits the symbol name; the `:` written after the macro
 * invocation turns that name into the label.  `interface` is not expanded
 * anywhere in the macro body -- it only documents the C prototype at the
 * point of use.
 *
 * DEFUN_END(fname): sets the size of _ff_bfin_<fname> to the distance from
 * its label to the current location.
 */
104 |
#define DEFUN(fname,where,interface) \ |

105 |
.section where; \ |

106 |
.global _ff_bfin_ ## fname; \ |

107 |
.type _ff_bfin_ ## fname, STT_FUNC; \ |

108 |
.align 8; \ |

109 |
_ff_bfin_ ## fname |

110 | |

111 |
#define DEFUN_END(fname) \ |

112 |
.size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname |

113 | |

114 | |

115 |
.text |

116 | |

117 |
/*
 * COEFF_LEN        - size in bytes of the coefficient table (11 words of
 *                    4 bytes); loaded into l1 by the yuv2rgb*_line routines
 *                    so that i1 wraps around the table.
 * COEFF_REL_CY_OFF - byte step loaded into m0 and applied by the
 *                    `[i1++m0]` gmask load in the first half of each loop
 *                    pass; the load that follows it is commented as cy.
 */
#define COEFF_LEN 11*4 |

118 |
#define COEFF_REL_CY_OFF 4*4 |

119 | |

/*
 * fp-relative byte offsets used (after `link 0`) by the yuv2rgb*_line
 * routines to fetch their 4th, 5th and 6th arguments: out, dW and coeffs.
 */
120 |
#define ARG_OUT 20 |

121 |
#define ARG_W 24 |

122 |
#define ARG_COEFF 28 |

123 | |

124 |
/*
 * yuv2rgb565_line: convert one line of planar YUV into 16-bit packed pixels
 * (5/6/5 bit layout, see the diagram at .L0565).
 *
 * Entry:  r0 = Y, r1 = U, r2 = V; out, dW and coeffs are fetched from the
 *         frame at fp+ARG_OUT, fp+ARG_W and fp+ARG_COEFF.
 * Setup:  i0/i2/i3 walk Y/U/V, p1 walks the output, and i1 walks the
 *         coefficient table with b1 = table base and l1 = COEFF_LEN, so the
 *         table is traversed once per pixel pair and wraps back to oy.
 * Loop:   runs dW>>2 times (dW is truncated to a multiple of 4).  Each pass
 *         consumes 4 Y, 2 U and 2 V bytes and stores two 32-bit words, each
 *         holding two packed pixels.  Pixels y0,y1 share u0/v0 and pixels
 *         y2,y3 share u1/v1.
 * Note:   the Y/U/V data for the next pass is loaded inside the loop, so one
 *         extra group of 4 Y / 2 U / 2 V bytes is read beyond the last
 *         converted pixel.
 * Exit:   l1 is reset to 0 and r7:4 are restored.
 */
DEFUN(yuv2rgb565_line,MEM, |

125 |
(uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)): |

126 |
link 0; |

127 |
[--sp] = (r7:4); |

128 |
p1 = [fp+ARG_OUT]; |

129 |
r3 = [fp+ARG_W]; |

130 | |

131 |
i0 = r0; |

132 |
i2 = r1; |

133 |
i3 = r2; |

134 | |

135 |
r0 = [fp+ARG_COEFF]; |

136 |
i1 = r0; |

137 |
b1 = i1; |

138 |
l1 = COEFF_LEN; |

139 |
m0 = COEFF_REL_CY_OFF; |

140 |
p0 = r3; |

141 | |

142 |
r0 = [i0++]; // 4Y |

143 |
r1.l = w[i2++]; // 2u |

144 |
r1.h = w[i3++]; // 2v |

145 |
p0 = p0>>2; |

146 | |

147 |
lsetup (.L0565, .L1565) lc0 = p0; |

148 | |

149 |
/* |

150 |
uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask |

151 |
r0 -- used to load 4ys |

152 |
r1 -- used to load 2us,2vs |

153 |
r4 -- y3,y2 |

154 |
r5 -- y1,y0 |

155 |
r6 -- u1,u0 |

156 |
r7 -- v1,v0 |

157 |
*/ |

158 |
r2=[i1++]; // oy |

159 |
.L0565: |

160 |
/* |

161 |
rrrrrrrr gggggggg bbbbbbbb |

162 |
5432109876543210 |

163 |
bbbbb >>3 |

164 |
gggggggg <<3 |

165 |
rrrrrrrr <<8 |

166 |
rrrrrggggggbbbbb |

167 |
*/ |

// NOTE(review): in the code below the channel commented "R" is the one
// shifted >>3 (low bits) and the one commented "B" is shifted <<8 (high
// bits), the opposite of the labels in the diagram above; which colour
// really lands where presumably depends on how the caller orders U/V and
// fills the coefficient table -- confirm.
// byteop16m subtracts the offsets (r3:2 = oc,oy) from the packed Y/U/V
// bytes (r1:0); the results are then scaled by 4 with the vector shifts.
168 |
(r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc |

169 |
(r7,r6) = byteop16m (r1:0, r3:2) (r); |

170 |
r5 = r5 << 2 (v); // y1,y0 |

171 |
r4 = r4 << 2 (v); // y3,y2 |

172 |
r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero |

173 |
r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy |

// ---- first pixel pair: y1,y0 with u0,v0 ----
174 |
/* Y' = y*cy */ |

175 |
a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv |

176 | |

177 |
/* R = Y+ crv*(Cr-128) */ |

178 |
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); |

// subtract the chroma term again so a1/a0 hold Y' for the next channel
179 |
a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask |

180 |
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu |

181 |
r2 = r2 >> 3 (v); |

182 |
r3 = r2 & r5; |

183 | |

184 |
/* B = Y+ cbu*(Cb-128) */ |

185 |
r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l); |

186 |
a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask |

187 |
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu |

188 |
r2 = r2 << 8 (v); |

189 |
r2 = r2 & r5; |

190 |
r3 = r3 | r2; |

191 | |

192 |
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ |

193 |
a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv |

194 |
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); |

// the gmask load post-modifies i1 by m0 so that the next load is cy again
195 |
r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask |

196 |
r2 = r2 << 3 (v); |

197 |
r2 = r2 & r5; |

198 |
r3 = r3 | r2; |

199 |
[p1++]=r3 || r1=[i1++]; // cy |

200 | |

// ---- second pixel pair: y3,y2 with u1,v1 ----
201 |
/* Y' = y*cy */ |

202 | |

203 |
a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv |

204 | |

205 |
/* R = Y+ crv*(Cr-128) */ |

206 |
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); |

207 |
a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask |

208 |
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu |

209 |
r2 = r2 >> 3 (v); |

210 |
r3 = r2 & r5; |

211 | |

212 |
/* B = Y+ cbu*(Cb-128) */ |

213 |
r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h); |

214 |
a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask |

215 |
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu |

216 |
r2 = r2 << 8 (v); |

217 |
r2 = r2 & r5; |

218 |
r3 = r3 | r2; |

219 | |

220 |
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ |

221 |
a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv |

222 |
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask |

// the loads for the next pass are issued alongside the final packing steps
223 |
r2 = byteop3p(r3:2, r1:0)(LO) || r0 = [i0++]; // 4Y |

224 |
r2 = r2 << 3 (v) || r1.l = w[i2++]; // 2u |

225 |
r2 = r2 & r5; |

226 |
r3 = r3 | r2; |

227 |
[p1++]=r3 || r1.h = w[i3++]; // 2v |

228 |
.L1565: r2=[i1++]; // oy |

229 | |

// put l1 back to 0 before returning (it was set to COEFF_LEN on entry)
230 |
l1 = 0; |

231 | |

232 |
(r7:4) = [sp++]; |

233 |
unlink; |

234 |
rts; |

235 |
DEFUN_END(yuv2rgb565_line) |

236 | |

237 |
/*
 * yuv2rgb555_line: convert one line of planar YUV into 16-bit packed pixels
 * (5/5/5 bit layout, see the diagram at .L0555).
 *
 * Entry:  r0 = Y, r1 = U, r2 = V; out, dW and coeffs are fetched from the
 *         frame at fp+ARG_OUT, fp+ARG_W and fp+ARG_COEFF.
 * Setup:  i0/i2/i3 walk Y/U/V, p1 walks the output, and i1 walks the
 *         coefficient table with b1 = table base and l1 = COEFF_LEN, so the
 *         table is traversed once per pixel pair and wraps back to oy.
 * Loop:   runs dW>>2 times (dW is truncated to a multiple of 4).  Each pass
 *         consumes 4 Y, 2 U and 2 V bytes and stores two 32-bit words, each
 *         holding two packed pixels.  Pixels y0,y1 share u0/v0 and pixels
 *         y2,y3 share u1/v1.
 * Note:   the Y/U/V data for the next pass is loaded inside the loop, so one
 *         extra group of 4 Y / 2 U / 2 V bytes is read beyond the last
 *         converted pixel.
 * Exit:   l1 is reset to 0 and r7:4 are restored.
 *
 * The instruction sequence matches yuv2rgb565_line except for the shift
 * counts used when packing (<<7 and <<2 instead of <<8 and <<3).
 */
DEFUN(yuv2rgb555_line,MEM, |

238 |
(uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)): |

239 |
link 0; |

240 |
[--sp] = (r7:4); |

241 |
p1 = [fp+ARG_OUT]; |

242 |
r3 = [fp+ARG_W]; |

243 | |

244 |
i0 = r0; |

245 |
i2 = r1; |

246 |
i3 = r2; |

247 | |

248 |
r0 = [fp+ARG_COEFF]; |

249 |
i1 = r0; |

250 |
b1 = i1; |

251 |
l1 = COEFF_LEN; |

252 |
m0 = COEFF_REL_CY_OFF; |

253 |
p0 = r3; |

254 | |

255 |
r0 = [i0++]; // 4Y |

256 |
r1.l = w[i2++]; // 2u |

257 |
r1.h = w[i3++]; // 2v |

258 |
p0 = p0>>2; |

259 | |

260 |
lsetup (.L0555, .L1555) lc0 = p0; |

261 | |

262 |
/* |

263 |
uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask |

264 |
r0 -- used to load 4ys |

265 |
r1 -- used to load 2us,2vs |

266 |
r4 -- y3,y2 |

267 |
r5 -- y1,y0 |

268 |
r6 -- u1,u0 |

269 |
r7 -- v1,v0 |

270 |
*/ |

271 |
r2=[i1++]; // oy |

272 |
.L0555: |

273 |
/* |

274 |
rrrrrrrr gggggggg bbbbbbbb |

275 |
5432109876543210 |

276 |
bbbbb >>3 |

277 |
gggggggg <<2 |

278 |
rrrrrrrr <<7 |

279 |
xrrrrrgggggbbbbb |

280 |
*/ |

281 | |

// NOTE(review): in the code below the channel commented "R" is the one
// shifted >>3 (low bits) and the one commented "B" is shifted <<7 (high
// bits), the opposite of the labels in the diagram above; which colour
// really lands where presumably depends on how the caller orders U/V and
// fills the coefficient table -- confirm.
// byteop16m subtracts the offsets (r3:2 = oc,oy) from the packed Y/U/V
// bytes (r1:0); the results are then scaled by 4 with the vector shifts.
282 |
(r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc |

283 |
(r7,r6) = byteop16m (r1:0, r3:2) (r); |

284 |
r5 = r5 << 2 (v); // y1,y0 |

285 |
r4 = r4 << 2 (v); // y3,y2 |

286 |
r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero |

287 |
r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy |

// ---- first pixel pair: y1,y0 with u0,v0 ----
288 |
/* Y' = y*cy */ |

289 |
a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv |

290 | |

291 |
/* R = Y+ crv*(Cr-128) */ |

292 |
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); |

// subtract the chroma term again so a1/a0 hold Y' for the next channel
293 |
a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask |

294 |
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu |

295 |
r2 = r2 >> 3 (v); |

296 |
r3 = r2 & r5; |

297 | |

298 |
/* B = Y+ cbu*(Cb-128) */ |

299 |
r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l); |

300 |
a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask |

301 |
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu |

302 |
r2 = r2 << 7 (v); |

303 |
r2 = r2 & r5; |

304 |
r3 = r3 | r2; |

305 | |

306 |
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ |

307 |
a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv |

308 |
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); |

// the gmask load post-modifies i1 by m0 so that the next load is cy again
309 |
r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask |

310 |
r2 = r2 << 2 (v); |

311 |
r2 = r2 & r5; |

312 |
r3 = r3 | r2; |

313 |
[p1++]=r3 || r1=[i1++]; // cy |

314 | |

// ---- second pixel pair: y3,y2 with u1,v1 ----
315 |
/* Y' = y*cy */ |

316 | |

317 |
a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv |

318 | |

319 |
/* R = Y+ crv*(Cr-128) */ |

320 |
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); |

321 |
a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask |

322 |
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu |

323 |
r2 = r2 >> 3 (v); |

324 |
r3 = r2 & r5; |

325 | |

326 |
/* B = Y+ cbu*(Cb-128) */ |

327 |
r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h); |

328 |
a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask |

329 |
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu |

330 |
r2 = r2 << 7 (v); |

331 |
r2 = r2 & r5; |

332 |
r3 = r3 | r2; |

333 | |

334 |
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ |

335 |
a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv |

336 |
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask |

// the loads for the next pass are issued alongside the final packing steps
337 |
r2 = byteop3p(r3:2, r1:0)(LO) || r0=[i0++]; // 4Y |

338 |
r2 = r2 << 2 (v) || r1.l=w[i2++]; // 2u |

339 |
r2 = r2 & r5; |

340 |
r3 = r3 | r2; |

341 |
[p1++]=r3 || r1.h=w[i3++]; // 2v |

342 | |

343 |
.L1555: r2=[i1++]; // oy |

344 | |

// put l1 back to 0 before returning (it was set to COEFF_LEN on entry)
345 |
l1 = 0; |

346 | |

347 |
(r7:4) = [sp++]; |

348 |
unlink; |

349 |
rts; |

350 |
DEFUN_END(yuv2rgb555_line) |

351 | |

352 |
/*
 * yuv2rgb24_line: convert one line of planar YUV into 3-byte-per-pixel
 * output.
 *
 * Entry:  r0 = Y, r1 = U, r2 = V; out, dW and coeffs are fetched from the
 *         frame at fp+ARG_OUT, fp+ARG_W and fp+ARG_COEFF.
 * Setup:  i0/i2/i3 walk Y/U/V and i1 walks the coefficient table with
 *         b1 = table base and l1 = COEFF_LEN.  Two byte pointers are used
 *         for the output: p1 for one pixel of each pair and p2 = p1+3 for
 *         the other.  Each channel result in r2/r3 carries one byte per
 *         pixel; the low byte goes through p1 and, after the >>16, the
 *         other byte goes through p2.  After three channel bytes both
 *         pointers skip 3 bytes to step over the neighbouring pixel.
 * Loop:   runs dW>>2 times (dW is truncated to a multiple of 4).  Each pass
 *         consumes 4 Y, 2 U and 2 V bytes and writes 12 output bytes.  Per
 *         pixel the bytes are stored in the order of the channels commented
 *         R, G, B below.
 * Note:   the mask words (rmask/bmask/gmask) are still loaded into r5 so
 *         that i1 steps through the whole table, but r5 is not used as a
 *         mask in this routine.  The Y/U/V data for the next pass is loaded
 *         inside the loop, so one extra group of 4 Y / 2 U / 2 V bytes is
 *         read beyond the last converted pixel.
 * Exit:   l1 is reset to 0 and r7:4 are restored.
 */
DEFUN(yuv2rgb24_line,MEM, |

353 |
(uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)): |

354 |
link 0; |

355 |
[--sp] = (r7:4); |

356 |
p1 = [fp+ARG_OUT]; |

357 |
r3 = [fp+ARG_W]; |

358 |
p2 = p1; |

359 |
p2 += 3; |

360 | |

361 |
i0 = r0; |

362 |
i2 = r1; |

363 |
i3 = r2; |

364 | |

365 |
r0 = [fp+ARG_COEFF]; // coeff buffer |

366 |
i1 = r0; |

367 |
b1 = i1; |

368 |
l1 = COEFF_LEN; |

369 |
m0 = COEFF_REL_CY_OFF; |

370 |
p0 = r3; |

371 | |

372 |
r0 = [i0++]; // 4Y |

373 |
r1.l = w[i2++]; // 2u |

374 |
r1.h = w[i3++]; // 2v |

375 |
p0 = p0>>2; |

376 | |

377 |
lsetup (.L0888, .L1888) lc0 = p0; |

378 | |

379 |
/* |

380 |
uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask |

381 |
r0 -- used to load 4ys |

382 |
r1 -- used to load 2us,2vs |

383 |
r4 -- y3,y2 |

384 |
r5 -- y1,y0 |

385 |
r6 -- u1,u0 |

386 |
r7 -- v1,v0 |

387 |
*/ |

388 |
r2=[i1++]; // oy |

389 |
.L0888: |

// byteop16m subtracts the offsets (r3:2 = oc,oy) from the packed Y/U/V
// bytes (r1:0); the results are then scaled by 4 with the vector shifts.
390 |
(r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc |

391 |
(r7,r6) = byteop16m (r1:0, r3:2) (r); |

392 |
r5 = r5 << 2 (v); // y1,y0 |

393 |
r4 = r4 << 2 (v); // y3,y2 |

394 |
r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero |

395 |
r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy |

396 | |

// ---- first pixel pair: y1,y0 with u0,v0 ----
397 |
/* Y' = y*cy */ |

398 |
a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv |

399 | |

400 |
/* R = Y+ crv*(Cr-128) */ |

401 |
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); |

402 |
a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask |

403 |
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu |

404 |
r2=r2>>16 || B[p1++]=r2; |

405 |
B[p2++]=r2; |

406 | |

// the B result is kept in r3 and only stored after G, giving R,G,B order
407 |
/* B = Y+ cbu*(Cb-128) */ |

408 |
r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l); |

409 |
a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask |

410 |
r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu |

411 | |

412 |
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ |

413 |
a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv |

414 |
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); |

415 |
r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask, oy,cy,zero |

416 | |

417 |
r2=r2>>16 || B[p1++]=r2; |

418 |
B[p2++]=r2; |

419 | |

420 |
r3=r3>>16 || B[p1++]=r3; |

421 |
B[p2++]=r3 || r1=[i1++]; // cy |

422 | |

// skip the 3 bytes just written through the other pointer
423 |
p1+=3; |

424 |
p2+=3; |

// ---- second pixel pair: y3,y2 with u1,v1 ----
425 |
/* Y' = y*cy */ |

426 |
a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv |

427 | |

428 |
/* R = Y+ crv*(Cr-128) */ |

429 |
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); |

430 |
a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask |

431 |
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu |

432 |
r2=r2>>16 || B[p1++]=r2; |

433 |
B[p2++]=r2; |

434 | |

435 |
/* B = Y+ cbu*(Cb-128) */ |

436 |
r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h); |

437 |
a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask |

438 |
r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu |

439 | |

440 |
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ |

441 |
a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv |

442 |
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); |

443 |
r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++]; // gmask |

// the loads for the next pass are issued alongside the final byte stores
444 |
r2=r2>>16 || B[p1++]=r2 || r0 = [i0++]; // 4y |

445 |
B[p2++]=r2 || r1.l = w[i2++]; // 2u |

446 |
r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v |

447 |
B[p2++]=r3 || r2=[i1++]; // oy |

448 | |

449 |
p1+=3; |

450 |
.L1888: p2+=3; |

451 | |

// put l1 back to 0 before returning (it was set to COEFF_LEN on entry)
452 |
l1 = 0; |

453 | |

454 |
(r7:4) = [sp++]; |

455 |
unlink; |

456 |
rts; |

457 |
DEFUN_END(yuv2rgb24_line) |

458 | |

459 | |

460 | |

461 |
/*
 * fp-relative byte offsets used (after `link 0`) by uyvytoyv12 and
 * yuyvtoyv12 to fetch the arguments that do not arrive in r0-r2:
 * vdst, width, height, lumStride, chromStride and srcStride.
 */
#define ARG_vdst 20 |

462 |
#define ARG_width 24 |

463 |
#define ARG_height 28 |

464 |
#define ARG_lumStride 32 |

465 |
#define ARG_chromStride 36 |

466 |
#define ARG_srcStride 40 |

467 | |

468 |
/*
 * uyvytoyv12: split packed UYVY into separate Y, U and V planes, handling
 * two source lines per outer-loop pass.
 *
 * Entry:  r0 = src, r1 = ydst, r2 = udst; vdst, width, height, lumStride,
 *         chromStride and srcStride are fetched from the frame.
 * Setup:  i0/i1 = top/bottom source line, p0/p1 = top/bottom luma line,
 *         i2/i3 = U/V output.  m0 = srcStride-8 and m1 = chromStride-width/2
 *         are the end-of-line pointer corrections.
 * Loops:  the outer loop (lc1) runs height>>1 times, the inner loop (lc0)
 *         width>>2 times.  Each inner pass stores 4 luma bytes to each of
 *         the two luma lines and 2 bytes each to U and V.  The chroma words
 *         come from byteop1p applied to the top and bottom source words
 *         (a byte-wise average per the Blackfin instruction set reference --
 *         TODO confirm rounding).
 * Note:   the inner loop loads the next 8 source bytes of each line before
 *         it exits, so 8 bytes are read past the end of every processed
 *         line; m0 accounts for that over-advance.
 *
 * NOTE(review): per line pair i0/i1 advance by 2*width+8 and are then moved
 * on by srcStride-8, and p0/p1 advance by width and are then moved on by
 * lumStride.  That lands on the next line pair only when
 * srcStride == 2*width and lumStride == width; the chroma pointers do not
 * have this restriction.  Confirm that callers guarantee those strides.
 */
DEFUN(uyvytoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |

469 |
long width, long height, |

470 |
long lumStride, long chromStride, long srcStride)): |

471 |
link 0; |

472 |
[--sp] = (r7:4,p5:4); |

473 | |

474 |
p0 = r1; // Y top even |

475 | |

476 |
i2 = r2; // *u |

477 |
r2 = [fp + ARG_vdst]; |

478 |
i3 = r2; // *v |

479 | |

480 |
r1 = [fp + ARG_srcStride]; |

481 |
r2 = r0 + r1; |

482 |
r1 += -8; // i0,i1 is pre read need to correct |

483 |
m0 = r1; |

484 | |

485 |
i0 = r0; // uyvy_T even |

486 |
i1 = r2; // uyvy_B odd |

487 | |

488 |
p2 = [fp + ARG_lumStride]; |

489 |
p1 = p0 + p2; // Y bot odd |

490 | |

491 |
p5 = [fp + ARG_width]; |

492 |
p4 = [fp + ARG_height]; |

493 |
r0 = p5; |

494 |
p4 = p4 >> 1; |

495 |
p5 = p5 >> 2; |

496 | |

// m1 = chromStride - width/2: step from the end of one chroma line to the
// start of the next
497 |
r2 = [fp + ARG_chromStride]; |

498 |
r0 = r0 >> 1; |

499 |
r2 = r2 - r0; |

500 |
m1 = r2; |

501 | |

502 |
/* I0,I1 - src input line pointers |

503 |
* p0,p1 - luma output line pointers |

504 |
* I2 - dstU |

505 |
* I3 - dstV |

506 |
*/ |

507 | |

508 |
lsetup (0f, 1f) lc1 = p4; // H/2 |

// prime the inner loop: first 8 bytes of both lines and their chroma words
509 |
0: r0 = [i0++] || r2 = [i1++]; |

510 |
r1 = [i0++] || r3 = [i1++]; |

511 |
r4 = byteop1p(r1:0, r3:2); |

512 |
r5 = byteop1p(r1:0, r3:2) (r); |

513 |
lsetup (2f, 3f) lc0 = p5; // W/4 |

// the >>8 vector shifts bring the second byte of every 16-bit pair (the Y
// sample in UYVY order) into the low byte, ready for bytepack
514 |
2: r0 = r0 >> 8(v); |

515 |
r1 = r1 >> 8(v); |

516 |
r2 = r2 >> 8(v); |

517 |
r3 = r3 >> 8(v); |

518 |
r0 = bytepack(r0, r1); |

519 |
r2 = bytepack(r2, r3) || [p0++] = r0; // yyyy |

520 |
r6 = pack(r5.l, r4.l) || [p1++] = r2; // yyyy |

521 |
r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++]; |

522 |
r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++]; |

523 |
r4 = byteop1p(r1:0, r3:2) || w[i2++] = r6.l; // uu |

524 |
3: r5 = byteop1p(r1:0, r3:2) (r) || w[i3++] = r6.h; // vv |

525 | |

// end of a line pair: apply the pointer corrections described above
526 |
i0 += m0; |

527 |
i1 += m0; |

528 |
i2 += m1; |

529 |
i3 += m1; |

530 |
p0 = p0 + p2; |

531 |
1: p1 = p1 + p2; |

532 | |

533 |
(r7:4,p5:4) = [sp++]; |

534 |
unlink; |

535 |
rts; |

536 |
DEFUN_END(uyvytoyv12) |

537 | |

538 |
/*
 * yuyvtoyv12: split packed YUYV into separate Y, U and V planes, handling
 * two source lines per outer-loop pass.
 *
 * Entry:  r0 = src, r1 = ydst, r2 = udst; vdst, width, height, lumStride,
 *         chromStride and srcStride are fetched from the frame.
 * Setup:  i0/i1 = top/bottom source line, p0/p1 = top/bottom luma line,
 *         i2/i3 = U/V output.  m0 = srcStride-8 and m1 = chromStride-width/2
 *         are the end-of-line pointer corrections.
 * Loops:  the outer loop (lc1) runs height>>1 times, the inner loop (lc0)
 *         width>>2 times.  Each inner pass stores 4 luma bytes to each of
 *         the two luma lines and 2 bytes each to U and V.  Luma is taken
 *         with bytepack from the unshifted words; the chroma words come from
 *         byteop1p applied to the >>8-shifted top and bottom words (a
 *         byte-wise average per the Blackfin instruction set reference --
 *         TODO confirm rounding).
 * Note:   the inner loop loads the next 8 source bytes of each line before
 *         it exits, so 8 bytes are read past the end of every processed
 *         line; m0 accounts for that over-advance.
 *
 * NOTE(review): per line pair i0/i1 advance by 2*width+8 and are then moved
 * on by srcStride-8, and p0/p1 advance by width and are then moved on by
 * lumStride.  That lands on the next line pair only when
 * srcStride == 2*width and lumStride == width; the chroma pointers do not
 * have this restriction.  Confirm that callers guarantee those strides.
 */
DEFUN(yuyvtoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |

539 |
long width, long height, |

540 |
long lumStride, long chromStride, long srcStride)): |

541 |
link 0; |

542 |
[--sp] = (r7:4,p5:4); |

543 | |

544 |
p0 = r1; // Y top even |

545 | |

546 |
i2 = r2; // *u |

547 |
r2 = [fp + ARG_vdst]; |

548 |
i3 = r2; // *v |

549 | |

550 |
r1 = [fp + ARG_srcStride]; |

551 |
r2 = r0 + r1; |

552 |
r1 += -8; // i0,i1 is pre read need to correct |

553 |
m0 = r1; |

554 | |

555 |
i0 = r0; // yuyv_T even |

556 |
i1 = r2; // yuyv_B odd |

557 | |

558 |
p2 = [fp + ARG_lumStride]; |

559 |
p1 = p0 + p2; // Y bot odd |

560 | |

561 |
p5 = [fp + ARG_width]; |

562 |
p4 = [fp + ARG_height]; |

563 |
r0 = p5; |

564 |
p4 = p4 >> 1; |

565 |
p5 = p5 >> 2; |

566 | |

// m1 = chromStride - width/2: step from the end of one chroma line to the
// start of the next
567 |
r2 = [fp + ARG_chromStride]; |

568 |
r0 = r0 >> 1; |

569 |
r2 = r2 - r0; |

570 |
m1 = r2; |

571 | |

572 |
/* I0,I1 - src input line pointers |

573 |
* p0,p1 - luma output line pointers |

574 |
* I2 - dstU |

575 |
* I3 - dstV |

576 |
*/ |

577 | |

578 |
lsetup (0f, 1f) lc1 = p4; // H/2 |

// prime the inner loop: first 8 bytes of both lines and their packed luma
579 |
0: r0 = [i0++] || r2 = [i1++]; |

580 |
r1 = [i0++] || r3 = [i1++]; |

581 |
r4 = bytepack(r0, r1); |

582 |
r5 = bytepack(r2, r3); |

583 |
lsetup (2f, 3f) lc0 = p5; // W/4 |

// the >>8 vector shifts bring the second byte of every 16-bit pair (the
// chroma sample in YUYV order) into the low byte for byteop1p
584 |
2: r0 = r0 >> 8(v) || [p0++] = r4; // yyyy-even |

585 |
r1 = r1 >> 8(v) || [p1++] = r5; // yyyy-odd |

586 |
r2 = r2 >> 8(v); |

587 |
r3 = r3 >> 8(v); |

588 |
r4 = byteop1p(r1:0, r3:2); |

589 |
r5 = byteop1p(r1:0, r3:2) (r); |

590 |
r6 = pack(r5.l, r4.l); |

591 |
r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++]; |

592 |
r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++]; |

593 |
r4 = bytepack(r0, r1) || w[i2++] = r6.l; // uu |

594 |
3: r5 = bytepack(r2, r3) || w[i3++] = r6.h; // vv |

595 | |

// end of a line pair: apply the pointer corrections described above
596 |
i0 += m0; |

597 |
i1 += m0; |

598 |
i2 += m1; |

599 |
i3 += m1; |

600 |
p0 = p0 + p2; |

601 |
1: p1 = p1 + p2; |

602 | |

603 |
(r7:4,p5:4) = [sp++]; |

604 |
unlink; |

605 |
rts; |

606 |
DEFUN_END(yuyvtoyv12) |