1 | d3f3eea9 | Marc Hoffman | /* |
2 | * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com> |
3 | * April 20, 2007 |
4 | * |
5 | 8a322796 | Diego Biurrun | * Blackfin video color space converter operations |

6 | * convert I420 YV12 to RGB in various formats |
7 | d3f3eea9 | Marc Hoffman | * |

8 | * This file is part of FFmpeg. |
9 | * |
10 | * FFmpeg is free software; you can redistribute it and/or |
11 | * modify it under the terms of the GNU Lesser General Public |
12 | * License as published by the Free Software Foundation; either |
13 | * version 2.1 of the License, or (at your option) any later version. |
14 | * |
15 | * FFmpeg is distributed in the hope that it will be useful, |
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
18 | * Lesser General Public License for more details. |
19 | * |
20 | * You should have received a copy of the GNU Lesser General Public |
21 | * License along with FFmpeg; if not, write to the Free Software |
22 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
23 | */ |
26 | /* |
27 | 8a322796 | Diego Biurrun | YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock |

28 | and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits, packed into shorts. |
29 | d3f3eea9 | Marc Hoffman | |

31 | 4bdc44c7 | Diego Biurrun | The following calculation is used for the conversion: |

33 | 4bdc44c7 | Diego Biurrun | r = clipz((y-oy)*cy + crv*(v-128)) |

34 | g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128)) |
||

35 | b = clipz((y-oy)*cy + cbu*(u-128)) |
||

37 | 8a322796 | Diego Biurrun | y,u,v are prescaled by a factor of 4 i.e. left-shifted to gain precision. |

40 | 4bdc44c7 | Diego Biurrun | New factorization to eliminate the truncation error which was |

41 | 8a322796 | Diego Biurrun | occurring due to the byteop3p. |

44 | 8a322796 | Diego Biurrun | 1) Use byteop16m to subtract quad bytes. It operates on U8 values, |

45 | 4bdc44c7 | Diego Biurrun | so the offsets need to be renormalized to 8 bits. |

47 | 8a322796 | Diego Biurrun | 2) Scale operands up by a factor of 4 not 8 because Blackfin |

48 | 4bdc44c7 | Diego Biurrun | multiplies include a shift. |

50 | 8a322796 | Diego Biurrun | 3) Compute into the accumulators cy*yx0, cy*yx1. |

52 | 8a322796 | Diego Biurrun | 4) Compute each of the linear equations: |

53 | 4bdc44c7 | Diego Biurrun | r = clipz((y - oy) * cy + crv * (v - 128)) |

55 | 4bdc44c7 | Diego Biurrun | g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128)) |

57 | 4bdc44c7 | Diego Biurrun | b = clipz((y - oy) * cy + cbu * (u - 128)) |

59 | 8a322796 | Diego Biurrun | Reuse of the accumulators requires that we actually multiply |

60 | twice once with addition and the second time with a subtraction. |
62 | 8a322796 | Diego Biurrun | Because of this we need to compute the equations in the order R B |

63 | 4bdc44c7 | Diego Biurrun | then G saving the writes for B in the case of 24/32 bit color |

64 | formats. |
66 | 8a322796 | Diego Biurrun | API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, |

67 | 4bdc44c7 | Diego Biurrun | int dW, uint32_t *coeffs); |

68 | d3f3eea9 | Marc Hoffman | |

69 | 4bdc44c7 | Diego Biurrun | A B |

70 | --- --- |
||

71 | i2 = cb i3 = cr |
||

72 | i1 = coeff i0 = y |
||

74 | 4bdc44c7 | Diego Biurrun | Where coeffs have the following layout in memory. |

76 | 4bdc44c7 | Diego Biurrun | uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask; |

78 | 4bdc44c7 | Diego Biurrun | coeffs is a pointer to oy. |

80 | 8a322796 | Diego Biurrun | The {rgb} masks are only utilized by the 565 and 555 packing algorithms. Note the data |

81 | replication is used to simplify the internal algorithms for the dual Mac |
||

82 | architecture of Blackfin. |
||

84 | 8a322796 | Diego Biurrun | All routines are exported with _ff_bfin_ as a symbol prefix. |

86 | 8a322796 | Diego Biurrun | Rough performance gain compared against -O3: |

88 | 4bdc44c7 | Diego Biurrun | 2779809/1484290 187.28% |

90 | 4bdc44c7 | Diego Biurrun | which translates to ~33c/pel to ~57c/pel for the reference vs 17.5 |

91 | c/pel for the optimized implementations. Not sure why there is such a |
||

92 | huge variation on the reference codes on Blackfin I guess it must have |
||

93 | to do with the memory system. |
||

94 | d3f3eea9 | Marc Hoffman | */ |

/* Section selection: mL3 is the ordinary .text section; mL1 is .l1.text when
   __FDPIC__ is defined and falls back to mL3 otherwise.  MEM is the section
   name handed to DEFUN by the yuv2rgb line converters below. */
96 | #define mL3 .text |
97 | d2a4ecaf | Mike Frysinger | #ifdef __FDPIC__ |

98 | #define mL1 .l1.text |
99 | #else |
100 | #define mL1 mL3 |
101 | #endif |
102 | d3f3eea9 | Marc Hoffman | #define MEM mL1 |

/* DEFUN(fname, where, interface): switch to section `where`, export the
   symbol _ff_bfin_<fname> with function type, align it and emit its name so
   the caller can append ':' to form the label.  `interface` does not appear
   in the expansion; it only carries the C prototype for the reader. */
104 | #define DEFUN(fname,where,interface) \ |
105 | .section where; \ |
106 | .global _ff_bfin_ ## fname; \ |
107 | .type _ff_bfin_ ## fname, STT_FUNC; \ |
108 | .align 8; \ |
109 | _ff_bfin_ ## fname |
/* DEFUN_END(fname): record the size of _ff_bfin_<fname> as the distance from
   its label to the current location. */
111 | #define DEFUN_END(fname) \ |
112 | .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname |
115 | .text |
/* COEFF_LEN is loaded into l1 by the yuv2rgb routines: the coefficient table
   is 11 32-bit words (oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask).
   COEFF_REL_CY_OFF is loaded into m0 and used as the post-modify step on the
   first gmask read of each loop pass, after which the next table read is cy.
   NOTE(review): this relies on i1 wrapping inside the b1/l1 window
   (gmask at byte 40 + 16 - 44 = byte 12 = cy) - confirm. */
117 | #define COEFF_LEN 11*4 |
118 | #define COEFF_REL_CY_OFF 4*4 |
/* Frame-pointer-relative offsets (after "link 0") of the yuv2rgb arguments
   that are read from the stack: out, dW and coeffs.  Y, U and V are taken
   from r0, r1 and r2. */
120 | #define ARG_OUT 20 |
121 | #define ARG_W 24 |
122 | #define ARG_COEFF 28 |
/*
 * yuv2rgb565_line: convert one line of planar YUV into 16-bit packed pixels
 * (see the bit-layout diagram at .L0565).
 *
 * Y, U and V arrive in r0, r1, r2 and become the read pointers i0, i2, i3.
 * out is loaded into p1 from [fp+ARG_OUT] and dW into r3 from [fp+ARG_W].
 * r7:4 are pushed on entry and popped before rts; l1 is cleared on exit.
 * Coefficients are read sequentially through i1 in the order named by the
 * trailing comments (oy, oc, zero, cy, crv, ...).
 */
124 | DEFUN(yuv2rgb565_line,MEM, |
125 | (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)): |
126 | link 0; |
127 | [--sp] = (r7:4); |
128 | p1 = [fp+ARG_OUT]; |
129 | r3 = [fp+ARG_W]; |
131 | i0 = r0; |
132 | i2 = r1; |
133 | i3 = r2; |
134 | |||

||

||

||

||

||

||

/* Pre-load the first group: a 32-bit read gives 4 Y bytes, the two 16-bit
   reads give 2 U and 2 V bytes. */
142 | r0 = [i0++]; // 4Y |
143 | r1.l = w[i2++]; // 2u |
144 | r1.h = w[i3++]; // 2v |
145 | p0 = p0>>2; |
146 | |||

||

149 | /* |
150 | uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv |
151 | r0 -- used to load 4ys |
152 | r1 -- used to load 2us,2vs |
153 | r4 -- y3,y2 |
154 | r5 -- y1,y0 |
155 | r6 -- u1,u0 |
156 | r7 -- v1,v0 |
157 | */ |
158 | r2=[i1++]; // oy |
159 | .L0565: |
160 | /* |
161 | rrrrrrrr gggggggg bbbbbbbb |
162 | 5432109876543210 |
163 | bbbbb >>3 |
164 | gggggggg <<3 |
165 | rrrrrrrr <<8 |
166 | rrrrrggggggbbbbb |
167 | */ |
/* Subtract the offsets (r2 = oy, r3 = oc) from the packed bytes and widen
   them to 16-bit lanes, then pre-scale every lane by 4 (<< 2). */
168 | (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc |
169 | (r7,r6) = byteop16m (r1:0, r3:2) (r); |
170 | r5 = r5 << 2 (v); // y1,y0 |
171 | r4 = r4 << 2 (v); // y3,y2 |
172 | r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero |
173 | r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy |
/* First pixel pair: y1,y0 (r5) with the shared chroma u0/v0 (r6.l, r7.l). */
174 | /* Y' = y*cy */ |
175 | a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv |
177 | /* R = Y+ crv*(Cr-128) */ |
178 | r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); |
||

||

||

||

||

184 | /* B = Y+ cbu*(Cb-128) */ |
185 | r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l); |
/* Take the chroma term back out so a1/a0 hold Y' again for the next
   component. */
186 | a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask |
/* NOTE(review): byteop3p presumably clips each 16-bit result to a byte
   (the clipz of the formula) - confirm against the instruction reference. */
187 | r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu |
/* Move the component into its bit field, mask it and merge it into r3. */
188 | r2 = r2 << 8 (v); |
189 | r2 = r2 & r5; |
190 | r3 = r3 | r2; |
192 | /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ |
193 | a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv |
194 | r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); |
||

||

||

||

||

||

/* Second pixel pair: y3,y2 (r4) with chroma u1/v1 (r6.h, r7.h). */
201 | /* Y' = y*cy */ |
||

203 | a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv |
205 | /* R = Y+ crv*(Cr-128) */ |
206 | r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); |
||

||

||

||

||

212 | /* B = Y+ cbu*(Cb-128) */ |
213 | r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h); |
214 | a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask |
||

||

||

||

||

||

||

||

||

||

||

||

||

||

/* Clear the buffer length register for i1 before returning. */
230 | l1 = 0; |
||

231 | |||

232 | (r7:4) = [sp++]; |
||

233 | unlink; |
||

234 | rts; |
235 | DEFUN_END(yuv2rgb565_line) |
/*
 * yuv2rgb555_line: convert one line of planar YUV into 15-bit packed pixels
 * (see the bit-layout diagram at .L0555).
 *
 * Y, U and V arrive in r0, r1, r2 and become the read pointers i0, i2, i3.
 * out, dW and coeffs are read from the frame at ARG_OUT, ARG_W, ARG_COEFF.
 * p0 is set to dW >> 2.  One pass over .L0555-.L1555 consumes 4 Y bytes,
 * 2 U bytes and 2 V bytes and stores two 32-bit words through p1, each word
 * holding two 16-bit pixels.
 * The coefficient table is walked with i1 (b1 = base, l1 = COEFF_LEN) and
 * l1 is cleared again before returning.  r7:4 are saved and restored.
 * NOTE(review): b1/l1 presumably make i1 a circular buffer so the table
 * wraps back to oy at the end of each pass - confirm.
 */
237 | DEFUN(yuv2rgb555_line,MEM, |
238 | (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)): |
239 | link 0; |
240 | [--sp] = (r7:4); |
241 | p1 = [fp+ARG_OUT]; |
242 | r3 = [fp+ARG_W]; |
244 | i0 = r0; |
245 | i2 = r1; |
246 | i3 = r2; |
248 | r0 = [fp+ARG_COEFF]; |
249 | i1 = r0; |
250 | b1 = i1; |
251 | l1 = COEFF_LEN; |
252 | m0 = COEFF_REL_CY_OFF; |
253 | p0 = r3; |
/* Pre-load the first group: a 32-bit read gives 4 Y bytes, the two 16-bit
   reads give 2 U and 2 V bytes. */
255 | r0 = [i0++]; // 4Y |
256 | r1.l = w[i2++]; // 2u |
257 | r1.h = w[i3++]; // 2v |
258 | p0 = p0>>2; |
259 | |||

||

262 | /* |
263 | uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask |
264 | r0 -- used to load 4ys |
265 | r1 -- used to load 2us,2vs |
266 | r4 -- y3,y2 |
267 | r5 -- y1,y0 |
268 | r6 -- u1,u0 |
269 | r7 -- v1,v0 |
270 | */ |
271 | r2=[i1++]; // oy |
272 | .L0555: |
273 | /* |
274 | rrrrrrrr gggggggg bbbbbbbb |
275 | 5432109876543210 |
276 | bbbbb >>3 |
277 | gggggggg <<2 |
278 | rrrrrrrr <<7 |
279 | xrrrrrgggggbbbbb |
280 | */ |
/* Subtract the offsets (r2 = oy, r3 = oc) from the packed bytes and widen
   them to 16-bit lanes, then pre-scale every lane by 4 (<< 2). */
282 | (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc |
283 | (r7,r6) = byteop16m (r1:0, r3:2) (r); |
284 | r5 = r5 << 2 (v); // y1,y0 |
285 | r4 = r4 << 2 (v); // y3,y2 |
286 | r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero |
287 | r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy |
/* First pixel pair: y1,y0 (r5) with the shared chroma u0/v0 (r6.l, r7.l). */
288 | /* Y' = y*cy */ |
289 | a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv |
291 | /* R = Y+ crv*(Cr-128) */ |
292 | r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); |
/* Take the chroma term back out so a1/a0 hold Y' again for the next
   component. */
293 | a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask |
/* NOTE(review): byteop3p presumably clips each 16-bit result to a byte
   (the clipz of the formula) - confirm against the instruction reference. */
294 | r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu |
/* NOTE(review): the value computed under the "R" label is shifted >> 3,
   the slot the diagram above assigns to b, and the "B" value below is
   shifted << 7, the diagram's r slot.  Presumably the coefficient and mask
   order supplied by the caller decides the real channel layout - confirm. */
295 | r2 = r2 >> 3 (v); |
296 | r3 = r2 & r5; |
298 | /* B = Y+ cbu*(Cb-128) */ |
299 | r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l); |
300 | a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask |
301 | r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu |
302 | r2 = r2 << 7 (v); |
303 | r2 = r2 & r5; |
304 | r3 = r3 | r2; |
306 | /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ |
307 | a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv |
308 | r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); |
/* The gmask read post-modifies i1 by m0 (COEFF_REL_CY_OFF) instead of one
   word, so the next table read below is cy. */
309 | r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask |
310 | r2 = r2 << 2 (v); |
311 | r2 = r2 & r5; |
312 | r3 = r3 | r2; |
/* Store the first two packed pixels. */
313 | [p1++]=r3 || r1=[i1++]; // cy |
/* Second pixel pair: y3,y2 (r4) with chroma u1/v1 (r6.h, r7.h). */
315 | /* Y' = y*cy */ |
317 | a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv |
318 | |||

||

||

||

||

||

||

326 | /* B = Y+ cbu*(Cb-128) */ |
327 | r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h); |
328 | a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask |
329 | r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu |
330 | r2 = r2 << 7 (v); |
331 | r2 = r2 & r5; |
332 | r3 = r3 | r2; |
334 | /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ |
335 | a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv |
336 | r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask |
/* The loads for the next group of 4 Y / 2 U / 2 V are issued alongside the
   final packing steps of this one. */
337 | r2 = byteop3p(r3:2, r1:0)(LO) || r0=[i0++]; // 4Y |
338 | r2 = r2 << 2 (v) || r1.l=w[i2++]; // 2u |
339 | r2 = r2 & r5; |
340 | r3 = r3 | r2; |
341 | [p1++]=r3 || r1.h=w[i3++]; // 2v |
343 | .L1555: r2=[i1++]; // oy |
/* Clear the buffer length register for i1 before returning. */
345 | l1 = 0; |
347 | (r7:4) = [sp++]; |
348 | unlink; |
349 | rts; |
350 | DEFUN_END(yuv2rgb555_line) |
/*
 * yuv2rgb24_line: convert one line of planar YUV into packed 3-byte pixels.
 *
 * Y, U and V arrive in r0, r1, r2 and become the read pointers i0, i2, i3.
 * out, dW and coeffs are read from the frame at ARG_OUT, ARG_W, ARG_COEFF.
 * The hardware loop .L0888-.L1888 runs dW >> 2 times; each pass consumes
 * 4 Y bytes, 2 U bytes and 2 V bytes and stores 12 output bytes.
 * p1 and p2 (= p1 + 3) walk the output three bytes apart: every component
 * is computed for two pixels at once and the pair is stored one byte through
 * p1 and one through p2; both pointers then skip 3 bytes to the next pair.
 * Bytes are stored in the order labelled R, G, B; the B pair is parked in r3
 * while G is computed.
 * The coefficient table is walked with i1 (b1 = base, l1 = COEFF_LEN) and
 * l1 is cleared again before returning.  r7:4 are saved and restored.
 * NOTE(review): b1/l1 presumably make i1 a circular buffer so the table
 * wraps back to oy at the end of each pass - confirm.
 */
352 | DEFUN(yuv2rgb24_line,MEM, |
353 | (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)): |
354 | link 0; |
355 | [--sp] = (r7:4); |
356 | p1 = [fp+ARG_OUT]; |
357 | r3 = [fp+ARG_W]; |
358 | p2 = p1; |
359 | p2 += 3; |
361 | i0 = r0; |
362 | i2 = r1; |
363 | i3 = r2; |
365 | r0 = [fp+ARG_COEFF]; // coeff buffer |
366 | i1 = r0; |
367 | b1 = i1; |
368 | l1 = COEFF_LEN; |
369 | m0 = COEFF_REL_CY_OFF; |
370 | p0 = r3; |
/* Pre-load the first group: a 32-bit read gives 4 Y bytes, the two 16-bit
   reads give 2 U and 2 V bytes. */
372 | r0 = [i0++]; // 4Y |
373 | r1.l = w[i2++]; // 2u |
374 | r1.h = w[i3++]; // 2v |
375 | p0 = p0>>2; |
377 | lsetup (.L0888, .L1888) lc0 = p0; |
379 | /* |
380 | uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask |
381 | r0 -- used to load 4ys |
382 | r1 -- used to load 2us,2vs |
383 | r4 -- y3,y2 |
384 | r5 -- y1,y0 |
385 | r6 -- u1,u0 |
386 | r7 -- v1,v0 |
387 | */ |
388 | r2=[i1++]; // oy |
389 | .L0888: |
/* Subtract the offsets (r2 = oy, r3 = oc) from the packed bytes and widen
   them to 16-bit lanes, then pre-scale every lane by 4 (<< 2). */
390 | (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc |
391 | (r7,r6) = byteop16m (r1:0, r3:2) (r); |
392 | r5 = r5 << 2 (v); // y1,y0 |
393 | r4 = r4 << 2 (v); // y3,y2 |
394 | r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero |
395 | r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy |
/* First pixel pair: y1,y0 (r5) with the shared chroma u0/v0 (r6.l, r7.l). */
397 | /* Y' = y*cy */ |
398 | a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv |
400 | /* R = Y+ crv*(Cr-128) */ |
401 | r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); |
/* Take the chroma term back out so a1/a0 hold Y' again for the next
   component.  The mask words loaded into r5 are never applied in this
   routine; the loads still advance i1 past those table entries. */
402 | a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask |
/* NOTE(review): byteop3p presumably clips each 16-bit result to a byte
   (the clipz of the formula) - confirm against the instruction reference. */
403 | r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu |
/* One byte of the pair goes out through p1, the other (after >> 16)
   through p2. */
404 | r2=r2>>16 || B[p1++]=r2; |
405 | B[p2++]=r2; |
407 | /* B = Y+ cbu*(Cb-128) */ |
408 | r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l); |
409 | a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask |
/* The B pair is kept in r3 and stored after G. */
410 | r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu |
412 | /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ |
413 | a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv |
414 | r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); |
/* The gmask read post-modifies i1 by m0 (COEFF_REL_CY_OFF) instead of one
   word, so the next table read below is cy. */
415 | r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask, oy,cy,zero |
417 | r2=r2>>16 || B[p1++]=r2; |
418 | B[p2++]=r2; |
420 | r3=r3>>16 || B[p1++]=r3; |
421 | B[p2++]=r3 || r1=[i1++]; // cy |
/* Both pointers wrote 3 bytes; skip the 3 bytes the other one wrote. */
423 | p1+=3; |
424 | p2+=3; |
/* Second pixel pair: y3,y2 (r4) with chroma u1/v1 (r6.h, r7.h). */
425 | /* Y' = y*cy */ |
426 | a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv |
428 | /* R = Y+ crv*(Cr-128) */ |
429 | r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); |
430 | a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask |
431 | r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu |
432 | r2=r2>>16 || B[p1++]=r2; |
433 | B[p2++]=r2; |
435 | /* B = Y+ cbu*(Cb-128) */ |
436 | r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h); |
437 | a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask |
438 | r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu |
440 | /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ |
441 | a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv |
442 | r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); |
443 | r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++]; // gmask |
/* The loads for the next group of 4 Y / 2 U / 2 V and the next oy are issued
   alongside the final stores of this pass. */
444 | r2=r2>>16 || B[p1++]=r2 || r0 = [i0++]; // 4y |
445 | B[p2++]=r2 || r1.l = w[i2++]; // 2u |
446 | r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v |
447 | B[p2++]=r3 || r2=[i1++]; // oy |
449 | p1+=3; |
450 | .L1888: p2+=3; |
/* Clear the buffer length register for i1 before returning. */
452 | l1 = 0; |
454 | (r7:4) = [sp++]; |
455 | unlink; |
456 | rts; |
457 | 22a11d57 | Marc Hoffman | DEFUN_END(yuv2rgb24_line) |

/* Frame-pointer-relative offsets (after "link 0") of the uyvytoyv12 /
   yuyvtoyv12 arguments that are read from the stack.  src, ydst and udst
   are taken from r0, r1 and r2. */
461 | #define ARG_vdst 20 |
462 | #define ARG_width 24 |
463 | #define ARG_height 28 |
464 | #define ARG_lumStride 32 |
465 | #define ARG_chromStride 36 |
466 | #define ARG_srcStride 40 |
/*
 * uyvytoyv12: split packed UYVY source lines into separate luma and chroma
 * outputs, two source lines per outer-loop pass.
 *
 * src (r0) becomes i0 (top line) and src + srcStride becomes i1 (bottom
 * line); ydst (r1) becomes p0.  The outer loop runs height >> 1 times.
 * Each inner pass reads 8 bytes from each line, stores 4 luma bytes per line
 * through p0 and p1 and one 16-bit chroma pair each through i2 and i3.
 * r7:4 and p5:4 are saved and restored.
 */
468 | DEFUN(uyvytoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
469 | long width, long height, |
470 | long lumStride, long chromStride, long srcStride)): |
471 | link 0; |
472 | [--sp] = (r7:4,p5:4); |
474 | p0 = r1; // Y top even |
475 | |||

||

||

||

/* r2 = start of the second source line.  m0 = srcStride - 8 is the step
   added to both source pointers after each row pair; the -8 accounts for
   the two words per line that the loop has already read ahead. */
480 | r1 = [fp + ARG_srcStride]; |
481 | r2 = r0 + r1; |
482 | 69a6db95 | Marc Hoffman | r1 += -8; // i0,i1 is pre read need to correct |

483 | bf4a90fc | Marc Hoffman | m0 = r1; |

485 | i0 = r0; // uyvy_T even |
486 | i1 = r2; // uyvy_B odd |
487 | |||

||

||

491 | p5 = [fp + ARG_width]; |
492 | p4 = [fp + ARG_height]; |
493 | 45eeae39 | Marc Hoffman | r0 = p5; |

/* Outer loop count: one pass per pair of source lines. */
494 | bf4a90fc | Marc Hoffman | p4 = p4 >> 1; |

||

497 | 45eeae39 | Marc Hoffman | r2 = [fp + ARG_chromStride]; |

498 | r0 = r0 >> 1; |
||

||

||

502 | bf4a90fc | Marc Hoffman | /* I0,I1 - src input line pointers |

503 | * p0,p1 - luma output line pointers |
||

504 | * I2 - dstU |
||

505 | * I3 - dstV |
||

506 | */ |
507 | |||

508 | e9d4375f | Marc Hoffman | lsetup (0f, 1f) lc1 = p4; // H/2 |

/* Read ahead the first 8 bytes of both lines and combine top and bottom
   words for the chroma output.
   NOTE(review): byteop1p presumably averages the corresponding bytes of the
   two lines - confirm against the instruction reference. */
509 | 0: r0 = [i0++] || r2 = [i1++]; |
510 | r1 = [i0++] || r3 = [i1++]; |
511 | r4 = byteop1p(r1:0, r3:2); |
512 | r5 = byteop1p(r1:0, r3:2) (r); |
513 | lsetup (2f, 3f) lc0 = p5; // W/4 |
/* Luma: shift the upper byte of every 16-bit lane down, then gather four of
   them into one word per line. */
514 | 2: r0 = r0 >> 8(v); |
515 | bf4a90fc | Marc Hoffman | r1 = r1 >> 8(v); |

516 | r2 = r2 >> 8(v); |
517 | r3 = r3 >> 8(v); |
518 | r0 = bytepack(r0, r1); |
519 | e9d4375f | Marc Hoffman | r2 = bytepack(r2, r3) || [p0++] = r0; // yyyy |

/* Chroma: regroup the combined values so r6.l carries the two U bytes and
   r6.h the two V bytes, while the next 8 source bytes of each line are
   loaded in parallel. */
520 | r6 = pack(r5.l, r4.l) || [p1++] = r2; // yyyy |
521 | r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++]; |
522 | r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++]; |
523 | r4 = byteop1p(r1:0, r3:2) || w[i2++] = r6.l; // uu |
524 | 3: r5 = byteop1p(r1:0, r3:2) (r) || w[i3++] = r6.h; // vv |
||

/* End of a row pair: step the source, chroma and luma pointers on. */
526 | i0 += m0; |
527 | i1 += m0; |
528 | 45eeae39 | Marc Hoffman | i2 += m1; |

529 | i3 += m1; |
530 | bf4a90fc | Marc Hoffman | p0 = p0 + p2; |

||

533 | (r7:4,p5:4) = [sp++]; |
534 | unlink; |
535 | rts; |
536 | DEFUN_END(uyvytoyv12) |
/*
 * yuyvtoyv12: split packed YUYV source lines into separate luma and chroma
 * outputs, two source lines per outer-loop pass.
 *
 * src (r0) becomes i0; udst (r2) becomes i2 and vdst, read from
 * [fp + ARG_vdst], becomes i3.  The outer loop runs height >> 1 times and
 * the inner loop width >> 2 times.  Each inner pass reads 8 bytes from each
 * line, stores 4 luma bytes per line through p0 and p1 and one 16-bit chroma
 * pair each through i2 and i3.  After every row pair i0/i1 advance by m0,
 * i2/i3 by m1 and p0/p1 by p2.
 */
538 | DEFUN(yuyvtoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
539 | long width, long height, |
540 | long lumStride, long chromStride, long srcStride)): |
541 | link 0; |
542 | [--sp] = (r7:4,p5:4); |
543 | |||

||

546 | i2 = r2; // *u |
547 | r2 = [fp + ARG_vdst]; |
548 | i3 = r2; // *v |
549 | |||

||

||

||

||

555 | i0 = r0; // yuyv_T even |
556 | i1 = r2; // yuyv_B odd |
557 | |||

||

||

561 | p5 = [fp + ARG_width]; |
562 | p4 = [fp + ARG_height]; |
563 | r0 = p5; |
/* Loop counts: height / 2 row pairs, width / 4 inner passes per pair. */
564 | p4 = p4 >> 1; |
565 | p5 = p5 >> 2; |
566 | |||

||

||

||

||

572 | /* I0,I1 - src input line pointers |
573 | * p0,p1 - luma output line pointers |
574 | * I2 - dstU |
575 | * I3 - dstV |
576 | */ |
578 | lsetup (0f, 1f) lc1 = p4; // H/2 |
/* Read ahead the first 8 bytes of both lines and gather the luma bytes (the
   low byte of every 16-bit lane) into one word per line. */
579 | 0: r0 = [i0++] || r2 = [i1++]; |
580 | r1 = [i0++] || r3 = [i1++]; |
581 | r4 = bytepack(r0, r1); |
582 | r5 = bytepack(r2, r3); |
583 | lsetup (2f, 3f) lc0 = p5; // W/4 |
/* Store the luma words while shifting the chroma bytes (upper byte of every
   lane) down for the combine step. */
584 | 2: r0 = r0 >> 8(v) || [p0++] = r4; // yyyy-even |
585 | r1 = r1 >> 8(v) || [p1++] = r5; // yyyy-odd |
586 | r2 = r2 >> 8(v); |
587 | r3 = r3 >> 8(v); |
/* NOTE(review): byteop1p presumably averages the corresponding chroma bytes
   of the two lines - confirm against the instruction reference. */
588 | r4 = byteop1p(r1:0, r3:2); |
589 | r5 = byteop1p(r1:0, r3:2) (r); |
/* Regroup so r6.l carries the two U bytes and r6.h the two V bytes, while
   the next 8 source bytes of each line are loaded in parallel. */
590 | r6 = pack(r5.l, r4.l); |
591 | r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++]; |
592 | r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++]; |
593 | r4 = bytepack(r0, r1) || w[i2++] = r6.l; // uu |
594 | 3: r5 = bytepack(r2, r3) || w[i3++] = r6.h; // vv |
/* End of a row pair: step the source, chroma and luma pointers on. */
596 | i0 += m0; |
597 | i1 += m0; |
598 | i2 += m1; |
599 | i3 += m1; |
600 | p0 = p0 + p2; |
601 | 1: p1 = p1 + p2; |
602 | |||

||

||

||

606 | DEFUN_END(yuyvtoyv12) |