Statistics
| Branch: | Revision:

ffmpeg / libswscale / internal_bfin.S @ d2a4ecaf

History | View | Annotate | Download (19.6 KB)

1
/*
2
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
3
 *                    April 20, 2007
4
 *
5
 * Blackfin Video Color Space Converters Operations
6
 *  convert I420 YV12 to RGB in various formats,
7
 *
8
 * This file is part of FFmpeg.
9
 *
10
 * FFmpeg is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU Lesser General Public
12
 * License as published by the Free Software Foundation; either
13
 * version 2.1 of the License, or (at your option) any later version.
14
 *
15
 * FFmpeg is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * Lesser General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU Lesser General Public
21
 * License along with FFmpeg; if not, write to the Free Software
22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
 */
24

    
25

    
26
/*
27
    YUV420 to RGB565 conversion.  This routine takes a YUV 420 planar macroblock
28
    and converts it to RGB565.  R:5 bits, G:6 bits, B:5 bits.. packed into shorts
29

    
30

    
31
    The following calculation is used for the conversion:
32

    
33
      r = clipz((y-oy)*cy  + crv*(v-128))
34
      g = clipz((y-oy)*cy  + cgv*(v-128) + cgu*(u-128))
35
      b = clipz((y-oy)*cy  + cbu*(u-128))
36

    
37
    y,u,v are pre scaled by a factor of 4 i.e. left shifted to gain precision.
38

    
39

    
40
    New factorization to eliminate the truncation error which was
41
    occurring due to the byteop3p.
42

    
43

    
44
  1) use the byteop16m to subtract quad bytes we use this in U8 this
45
   then so the offsets need to be renormalized to 8bits.
46

    
47
  2) scale operands up by a factor of 4 not 8 because Blackfin
48
     multiplies include a shift.
49

    
50
  3) compute into the accumulators cy*yx0, cy*yx1
51

    
52
  4) compute each of the linear equations
53
      r = clipz((y-oy)*cy  + crv*(v-128))
54

    
55
      g = clipz((y-oy)*cy  + cgv*(v-128) + cgu*(u-128))
56

    
57
      b = clipz((y-oy)*cy  + cbu*(u-128))
58

    
59
     reuse of the accumulators requires that we actually multiply
60
     twice once with addition and the second time with a subtraction.
61

    
62
     because of this we need to compute the equations in the order R B
63
     then G saving the writes for B in the case of 24/32 bit color
64
     formats.
65

    
66
    api: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
67
                       int dW, uint32_t *coeffs);
68

    
69
        A          B
70
        ---        ---
71
        i2 = cb    i3 = cr
72
        i1 = coeff i0 = y
73

    
74
  Where coeffs have the following layout in memory.
75

    
76
  uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask;
77

    
78
  coeffs is a pointer to oy.
79

    
80
  the {rgb} masks are only utilized by the 565 packing algorithm. Note the data
81
  replication is used to simplify the internal algorithms for the dual mac architecture
82
  of BlackFin.
83

    
84
  All routines are exported with _ff_bfin_ as a symbol prefix
85

    
86
  rough performance gain compared against -O3:
87

    
88
  2779809/1484290 187.28%
89

    
90
  which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
91
  c/pel for the optimized implementations. Not sure why there is such a
92
  huge variation on the reference codes on Blackfin I guess it must have
93
  to do with the memory system.
94

    
95
*/
96

    
97
/*
 * Section names passed as the `where` argument of DEFUN.
 *   mL3 - the ordinary .text section.
 *   mL1 - .l1.text when __FDPIC__ is defined, otherwise identical to mL3
 *         (presumably .l1.text is placed in on-chip L1 instruction memory
 *         by the linker script -- TODO confirm).
 *   MEM - alias of mL1; used by the yuv2rgb*_line routines in this file.
 */
#define mL3 .text
98
#ifdef __FDPIC__
99
#define mL1 .l1.text
100
#else
101
#define mL1 mL3
102
#endif
103
#define MEM mL1
104

    
105
/*
 * DEFUN(fname, where, interface): open the definition of an exported routine.
 *   fname     - base name; the emitted symbol is _ff_bfin_<fname>
 *   where     - section to switch to (mL3 / MEM above)
 *   interface - C prototype of the routine; it is not used in the expansion
 *               and only documents the calling interface at the use site
 * The expansion ends with the bare symbol name, so the use site supplies the
 * trailing ':' that turns it into a label.
 */
#define DEFUN(fname,where,interface) \
106
        .section where;              \
107
        .global _ff_bfin_ ## fname;  \
108
        .type _ff_bfin_ ## fname, STT_FUNC; \
109
        .align 8;                    \
110
        _ff_bfin_ ## fname
111

    
112
/*
 * DEFUN_END(fname): close a routine opened with DEFUN by recording its size
 * (current location minus the start of _ff_bfin_<fname>) in the symbol table.
 */
#define DEFUN_END(fname) \
113
        .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname
114

    
115

    
116
.text
117

    
118
/*
 * COEFF_LEN is loaded into l1 by the yuv2rgb*_line routines, i.e. it is the
 * length in bytes (11 32-bit words) of the circular buffer that i1 walks
 * over the coefficient table.
 */
#define COEFF_LEN        11*4
119
/*
 * COEFF_REL_CY_OFF is loaded into m0 and applied as the post-modify of the
 * first gmask load in each loop iteration, so that the very next load
 * through i1 fetches cy again for the second pixel pair.
 */
#define COEFF_REL_CY_OFF 4*4
120

    
121
/*
 * Offsets from fp (after `link 0`) of the arguments that the yuv2rgb*_line
 * routines read from the stack: out, dW and coeffs.  The first three
 * arguments (Y, U, V) are taken from r0, r1 and r2.
 */
#define ARG_OUT   20
122
#define ARG_W     24
123
#define ARG_COEFF 28
124

    
125
/*
 * yuv2rgb565_line: convert one line of planar YUV input into packed 16-bit
 * pixels, two pixels per 32-bit store.
 *
 * Inputs:
 *   r0 = Y, r1 = U, r2 = V   -> moved to i0, i2, i3
 *   [fp+ARG_OUT]   = out      -> p1
 *   [fp+ARG_W]     = dW       -> loop count is dW >> 2
 *   [fp+ARG_COEFF] = coeffs   -> i1, walked as a circular buffer
 *                                (b1 = base, l1 = COEFF_LEN)
 *
 * Each loop iteration consumes 4 Y bytes, 2 U bytes and 2 V bytes and
 * stores two 32-bit words through p1.  The three colour components are
 * placed with >>3, <<8 and <<3 and masked with rmask, bmask and gmask.
 *
 * The loop is software pipelined: the Y/U/V words for the next iteration
 * are loaded at the end of the current one (and once before the loop), so
 * one input group past the last converted one is read.
 *
 * r7:4 are saved and restored.  l1 is reset to 0 before returning.
 *
 * NOTE(review): the bit-layout diagram inside the loop shows blue in the low
 * five bits, while the code puts the value computed with crv/rmask there
 * (>>3) and the cbu/bmask value in the top bits (<<8).  Presumably the
 * coefficient table passed by the caller is arranged to match -- confirm.
 * NOTE(review): nothing here special-cases dW < 4 (loop count 0) or a dW
 * that is not a multiple of 4 -- confirm against the callers.
 */
DEFUN(yuv2rgb565_line,MEM,
126
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
127
        link 0;
128
        [--sp] = (r7:4);
129
        p1 = [fp+ARG_OUT];
130
        r3 = [fp+ARG_W];
131

    
132
        i0 = r0;
133
        i2 = r1;
134
        i3 = r2;
135

    
136
        r0 = [fp+ARG_COEFF];
137
        i1 = r0;
138
        b1 = i1;
139
        l1 = COEFF_LEN;
140
        m0 = COEFF_REL_CY_OFF;
141
        p0 = r3;
142

    
143
        r0   = [i0++];         // 4Y
144
        r1.l = w[i2++];        // 2u
145
        r1.h = w[i3++];        // 2v
146
        p0 = p0>>2;            // one iteration per 4 Y samples
147

    
148
        lsetup (.L0565, .L1565) lc0 = p0;
149

    
150
        /*
151
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
152
           r0 -- used to load 4ys
153
           r1 -- used to load 2us,2vs
154
           r4 -- y3,y2
155
           r5 -- y1,y0
156
           r6 -- u1,u0
157
           r7 -- v1,v0
158
        */
159
                                                              r2=[i1++]; // oy
160
.L0565:
161
        /*
162
        rrrrrrrr gggggggg bbbbbbbb
163
         5432109876543210
164
                    bbbbb >>3
165
              gggggggg    <<3
166
         rrrrrrrr         <<8
167
         rrrrrggggggbbbbb
168
        */
169
        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
170
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
171
        r5 = r5 << 2 (v);                                                // y1,y0
172
        r4 = r4 << 2 (v);                                                // y3,y2
173
        r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
174
        r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy
175
        /* first pixel pair (y1,y0 with u0,v0): Y' = y*cy */
176
        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
177

    
178
        /* R = Y+ crv*(Cr-128) */
179
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
180
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
181
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
182
        r2 = r2 >> 3 (v);
183
        r3 = r2 & r5;
184

    
185
        /* B = Y+ cbu*(Cb-128) */
186
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
187
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
188
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
189
        r2 = r2 << 8 (v);
190
        r2 = r2 & r5;
191
        r3 = r3 | r2;
192

    
193
        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
194
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
195
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
196
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask, then step i1 back round to cy
197
        r2 = r2 << 3 (v);
198
        r2 = r2 & r5;
199
        r3 = r3 | r2;
200
        [p1++]=r3                                          || r1=[i1++]; // cy
201

    
202
        /* second pixel pair (y3,y2 with u1,v1): Y' = y*cy */
203

    
204
        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
205

    
206
        /* R = Y+ crv*(Cr-128) */
207
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
208
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
209
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
210
        r2 = r2 >> 3 (v);
211
        r3 = r2 & r5;
212

    
213
        /* B = Y+ cbu*(Cb-128) */
214
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
215
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
216
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
217
        r2 = r2 << 8 (v);
218
        r2 = r2 & r5;
219
        r3 = r3 | r2;
220

    
221
        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
222
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
223
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
224
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r0   =  [i0++];        // 4Y (next iteration)
225
        r2 = r2 << 3 (v)                                   || r1.l = w[i2++];        // 2u
226
        r2 = r2 & r5;
227
        r3 = r3 | r2;
228
        [p1++]=r3                                          || r1.h = w[i3++];        // 2v
229
.L1565:                                                       r2=[i1++]; // oy
230

    
231
        l1 = 0;                // turn circular addressing on i1 off again
232

    
233
        (r7:4) = [sp++];
234
        unlink;
235
        rts;
236
DEFUN_END(yuv2rgb565_line)
237

    
238
/*
 * yuv2rgb555_line: convert one line of planar YUV input into packed 16-bit
 * pixels (x:1 r:5 g:5 b:5 per the diagram below), two pixels per 32-bit
 * store.
 *
 * Inputs:
 *   r0 = Y, r1 = U, r2 = V   -> moved to i0, i2, i3
 *   [fp+ARG_OUT]   = out      -> p1
 *   [fp+ARG_W]     = dW       -> loop count is dW >> 2
 *   [fp+ARG_COEFF] = coeffs   -> i1, walked as a circular buffer
 *                                (b1 = base, l1 = COEFF_LEN)
 *
 * Each loop iteration consumes 4 Y bytes, 2 U bytes and 2 V bytes and
 * stores two 32-bit words through p1.  The three colour components are
 * placed with >>3, <<7 and <<2 and masked with rmask, bmask and gmask.
 *
 * The loop is software pipelined: the Y/U/V words for the next iteration
 * are loaded at the end of the current one (and once before the loop), so
 * one input group past the last converted one is read.
 *
 * r7:4 are saved and restored.  l1 is reset to 0 before returning.
 *
 * NOTE(review): the bit-layout diagram inside the loop shows blue in the low
 * five bits, while the code puts the value computed with crv/rmask there
 * (>>3) and the cbu/bmask value in the top bits (<<7).  Presumably the
 * coefficient table passed by the caller is arranged to match -- confirm.
 * NOTE(review): nothing here special-cases dW < 4 (loop count 0) or a dW
 * that is not a multiple of 4 -- confirm against the callers.
 */
DEFUN(yuv2rgb555_line,MEM,
239
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
240
        link 0;
241
        [--sp] = (r7:4);
242
        p1 = [fp+ARG_OUT];
243
        r3 = [fp+ARG_W];
244

    
245
        i0 = r0;
246
        i2 = r1;
247
        i3 = r2;
248

    
249
        r0 = [fp+ARG_COEFF];
250
        i1 = r0;
251
        b1 = i1;
252
        l1 = COEFF_LEN;
253
        m0 = COEFF_REL_CY_OFF;
254
        p0 = r3;
255

    
256
        r0   = [i0++];         // 4Y
257
        r1.l = w[i2++];        // 2u
258
        r1.h = w[i3++];        // 2v
259
        p0 = p0>>2;            // one iteration per 4 Y samples
260

    
261
        lsetup (.L0555, .L1555) lc0 = p0;
262

    
263
        /*
264
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
265
           r0 -- used to load 4ys
266
           r1 -- used to load 2us,2vs
267
           r4 -- y3,y2
268
           r5 -- y1,y0
269
           r6 -- u1,u0
270
           r7 -- v1,v0
271
        */
272
                                                              r2=[i1++]; // oy
273
.L0555:
274
        /*
275
        rrrrrrrr gggggggg bbbbbbbb
276
         5432109876543210
277
                    bbbbb >>3
278
               gggggggg   <<2
279
          rrrrrrrr        <<7
280
         xrrrrrgggggbbbbb
281
        */
282

    
283
        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
284
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
285
        r5 = r5 << 2 (v);                                                // y1,y0
286
        r4 = r4 << 2 (v);                                                // y3,y2
287
        r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
288
        r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy
289
        /* first pixel pair (y1,y0 with u0,v0): Y' = y*cy */
290
        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
291

    
292
        /* R = Y+ crv*(Cr-128) */
293
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
294
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
295
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
296
        r2 = r2 >> 3 (v);
297
        r3 = r2 & r5;
298

    
299
        /* B = Y+ cbu*(Cb-128) */
300
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
301
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
302
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
303
        r2 = r2 << 7 (v);
304
        r2 = r2 & r5;
305
        r3 = r3 | r2;
306

    
307
        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
308
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
309
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
310
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask, then step i1 back round to cy
311
        r2 = r2 << 2 (v);
312
        r2 = r2 & r5;
313
        r3 = r3 | r2;
314
        [p1++]=r3                                          || r1=[i1++]; // cy
315

    
316
        /* second pixel pair (y3,y2 with u1,v1): Y' = y*cy */
317

    
318
        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
319

    
320
        /* R = Y+ crv*(Cr-128) */
321
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
322
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
323
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
324
        r2 = r2 >> 3 (v);
325
        r3 = r2 & r5;
326

    
327
        /* B = Y+ cbu*(Cb-128) */
328
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
329
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
330
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
331
        r2 = r2 << 7 (v);
332
        r2 = r2 & r5;
333
        r3 = r3 | r2;
334

    
335
        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
336
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
337
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
338
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r0=[i0++];     // 4Y (next iteration)
339
        r2 = r2 << 2 (v)                                   || r1.l=w[i2++];  // 2u
340
        r2 = r2 & r5;
341
        r3 = r3 | r2;
342
        [p1++]=r3                                          || r1.h=w[i3++]; // 2v
343

    
344
.L1555:                                                       r2=[i1++]; // oy
345

    
346
        l1 = 0;                // turn circular addressing on i1 off again
347

    
348
        (r7:4) = [sp++];
349
        unlink;
350
        rts;
351
DEFUN_END(yuv2rgb555_line)
352

    
353
/*
 * yuv2rgb24_line: convert one line of planar YUV input into packed 24-bit
 * pixels, written one byte at a time.
 *
 * Inputs:
 *   r0 = Y, r1 = U, r2 = V   -> moved to i0, i2, i3
 *   [fp+ARG_OUT]   = out      -> p1; p2 = p1 + 3 addresses the next pixel
 *   [fp+ARG_W]     = dW       -> loop count is dW >> 2
 *   [fp+ARG_COEFF] = coeffs   -> i1, walked as a circular buffer
 *                                (b1 = base, l1 = COEFF_LEN)
 *
 * Each loop iteration consumes 4 Y bytes, 2 U bytes and 2 V bytes and
 * writes 12 bytes.  Every component is computed for two pixels at once:
 * the low half goes out through p1 and, after a >>16, the high half goes
 * out through p2.  Bytes are stored in the order crv component, G, cbu
 * component; the cbu result is kept in r3 until G has been written.  After
 * each pixel pair both pointers skip the 3 bytes the other one wrote.
 *
 * The rmask/bmask/gmask words are still loaded into r5, which keeps i1 in
 * step with the table, but they are not applied here.
 *
 * The loop is software pipelined: the Y/U/V words for the next iteration
 * are loaded at the end of the current one (and once before the loop), so
 * one input group past the last converted one is read.
 *
 * r7:4 are saved and restored.  l1 is reset to 0 before returning.
 *
 * NOTE(review): `rN=rN>>16 || B[p1++]=rN` relies on the store in a
 * multi-issue instruction seeing the pre-shift value -- confirm in the
 * Blackfin programming reference.
 * NOTE(review): nothing here special-cases dW < 4 (loop count 0) or a dW
 * that is not a multiple of 4 -- confirm against the callers.
 */
DEFUN(yuv2rgb24_line,MEM,
354
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
355
        link 0;
356
        [--sp] = (r7:4);
357
        p1 = [fp+ARG_OUT];
358
        r3 = [fp+ARG_W];
359
        p2 = p1;
360
        p2 += 3;               // p2 addresses the pixel after the one at p1
361

    
362
        i0 = r0;
363
        i2 = r1;
364
        i3 = r2;
365

    
366
        r0 = [fp+ARG_COEFF]; // coeff buffer
367
        i1 = r0;
368
        b1 = i1;
369
        l1 = COEFF_LEN;
370
        m0 = COEFF_REL_CY_OFF;
371
        p0 = r3;
372

    
373
        r0   = [i0++];         // 4Y
374
        r1.l = w[i2++];        // 2u
375
        r1.h = w[i3++];        // 2v
376
        p0 = p0>>2;            // one iteration per 4 Y samples
377

    
378
        lsetup (.L0888, .L1888) lc0 = p0;
379

    
380
        /*
381
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
382
           r0 -- used to load 4ys
383
           r1 -- used to load 2us,2vs
384
           r4 -- y3,y2
385
           r5 -- y1,y0
386
           r6 -- u1,u0
387
           r7 -- v1,v0
388
        */
389
                                                              r2=[i1++]; // oy
390
.L0888:
391
        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
392
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
393
        r5 = r5 << 2 (v);               // y1,y0
394
        r4 = r4 << 2 (v);               // y3,y2
395
        r6 = r6 << 2 (v) || r0=[i1++];  // u1,u0, r0=zero
396
        r7 = r7 << 2 (v) || r1=[i1++];  // v1,v0  r1=cy
397

    
398
        /* first pixel pair (y1,y0 with u0,v0): Y' = y*cy */
399
        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
400

    
401
        /* R = Y+ crv*(Cr-128) */
402
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
403
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
404
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
405
        r2=r2>>16 || B[p1++]=r2;
406
                     B[p2++]=r2;
407

    
408
        /* B = Y+ cbu*(Cb-128) */
409
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
410
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
411
        r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
412

    
413
        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
414
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
415
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
416
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask, then step i1 back round to cy
417

    
418
        r2=r2>>16 || B[p1++]=r2;
419
                     B[p2++]=r2;
420

    
421
        r3=r3>>16 || B[p1++]=r3;
422
                     B[p2++]=r3                            || r1=[i1++]; // cy
423

    
424
        p1+=3;                 // skip the 3 bytes just written through p2
425
        p2+=3;
426
        /* second pixel pair (y3,y2 with u1,v1): Y' = y*cy */
427
        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
428

    
429
        /* R = Y+ crv*(Cr-128) */
430
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
431
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
432
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
433
        r2=r2>>16 || B[p1++]=r2;
434
        B[p2++]=r2;
435

    
436
        /* B = Y+ cbu*(Cb-128) */
437
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
438
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
439
        r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
440

    
441
        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
442
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
443
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
444
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++]; // gmask
445
        r2=r2>>16 || B[p1++]=r2 || r0 = [i0++];    // 4y (next iteration)
446
                     B[p2++]=r2 || r1.l = w[i2++]; // 2u
447
        r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
448
                     B[p2++]=r3 || r2=[i1++];      // oy
449

    
450
        p1+=3;
451
.L1888: p2+=3;
452

    
453
        l1 = 0;                // turn circular addressing on i1 off again
454

    
455
        (r7:4) = [sp++];
456
        unlink;
457
        rts;
458
DEFUN_END(yuv2rgb24_line)
459

    
460

    
461

    
462
/*
 * Offsets from fp (after `link 0`) of the arguments that uyvytoyv12 and
 * yuyvtoyv12 read from the stack.  The first three arguments (src, ydst,
 * udst) are taken from r0, r1 and r2.
 */
#define ARG_vdst        20
463
#define ARG_width       24
464
#define ARG_height      28
465
#define ARG_lumStride   32
466
#define ARG_chromStride 36
467
#define ARG_srcStride   40
468

    
469
/*
 * uyvytoyv12: split packed UYVY into separate Y, U and V planes, handling
 * two source lines per outer iteration.
 *
 * Inputs:
 *   r0 = src   -> i0 (top line); i1 = src + srcStride (bottom line)
 *   r1 = ydst  -> p0 (top line); p1 = ydst + lumStride (bottom line)
 *   r2 = udst  -> i2
 *   [fp+ARG_vdst] = vdst -> i3
 *   width, height, lumStride, chromStride, srcStride are read from the stack.
 *
 * The outer loop (lc1) runs height >> 1 times and the inner loop (lc0)
 * width >> 2 times.  Each inner iteration takes two 32-bit words from each
 * source line, stores 4 luma bytes per line through p0/p1, and stores one
 * 16-bit word each to the U and V planes.  Luma is the upper byte of every
 * 16-bit half (>>8 then bytepack).  Chroma comes from byteop1p on the top
 * (r1:0) and bottom (r3:2) words, so it combines both lines (byteop1p is the
 * Blackfin quad 8-bit average).
 *
 * The source words are pre-read: 8 bytes per line are loaded before the
 * inner loop and again at the end of each inner iteration.  The loads
 * therefore run 8 bytes past the pixels consumed, which is why m0 carries
 * the -8 correction.
 *
 * r7:4 and p5:4 are saved and restored.
 *
 * NOTE(review): after the inner loop i0/i1 have advanced by 2*width + 8 and
 * are then stepped by m0 = srcStride - 8; p0/p1 have advanced by width and
 * are then stepped by lumStride.  Both reach the next line pair only when
 * srcStride == 2*width and lumStride == width.  The chroma pointers use
 * m1 = chromStride - width/2 and have no such restriction.  Confirm that
 * callers never pass padded source or luma strides.
 */
DEFUN(uyvytoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
470
                         long width, long height,
471
                         long lumStride, long chromStride, long srcStride)):
472
        link 0;
473
        [--sp] = (r7:4,p5:4);
474

    
475
        p0 = r1;       // Y top even
476

    
477
        i2 = r2; // *u
478
        r2 = [fp + ARG_vdst];
479
        i3 = r2; // *v
480

    
481
        r1 = [fp + ARG_srcStride];
482
        r2 = r0 + r1;
483
        r1 += -8;  // i0,i1 is pre read need to correct
484
        m0 = r1;
485

    
486
        i0 = r0;  // uyvy_T even
487
        i1 = r2;  // uyvy_B odd
488

    
489
        p2 = [fp + ARG_lumStride];
490
        p1 = p0 + p2;  // Y bot odd
491

    
492
        p5 = [fp + ARG_width];
493
        p4 = [fp + ARG_height];
494
        r0 = p5;
495
        p4 = p4 >> 1;  // outer count: two lines per iteration
496
        p5 = p5 >> 2;  // inner count: four pixels per iteration
497

    
498
        r2 = [fp + ARG_chromStride];
499
        r0 = r0 >> 1;
500
        r2 = r2 - r0;
501
        m1 = r2;       // chromStride - width/2: step to the next chroma line
502

    
503
        /*   I0,I1 - src input line pointers
504
         *   p0,p1 - luma output line pointers
505
         *   I2    - dstU
506
         *   I3    - dstV
507
         */
508

    
509
        lsetup (0f, 1f) lc1 = p4;   // H/2
510
0:        r0 = [i0++] || r2 = [i1++];
511
          r1 = [i0++] || r3 = [i1++];
512
          r4 = byteop1p(r1:0, r3:2);
513
          r5 = byteop1p(r1:0, r3:2) (r);
514
          lsetup (2f, 3f) lc0 = p5; // W/4
515
2:          r0 = r0 >> 8(v);
516
            r1 = r1 >> 8(v);
517
            r2 = r2 >> 8(v);
518
            r3 = r3 >> 8(v);
519
            r0 = bytepack(r0, r1);
520
            r2 = bytepack(r2, r3)         ||  [p0++] = r0;    // yyyy
521
            r6 = pack(r5.l, r4.l)         ||  [p1++] = r2;    // yyyy
522
            r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
523
            r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
524
            r4 = byteop1p(r1:0, r3:2)     ||  w[i2++] = r6.l; // uu
525
3:          r5 = byteop1p(r1:0, r3:2) (r) ||  w[i3++] = r6.h; // vv
526

    
527
          i0 += m0;
528
          i1 += m0;
529
          i2 += m1;
530
          i3 += m1;
531
          p0 = p0 + p2;
532
1:        p1 = p1 + p2;
533

    
534
        (r7:4,p5:4) = [sp++];
535
        unlink;
536
        rts;
537
DEFUN_END(uyvytoyv12)
538

    
539
/*
 * yuyvtoyv12: split packed YUYV into separate Y, U and V planes, handling
 * two source lines per outer iteration.
 *
 * Inputs:
 *   r0 = src   -> i0 (top line); i1 = src + srcStride (bottom line)
 *   r1 = ydst  -> p0 (top line); p1 = ydst + lumStride (bottom line)
 *   r2 = udst  -> i2
 *   [fp+ARG_vdst] = vdst -> i3
 *   width, height, lumStride, chromStride, srcStride are read from the stack.
 *
 * The outer loop (lc1) runs height >> 1 times and the inner loop (lc0)
 * width >> 2 times.  Each inner iteration takes two 32-bit words from each
 * source line, stores 4 luma bytes per line through p0/p1, and stores one
 * 16-bit word each to the U and V planes.  Luma is the lower byte of every
 * 16-bit half (bytepack of the raw words).  Chroma is what remains after
 * >>8 and is passed through byteop1p on the top (r1:0) and bottom (r3:2)
 * words, so it combines both lines (byteop1p is the Blackfin quad 8-bit
 * average).
 *
 * The source words are pre-read: 8 bytes per line are loaded before the
 * inner loop and again at the end of each inner iteration.  The loads
 * therefore run 8 bytes past the pixels consumed, which is why m0 carries
 * the -8 correction.
 *
 * r7:4 and p5:4 are saved and restored.
 *
 * NOTE(review): after the inner loop i0/i1 have advanced by 2*width + 8 and
 * are then stepped by m0 = srcStride - 8; p0/p1 have advanced by width and
 * are then stepped by lumStride.  Both reach the next line pair only when
 * srcStride == 2*width and lumStride == width.  The chroma pointers use
 * m1 = chromStride - width/2 and have no such restriction.  Confirm that
 * callers never pass padded source or luma strides.
 */
DEFUN(yuyvtoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
540
                         long width, long height,
541
                         long lumStride, long chromStride, long srcStride)):
542
        link 0;
543
        [--sp] = (r7:4,p5:4);
544

    
545
        p0 = r1;       // Y top even
546

    
547
        i2 = r2; // *u
548
        r2 = [fp + ARG_vdst];
549
        i3 = r2; // *v
550

    
551
        r1 = [fp + ARG_srcStride];
552
        r2 = r0 + r1;
553
        r1 += -8;  // i0,i1 is pre read need to correct
554
        m0 = r1;
555

    
556
        i0 = r0;  // yuyv_T even
557
        i1 = r2;  // yuyv_B odd
558

    
559
        p2 = [fp + ARG_lumStride];
560
        p1 = p0 + p2;  // Y bot odd
561

    
562
        p5 = [fp + ARG_width];
563
        p4 = [fp + ARG_height];
564
        r0 = p5;
565
        p4 = p4 >> 1;  // outer count: two lines per iteration
566
        p5 = p5 >> 2;  // inner count: four pixels per iteration
567

    
568
        r2 = [fp + ARG_chromStride];
569
        r0 = r0 >> 1;
570
        r2 = r2 - r0;
571
        m1 = r2;       // chromStride - width/2: step to the next chroma line
572

    
573
        /*   I0,I1 - src input line pointers
574
         *   p0,p1 - luma output line pointers
575
         *   I2    - dstU
576
         *   I3    - dstV
577
         */
578

    
579
        lsetup (0f, 1f) lc1 = p4;   // H/2
580
0:        r0 = [i0++] || r2 = [i1++];
581
          r1 = [i0++] || r3 = [i1++];
582
          r4 = bytepack(r0, r1);
583
          r5 = bytepack(r2, r3);
584
          lsetup (2f, 3f) lc0 = p5; // W/4
585
2:          r0 = r0 >> 8(v) || [p0++] = r4;  // yyyy-even
586
            r1 = r1 >> 8(v) || [p1++] = r5;  // yyyy-odd
587
            r2 = r2 >> 8(v);
588
            r3 = r3 >> 8(v);
589
            r4 = byteop1p(r1:0, r3:2);
590
            r5 = byteop1p(r1:0, r3:2) (r);
591
            r6 = pack(r5.l, r4.l);
592
            r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
593
            r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
594
            r4 = bytepack(r0, r1)         ||  w[i2++] = r6.l; // uu
595
3:          r5 = bytepack(r2, r3)         ||  w[i3++] = r6.h; // vv
596

    
597
          i0 += m0;
598
          i1 += m0;
599
          i2 += m1;
600
          i3 += m1;
601
          p0 = p0 + p2;
602
1:        p1 = p1 + p2;
603

    
604
        (r7:4,p5:4) = [sp++];
605
        unlink;
606
        rts;
607
DEFUN_END(yuyvtoyv12)