Statistics
| Branch: | Revision:

ffmpeg / libswscale / bfin / internal_bfin.S @ e62bd367

History | View | Annotate | Download (19.6 KB)

1
/*
2
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
3
 *                    April 20, 2007
4
 *
5
 * Blackfin video color space converter operations
6
 * convert I420 YV12 to RGB in various formats
7
 *
8
 * This file is part of FFmpeg.
9
 *
10
 * FFmpeg is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU Lesser General Public
12
 * License as published by the Free Software Foundation; either
13
 * version 2.1 of the License, or (at your option) any later version.
14
 *
15
 * FFmpeg is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * Lesser General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU Lesser General Public
21
 * License along with FFmpeg; if not, write to the Free Software
22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
 */
24

    
25

    
26
/*
27
YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
28
and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts.
29

    
30

    
31
The following calculation is used for the conversion:
32

    
33
  r = clipz((y-oy)*cy  + crv*(v-128))
34
  g = clipz((y-oy)*cy  + cgv*(v-128) + cgu*(u-128))
35
  b = clipz((y-oy)*cy  + cbu*(u-128))
36

    
37
y,u,v are prescaled by a factor of 4 i.e. left-shifted to gain precision.
38

    
39

    
40
New factorization to eliminate the truncation error which was
41
occurring due to the byteop3p.
42

    
43

    
44
1) Use byteop16m to subtract quad bytes. This operates on U8 data,
45
 so the offsets need to be renormalized to 8 bits.
46

    
47
2) Scale operands up by a factor of 4 not 8 because Blackfin
48
   multiplies include a shift.
49

    
50
3) Compute into the accumulators cy*yx0, cy*yx1.
51

    
52
4) Compute each of the linear equations:
53
     r = clipz((y - oy) * cy  + crv * (v - 128))
54

    
55
     g = clipz((y - oy) * cy  + cgv * (v - 128) + cgu * (u - 128))
56

    
57
     b = clipz((y - oy) * cy  + cbu * (u - 128))
58

    
59
   Reuse of the accumulators requires that we actually multiply
60
   twice once with addition and the second time with a subtraction.
61

    
62
   Because of this we need to compute the equations in the order R B
63
   then G saving the writes for B in the case of 24/32 bit color
64
   formats.
65

    
66
   API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
67
                      int dW, uint32_t *coeffs);
68

    
69
       A          B
70
       ---        ---
71
       i2 = cb    i3 = cr
72
       i1 = coeff i0 = y
73

    
74
Where coeffs have the following layout in memory.
75

    
76
uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask;
77

    
78
coeffs is a pointer to oy.
79

    
80
The {rgb} masks are only utilized by the 565 packing algorithm. Note the data
81
replication is used to simplify the internal algorithms for the dual Mac
82
architecture of BlackFin.
83

    
84
All routines are exported with _ff_bfin_ as a symbol prefix.
85

    
86
Rough performance gain compared against -O3:
87

    
88
2779809/1484290 187.28%
89

    
90
which translates to ~33 c/pel to ~57 c/pel for the reference vs. 17.5
91
c/pel for the optimized implementations. It is unclear why the reference
92
code shows such a large variation on Blackfin; it presumably has
93
to do with the memory system.
94
*/
95

    
96
/*
 * Section selection.
 *   mL3 - the ordinary .text section.
 *   mL1 - .l1.text when building FDPIC with CONFIG_SRAM (presumably on-chip
 *         L1 instruction SRAM - confirm against the linker script),
 *         otherwise the same as mL3.
 *   MEM - section used by the yuv2rgb line converters below.
 */
#define mL3 .text
#if defined(__FDPIC__) && CONFIG_SRAM
#define mL1 .l1.text
#else
#define mL1 mL3
#endif
#define MEM mL1

/*
 * DEFUN(fname, where, interface)
 *   Opens an exported function: switches to section `where`, declares the
 *   global symbol _ff_bfin_<fname> with type STT_FUNC, aligns it and emits
 *   the symbol name.  The trailing ':' that turns the name into a label is
 *   written by the user right after the macro invocation.
 *   `interface` is the C prototype; it is not used in the expansion and
 *   serves as documentation only.
 */
#define DEFUN(fname,where,interface) \
        .section where;              \
        .global _ff_bfin_ ## fname;  \
        .type _ff_bfin_ ## fname, STT_FUNC; \
        .align 8;                    \
        _ff_bfin_ ## fname

/*
 * DEFUN_END(fname)
 *   Closes a function opened with DEFUN by recording the symbol size
 *   (current location minus the start label).
 */
#define DEFUN_END(fname) \
        .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname


.text

/*
 * Coefficient table geometry (bytes).
 *   COEFF_LEN        - length of the circular coefficient buffer:
 *                      11 words (oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask).
 *   COEFF_REL_CY_OFF - post-modify applied after reading gmask (the last
 *                      word); with wrap-around at COEFF_LEN this leaves the
 *                      index on cy (word 3), skipping oy/oc/zero.
 */
#define COEFF_LEN        11*4
#define COEFF_REL_CY_OFF 4*4

/*
 * FP-relative byte offsets (after `link 0`) of the arguments that the
 * yuv2rgb line converters read from the stack: out, dW and coeffs.
 * Y, U and V are taken from r0, r1 and r2.
 */
#define ARG_OUT   20
#define ARG_W     24
#define ARG_COEFF 28
123

    
124
/*
 * yuv2rgb565_line - convert one line of planar YUV 4:2:0 into packed
 * 16-bit 5:6:5 pixels.
 *
 * Arguments as consumed below:
 *   r0 = Y, r1 = U, r2 = V          copied to i0, i2, i3
 *   [fp+ARG_OUT]   = out            output pointer (p1), one 32-bit store
 *                                   per pixel pair
 *   [fp+ARG_W]     = dW             width in pixels; the hardware loop runs
 *                                   dW>>2 times and produces 4 pixels per pass
 *   [fp+ARG_COEFF] = coeffs         walked through i1 as a circular buffer of
 *                                   COEFF_LEN bytes, in memory order
 *                                   oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
 *
 * Each pass handles pixels 0,1 (y1,y0 with u0,v0) and then pixels 2,3
 * (y3,y2 with u1,v1).  After the gmask of the first pair i1 is stepped by
 * m0 so that it wraps to cy again; after the second pair it wraps to oy.
 *
 * The Y/U/V words for the next pass are fetched at the end of the current
 * one, so the routine reads 4 Y bytes and 2 U/V bytes beyond the last
 * group it converts.
 *
 * r4-r7 are saved and restored; l1 is reset to 0 on exit.
 *
 * NOTE(review): the value computed from crv ends up in the low 5 bits and
 * the one computed from cbu in the top 5 bits, while the bit diagram in the
 * loop labels the low bits "b".  Which channel is which therefore depends
 * on the masks supplied in coeffs - confirm against the caller.
 */
DEFUN(yuv2rgb565_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];

        i0 = r0;
        i2 = r1;
        i3 = r2;

        r0 = [fp+ARG_COEFF];
        i1 = r0;
        b1 = i1;                // circular buffer base = coeffs
        l1 = COEFF_LEN;         // enable wrap-around on i1
        m0 = COEFF_REL_CY_OFF;  // step used to jump from gmask back to cy
        p0 = r3;

        /* prime the pipeline with the first group of samples */
        r0   = [i0++];         // 4Y
        r1.l = w[i2++];        // 2u
        r1.h = w[i3++];        // 2v
        p0 = p0>>2;            // loop count: 4 pixels per pass

        lsetup (.L0565, .L1565) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
                                                              r2=[i1++]; // oy
.L0565:
        /*
        rrrrrrrr gggggggg bbbbbbbb
         5432109876543210
                    bbbbb >>3
              gggggggg    <<3
         rrrrrrrr         <<8
         rrrrrggggggbbbbb
        */
        /* subtract the offsets: y-oy, then (u,v)-oc, as 16-bit lanes */
        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        /* prescale all operands by 4 */
        r5 = r5 << 2 (v);                                                // y1,y0
        r4 = r4 << 2 (v);                                                // y3,y2
        r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
        r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy

        /* ---- pixels 0 and 1 ---- */
        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
                /* undo the chroma term so a1/a0 hold Y' again */
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
        /* clip step (r0 = zero); result in the low byte of each half */
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 8 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        /* i1++m0 wraps past oy/oc/zero so the next load is cy */
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask
        r2 = r2 << 3 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1=[i1++]; // cy

        /* ---- pixels 2 and 3 ---- */
        /* Y' = y*cy */

        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 8 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
        /* fetch the samples for the next pass while finishing this one */
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r0   =  [i0++];        // 4Y
        r2 = r2 << 3 (v)                                   || r1.l = w[i2++];        // 2u
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1.h = w[i3++];        // 2v
.L1565:                                                       r2=[i1++]; // oy

        l1 = 0;                 // switch circular addressing on i1 off again

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuv2rgb565_line)
236

    
237
/*
 * yuv2rgb555_line - convert one line of planar YUV 4:2:0 into packed
 * 16-bit 5:5:5 pixels (top bit unused, see the bit diagram in the loop).
 *
 * Arguments as consumed below:
 *   r0 = Y, r1 = U, r2 = V          copied to i0, i2, i3
 *   [fp+ARG_OUT]   = out            output pointer (p1), one 32-bit store
 *                                   per pixel pair
 *   [fp+ARG_W]     = dW             width in pixels; the hardware loop runs
 *                                   dW>>2 times and produces 4 pixels per pass
 *   [fp+ARG_COEFF] = coeffs         walked through i1 as a circular buffer of
 *                                   COEFF_LEN bytes, in memory order
 *                                   oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
 *
 * The structure is the same as the 5:6:5 converter; only the shift counts
 * that position the components differ (>>3, <<7, <<2).
 *
 * The Y/U/V words for the next pass are fetched at the end of the current
 * one, so the routine reads 4 Y bytes and 2 U/V bytes beyond the last
 * group it converts.
 *
 * r4-r7 are saved and restored; l1 is reset to 0 on exit.
 *
 * NOTE(review): the value computed from crv ends up in the low 5 bits and
 * the one computed from cbu in bits 14..10, while the bit diagram labels
 * the low bits "b".  Which channel is which depends on the masks supplied
 * in coeffs - confirm against the caller.
 */
DEFUN(yuv2rgb555_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];

        i0 = r0;
        i2 = r1;
        i3 = r2;

        r0 = [fp+ARG_COEFF];
        i1 = r0;
        b1 = i1;                // circular buffer base = coeffs
        l1 = COEFF_LEN;         // enable wrap-around on i1
        m0 = COEFF_REL_CY_OFF;  // step used to jump from gmask back to cy
        p0 = r3;

        /* prime the pipeline with the first group of samples */
        r0   = [i0++];         // 4Y
        r1.l = w[i2++];        // 2u
        r1.h = w[i3++];        // 2v
        p0 = p0>>2;            // loop count: 4 pixels per pass

        lsetup (.L0555, .L1555) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
                                                              r2=[i1++]; // oy
.L0555:
        /*
        rrrrrrrr gggggggg bbbbbbbb
         5432109876543210
                    bbbbb >>3
               gggggggg   <<2
          rrrrrrrr        <<7
         xrrrrrgggggbbbbb
        */

        /* subtract the offsets: y-oy, then (u,v)-oc, as 16-bit lanes */
        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        /* prescale all operands by 4 */
        r5 = r5 << 2 (v);                                                // y1,y0
        r4 = r4 << 2 (v);                                                // y3,y2
        r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
        r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy

        /* ---- pixels 0 and 1 ---- */
        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
                /* undo the chroma term so a1/a0 hold Y' again */
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
        /* clip step (r0 = zero); result in the low byte of each half */
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 7 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        /* i1++m0 wraps past oy/oc/zero so the next load is cy */
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask
        r2 = r2 << 2 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1=[i1++]; // cy

        /* ---- pixels 2 and 3 ---- */
        /* Y' = y*cy */

        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 7 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
        /* fetch the samples for the next pass while finishing this one */
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r0=[i0++];     // 4Y
        r2 = r2 << 2 (v)                                   || r1.l=w[i2++];  // 2u
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1.h=w[i3++]; // 2v

.L1555:                                                       r2=[i1++]; // oy

        l1 = 0;                 // switch circular addressing on i1 off again

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuv2rgb555_line)
351

    
352
/*
 * yuv2rgb24_line - convert one line of planar YUV 4:2:0 into packed
 * 24-bit pixels (3 bytes per pixel).
 *
 * Arguments as consumed below:
 *   r0 = Y, r1 = U, r2 = V          copied to i0, i2, i3
 *   [fp+ARG_OUT]   = out            p1 = out (even pixel of a pair),
 *                                   p2 = out+3 (odd pixel of a pair)
 *   [fp+ARG_W]     = dW             width in pixels; the hardware loop runs
 *                                   dW>>2 times and produces 4 pixels
 *                                   (12 bytes) per pass
 *   [fp+ARG_COEFF] = coeffs         walked through i1 as a circular buffer of
 *                                   COEFF_LEN bytes, in memory order
 *                                   oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
 *
 * For every pixel pair the bytes are written in the order: crv component
 * ("R"), green, cbu component ("B").  The cbu result is kept in r3 until
 * green has been computed, because the accumulators must be evaluated in
 * the order R, B, G.  After a pair, p1 and p2 skip the 3 bytes that belong
 * to the other pixel of the pair.
 *
 * The mask words (rmask/bmask/gmask) are still loaded into r5 to keep i1
 * in step with the table, but r5 is not read afterwards in this routine.
 *
 * The Y/U/V words for the next pass are fetched at the end of the current
 * one, so the routine reads 4 Y bytes and 2 U/V bytes beyond the last
 * group it converts.
 *
 * r4-r7 are saved and restored; l1 is reset to 0 on exit.
 */
DEFUN(yuv2rgb24_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];
        p2 = p1;
        p2 += 3;                // p2 addresses the second pixel of each pair

        i0 = r0;
        i2 = r1;
        i3 = r2;

        r0 = [fp+ARG_COEFF]; // coeff buffer
        i1 = r0;
        b1 = i1;                // circular buffer base = coeffs
        l1 = COEFF_LEN;         // enable wrap-around on i1
        m0 = COEFF_REL_CY_OFF;  // step used to jump from gmask back to cy
        p0 = r3;

        /* prime the pipeline with the first group of samples */
        r0   = [i0++];         // 4Y
        r1.l = w[i2++];        // 2u
        r1.h = w[i3++];        // 2v
        p0 = p0>>2;            // loop count: 4 pixels per pass

        lsetup (.L0888, .L1888) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
                                                              r2=[i1++]; // oy
.L0888:
        /* subtract the offsets: y-oy, then (u,v)-oc, as 16-bit lanes */
        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        /* prescale all operands by 4 */
        r5 = r5 << 2 (v);               // y1,y0
        r4 = r4 << 2 (v);               // y3,y2
        r6 = r6 << 2 (v) || r0=[i1++];  // u1,u0, r0=zero
        r7 = r7 << 2 (v) || r1=[i1++];  // v1,v0  r1=cy

        /* ---- pixels 0 and 1 ---- */
        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
                /* undo the chroma term so a1/a0 hold Y' again */
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
        /* clip step (r0 = zero); result in the low byte of each half */
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        /* low half -> pixel 0 (p1), high half -> pixel 1 (p2) */
        r2=r2>>16 || B[p1++]=r2;
                     B[p2++]=r2;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
        /* kept in r3; written after green */
        r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        /* i1++m0 wraps past oy/oc/zero so the next load is cy */
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask, oy,cy,zero

        r2=r2>>16 || B[p1++]=r2;
                     B[p2++]=r2;

        r3=r3>>16 || B[p1++]=r3;
                     B[p2++]=r3                            || r1=[i1++]; // cy

        /* step over the bytes of the other pixel in the pair */
        p1+=3;
        p2+=3;

        /* ---- pixels 2 and 3 ---- */
        /* Y' = y*cy */
        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2=r2>>16 || B[p1++]=r2;
        B[p2++]=r2;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
        r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++]; // gmask
        /* write green/blue while fetching the samples for the next pass */
        r2=r2>>16 || B[p1++]=r2 || r0 = [i0++];    // 4y
                     B[p2++]=r2 || r1.l = w[i2++]; // 2u
        r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
                     B[p2++]=r3 || r2=[i1++];      // oy

        p1+=3;
.L1888: p2+=3;

        l1 = 0;                 // switch circular addressing on i1 off again

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuv2rgb24_line)
458

    
459

    
460

    
461
/*
 * FP-relative byte offsets (after `link 0`) of the stack-passed arguments
 * of the packed-4:2:2 to planar converters.  src, ydst and udst are taken
 * from r0, r1 and r2.
 */
#define ARG_vdst        20
#define ARG_width       24
#define ARG_height      28
#define ARG_lumStride   32
#define ARG_chromStride 36
#define ARG_srcStride   40
467

    
468
/*
 * uyvytoyv12 - convert packed UYVY 4:2:2 into planar Y, U and V with
 * vertically subsampled chroma.
 *
 * Arguments as consumed below:
 *   r0 = src, r1 = ydst, r2 = udst
 *   [fp+ARG_vdst]        = vdst
 *   [fp+ARG_width]       = width in pixels (inner loop runs width>>2 times,
 *                          4 pixels per pass; width is expected to be a
 *                          multiple of 4)
 *   [fp+ARG_height]      = height in lines (outer loop runs height>>1 times,
 *                          two lines per pass)
 *   [fp+ARG_lumStride]   = byte stride of the Y plane
 *   [fp+ARG_chromStride] = byte stride of the U and V planes
 *   [fp+ARG_srcStride]   = byte stride of the packed source
 *
 * Two source lines are processed together: luma of both lines is copied,
 * chroma of the two lines is combined with byteop1p (byte-wise average)
 * and written once.
 *
 * Pointer bookkeeping per line pair:
 *   i0,i1  consume 2*width bytes plus an 8-byte pre-read, and must land on
 *          the line two rows further down:
 *              m0 = 2*srcStride - 2*width - 8
 *   p0,p1  advance by width bytes and must move two rows:
 *              p2 = 2*lumStride - width
 *   i2,i3  advance by width/2 bytes and move one chroma row:
 *              m1 = chromStride - width/2
 * For tightly packed buffers (srcStride == 2*width, lumStride == width)
 * these equal srcStride-8 and lumStride, but the general form is also
 * correct when the strides include padding.
 *
 * The inner loop fetches the next 8 source bytes of each line before it
 * terminates, so 8 bytes past the end of each processed line are read.
 *
 * r4-r7 and p4-p5 are saved and restored.
 *
 * NOTE(review): the 32-bit loads and the byteop instructions presumably
 * need src to be 4-byte aligned - confirm against the callers.
 */
DEFUN(uyvytoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                         long width, long height,
                         long lumStride, long chromStride, long srcStride)):
        link 0;
        [--sp] = (r7:4,p5:4);

        p0 = r1;       // Y top even

        i2 = r2; // *u
        r2 = [fp + ARG_vdst];
        i3 = r2; // *v

        r1 = [fp + ARG_srcStride];
        r2 = r0 + r1;
        i0 = r0;  // uyvy_T even
        i1 = r2;  // uyvy_B odd

        p5 = [fp + ARG_width];
        r3 = p5;                // r3 = width, scratch until the loops start

        /* m0 = 2*srcStride - 2*width - 8 (8 = pre-read of i0,i1) */
        r1 = r1 + r1;
        r1 = r1 - r3;
        r1 = r1 - r3;
        r1 += -8;
        m0 = r1;

        p2 = [fp + ARG_lumStride];
        p1 = p0 + p2;  // Y bot odd

        /* p2 = 2*lumStride - width: skip to the line after next */
        r0 = p2;
        r0 = r0 + r0;
        r0 = r0 - r3;
        p2 = r0;

        p4 = [fp + ARG_height];
        p4 = p4 >> 1;           // line pairs
        p5 = p5 >> 2;           // groups of 4 pixels

        /* m1 = chromStride - width/2 */
        r2 = [fp + ARG_chromStride];
        r3 = r3 >> 1;
        r2 = r2 - r3;
        m1 = r2;

        /*   I0,I1 - src input line pointers
         *   p0,p1 - luma output line pointers
         *   I2    - dstU
         *   I3    - dstV
         */

        lsetup (0f, 1f) lc1 = p4;   // H/2
        /* pre-read the first 4 pixels of both lines */
0:        r0 = [i0++] || r2 = [i1++];
          r1 = [i0++] || r3 = [i1++];
          /* combine top and bottom line byte-wise (chroma used below) */
          r4 = byteop1p(r1:0, r3:2);
          r5 = byteop1p(r1:0, r3:2) (r);
          lsetup (2f, 3f) lc0 = p5; // W/4
          /* luma sits in the high byte of every 16-bit pair */
2:          r0 = r0 >> 8(v);
            r1 = r1 >> 8(v);
            r2 = r2 >> 8(v);
            r3 = r3 >> 8(v);
            r0 = bytepack(r0, r1);
            r2 = bytepack(r2, r3)         ||  [p0++] = r0;    // yyyy
            /* gather the combined chroma bytes: r6 = u-halves, r7 = v-halves */
            r6 = pack(r5.l, r4.l)         ||  [p1++] = r2;    // yyyy
            r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
            r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
            r4 = byteop1p(r1:0, r3:2)     ||  w[i2++] = r6.l; // uu
3:          r5 = byteop1p(r1:0, r3:2) (r) ||  w[i3++] = r6.h; // vv

          /* move every pointer to the start of the next line pair */
          i0 += m0;
          i1 += m0;
          i2 += m1;
          i3 += m1;
          p0 = p0 + p2;
1:        p1 = p1 + p2;

        (r7:4,p5:4) = [sp++];
        unlink;
        rts;
DEFUN_END(uyvytoyv12)
537

    
538
/*
 * yuyvtoyv12 - convert packed YUYV 4:2:2 into planar Y, U and V with
 * vertically subsampled chroma.
 *
 * Arguments as consumed below:
 *   r0 = src, r1 = ydst, r2 = udst
 *   [fp+ARG_vdst]        = vdst
 *   [fp+ARG_width]       = width in pixels (inner loop runs width>>2 times,
 *                          4 pixels per pass; width is expected to be a
 *                          multiple of 4)
 *   [fp+ARG_height]      = height in lines (outer loop runs height>>1 times,
 *                          two lines per pass)
 *   [fp+ARG_lumStride]   = byte stride of the Y plane
 *   [fp+ARG_chromStride] = byte stride of the U and V planes
 *   [fp+ARG_srcStride]   = byte stride of the packed source
 *
 * Two source lines are processed together: luma (low byte of every 16-bit
 * pair) of both lines is copied, chroma (high byte) of the two lines is
 * combined with byteop1p (byte-wise average) and written once.
 *
 * Pointer bookkeeping per line pair:
 *   i0,i1  consume 2*width bytes plus an 8-byte pre-read, and must land on
 *          the line two rows further down:
 *              m0 = 2*srcStride - 2*width - 8
 *   p0,p1  advance by width bytes and must move two rows:
 *              p2 = 2*lumStride - width
 *   i2,i3  advance by width/2 bytes and move one chroma row:
 *              m1 = chromStride - width/2
 * For tightly packed buffers (srcStride == 2*width, lumStride == width)
 * these equal srcStride-8 and lumStride, but the general form is also
 * correct when the strides include padding.
 *
 * The inner loop fetches the next 8 source bytes of each line before it
 * terminates, so 8 bytes past the end of each processed line are read.
 *
 * r4-r7 and p4-p5 are saved and restored.
 *
 * NOTE(review): the 32-bit loads and the byteop instructions presumably
 * need src to be 4-byte aligned - confirm against the callers.
 */
DEFUN(yuyvtoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                         long width, long height,
                         long lumStride, long chromStride, long srcStride)):
        link 0;
        [--sp] = (r7:4,p5:4);

        p0 = r1;       // Y top even

        i2 = r2; // *u
        r2 = [fp + ARG_vdst];
        i3 = r2; // *v

        r1 = [fp + ARG_srcStride];
        r2 = r0 + r1;
        i0 = r0;  // yuyv_T even
        i1 = r2;  // yuyv_B odd

        p5 = [fp + ARG_width];
        r3 = p5;                // r3 = width, scratch until the loops start

        /* m0 = 2*srcStride - 2*width - 8 (8 = pre-read of i0,i1) */
        r1 = r1 + r1;
        r1 = r1 - r3;
        r1 = r1 - r3;
        r1 += -8;
        m0 = r1;

        p2 = [fp + ARG_lumStride];
        p1 = p0 + p2;  // Y bot odd

        /* p2 = 2*lumStride - width: skip to the line after next */
        r0 = p2;
        r0 = r0 + r0;
        r0 = r0 - r3;
        p2 = r0;

        p4 = [fp + ARG_height];
        p4 = p4 >> 1;           // line pairs
        p5 = p5 >> 2;           // groups of 4 pixels

        /* m1 = chromStride - width/2 */
        r2 = [fp + ARG_chromStride];
        r3 = r3 >> 1;
        r2 = r2 - r3;
        m1 = r2;

        /*   I0,I1 - src input line pointers
         *   p0,p1 - luma output line pointers
         *   I2    - dstU
         *   I3    - dstV
         */

        lsetup (0f, 1f) lc1 = p4;   // H/2
        /* pre-read the first 4 pixels of both lines */
0:        r0 = [i0++] || r2 = [i1++];
          r1 = [i0++] || r3 = [i1++];
          /* luma is the low byte of every 16-bit pair */
          r4 = bytepack(r0, r1);
          r5 = bytepack(r2, r3);
          lsetup (2f, 3f) lc0 = p5; // W/4
          /* shift chroma down to the low bytes while storing the luma */
2:          r0 = r0 >> 8(v) || [p0++] = r4;  // yyyy-even
            r1 = r1 >> 8(v) || [p1++] = r5;  // yyyy-odd
            r2 = r2 >> 8(v);
            r3 = r3 >> 8(v);
            /* combine top and bottom line chroma byte-wise */
            r4 = byteop1p(r1:0, r3:2);
            r5 = byteop1p(r1:0, r3:2) (r);
            /* gather: r6 = u-halves, r7 = v-halves */
            r6 = pack(r5.l, r4.l);
            r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
            r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
            r4 = bytepack(r0, r1)         ||  w[i2++] = r6.l; // uu
3:          r5 = bytepack(r2, r3)         ||  w[i3++] = r6.h; // vv

          /* move every pointer to the start of the next line pair */
          i0 += m0;
          i1 += m0;
          i2 += m1;
          i3 += m1;
          p0 = p0 + p2;
1:        p1 = p1 + p2;

        (r7:4,p5:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuyvtoyv12)