/*
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
 *                    April 20, 2007
 *
 * Blackfin video color space converter operations
 * convert I420 YV12 to RGB in various formats
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */


/*
YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits, packed into shorts.

The following calculation is used for the conversion:

  r = clipz((y - oy) * cy  + crv * (v - 128))
  g = clipz((y - oy) * cy  + cgv * (v - 128) + cgu * (u - 128))
  b = clipz((y - oy) * cy  + cbu * (u - 128))

y, u, v are prescaled by a factor of 4, i.e. left-shifted to gain precision.

New factorization to eliminate the truncation error which was
occurring due to the byteop3p.

1) Use byteop16m to subtract quad bytes. It operates on U8 data, so the
   offsets need to be renormalized to 8 bits.

2) Scale operands up by a factor of 4, not 8, because Blackfin
   multiplies include a shift.

3) Compute cy*yx0 and cy*yx1 into the accumulators.

4) Compute each of the linear equations:
     r = clipz((y - oy) * cy  + crv * (v - 128))

     g = clipz((y - oy) * cy  + cgv * (v - 128) + cgu * (u - 128))

     b = clipz((y - oy) * cy  + cbu * (u - 128))

   Reusing the accumulators requires that we actually multiply twice,
   once with an addition and a second time with a subtraction.

   Because of this we need to compute the equations in the order R, B,
   then G, saving the writes for B in the case of 24/32 bit color
   formats.

   API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
                      int dW, uint32_t *coeffs);

       A          B
       ---        ---
       i2 = cb    i3 = cr
       i1 = coeff i0 = y

The coeffs have the following layout in memory:

uint32_t oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv, gmask;

coeffs is a pointer to oy.

The {rgb} masks are only utilized by the 565 and 555 packing algorithms.
Note that data replication is used to simplify the internal algorithms for
the dual MAC architecture of Blackfin.

All routines are exported with _ff_bfin_ as a symbol prefix.

Rough performance gain compared against -O3:

2779809/1484290 187.28%

which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
c/pel for the optimized implementations. Not sure why there is such a
huge variation in the reference code on Blackfin; I guess it must have
to do with the memory system.
*/

/*
 * Section selection.
 *   mL3 - the ordinary .text section.
 *   mL1 - .l1.text when building for FDPIC, otherwise the same as mL3
 *         (presumably .l1.text maps to on-chip L1 instruction memory;
 *         confirm against the linker script).
 *   MEM - section used by the yuv2rgb line converters in this file.
 */
#define mL3 .text
#ifdef __FDPIC__
#define mL1 .l1.text
#else
#define mL1 mL3
#endif
#define MEM mL1

/*
 * DEFUN(fname, where, interface)
 *   Switches to section <where> and opens the global function symbol
 *   _ff_bfin_<fname>. The expansion ends with the bare symbol name, so
 *   the use site appends a colon to turn it into the entry label.
 *   <interface> is not used in the expansion; it only carries the C
 *   prototype as documentation.
 */
#define DEFUN(fname,where,interface) \
        .section where;              \
        .global _ff_bfin_ ## fname;  \
        .type _ff_bfin_ ## fname, STT_FUNC; \
        .align 8;                    \
        _ff_bfin_ ## fname

/*
 * DEFUN_END(fname)
 *   Records the size of _ff_bfin_<fname> as the distance from its entry
 *   label to the current location.
 */
#define DEFUN_END(fname) \
        .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname


.text

/* Size in bytes of the coefficient table (11 words of 4 bytes); loaded
   into l1 so that i1 wraps around the table. */
#define COEFF_LEN        11*4
/* Extra byte step (4 words) loaded into m0; used once per loop pass on
   the load annotated "gmask" so that the next load from i1 is annotated
   "cy" again. */
#define COEFF_REL_CY_OFF 4*4

/* Offsets from fp (after "link 0") of the arguments that the yuv2rgb
   line routines fetch from the stack: out, dW and coeffs. */
#define ARG_OUT   20
#define ARG_W     24
#define ARG_COEFF 28

/*
 * ff_bfin_yuv2rgb565_line
 *
 * Converts one line of planar YUV into packed 16-bit pixels using the
 * 5/6/5 bit layout sketched at the top of the loop body.
 *
 * In:   r0 = Y, r1 = U, r2 = V (moved to i0, i2, i3);
 *       out, dW and coeffs are read from fp+ARG_OUT, fp+ARG_W and
 *       fp+ARG_COEFF.
 * Loop: hardware loop 0 runs dW>>2 times. Each pass consumes 4 bytes of
 *       Y and 2 bytes each of U and V, and stores two 32-bit words
 *       through p1 (one word per pair of Y samples).
 * Coefficients: i1 walks the table as a circular buffer (b1 = coeffs,
 *       l1 = COEFF_LEN). The words are consumed in the order given by
 *       the per-instruction comments (oy, oc, zero, cy, crv, rmask, cbu,
 *       bmask, cgu, cgv, gmask).
 * Regs: r7:4 are saved and restored. r0-r3, p0, p1, i0-i3, b1, m0, a0,
 *       a1 and the loop 0 registers are overwritten. l1 is reset to 0
 *       before returning.
 *
 * The Y/U/V loads are issued before the loop and again at the end of
 * every pass, so one group beyond the last converted one is read.
 *
 * The term labelled R is shifted right by 3 and the term labelled B is
 * shifted left by 8; which colour really lands in which bit field thus
 * depends on the coefficient and mask table supplied by the caller,
 * which is not visible in this file.
 *
 * NOTE(review): lc0 is loaded with dW>>2 without a zero check; confirm
 * in the Blackfin reference what LSETUP does with a zero count before
 * calling this with dW < 4.
 */
DEFUN(yuv2rgb565_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];

        i0 = r0;
        i2 = r1;
        i3 = r2;

        r0 = [fp+ARG_COEFF];
        i1 = r0;
        b1 = i1;                // circular buffer base = coeffs
        l1 = COEFF_LEN;         // circular buffer length
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        r0   = [i0++];         // 4Y
        r1.l = w[i2++];        // 2u
        r1.h = w[i3++];        // 2v
        p0 = p0>>2;            // one pass per 4 Y samples

        lsetup (.L0565, .L1565) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
                                                              r2=[i1++]; // oy
.L0565:
        /*
        rrrrrrrr gggggggg bbbbbbbb
         5432109876543210
                    bbbbb >>3
              gggggggg    <<3
         rrrrrrrr         <<8
         rrrrrggggggbbbbb
        */
        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);                                                // y1,y0
        r4 = r4 << 2 (v);                                                // y3,y2
        r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
        r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy

        /* ---- first pair: y1,y0 with the low halves of r6/r7 ---- */

        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        /* take the chroma term out again so a1/a0 hold Y' for the next component */
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 8 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        /* the extra m0 step makes the next i1 load fetch cy again */
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask
        r2 = r2 << 3 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1=[i1++]; // cy

        /* ---- second pair: y3,y2 with the high halves of r6/r7 ---- */

        /* Y' = y*cy */

        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 8 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
        /* fetch the input for the next pass while the last pixel pair is packed */
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r0   =  [i0++];        // 4Y
        r2 = r2 << 3 (v)                                   || r1.l = w[i2++];        // 2u
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1.h = w[i3++];        // 2v
.L1565:                                                       r2=[i1++]; // oy

        l1 = 0;                 // leave i1 in linear addressing mode

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuv2rgb565_line)

/*
 * ff_bfin_yuv2rgb555_line
 *
 * Converts one line of planar YUV into packed 16-bit pixels using the
 * 5/5/5 bit layout sketched at the top of the loop body. The structure
 * matches the 565 converter; only the shift counts used while packing
 * differ (>>3, <<7, <<2).
 *
 * In:   r0 = Y, r1 = U, r2 = V (moved to i0, i2, i3);
 *       out, dW and coeffs are read from fp+ARG_OUT, fp+ARG_W and
 *       fp+ARG_COEFF.
 * Loop: hardware loop 0 runs dW>>2 times. Each pass consumes 4 bytes of
 *       Y and 2 bytes each of U and V, and stores two 32-bit words
 *       through p1 (one word per pair of Y samples).
 * Coefficients: i1 walks the table as a circular buffer (b1 = coeffs,
 *       l1 = COEFF_LEN). The words are consumed in the order given by
 *       the per-instruction comments (oy, oc, zero, cy, crv, rmask, cbu,
 *       bmask, cgu, cgv, gmask).
 * Regs: r7:4 are saved and restored. r0-r3, p0, p1, i0-i3, b1, m0, a0,
 *       a1 and the loop 0 registers are overwritten. l1 is reset to 0
 *       before returning.
 *
 * The Y/U/V loads are issued before the loop and again at the end of
 * every pass, so one group beyond the last converted one is read.
 *
 * The term labelled R is shifted right by 3 and the term labelled B is
 * shifted left by 7; which colour really lands in which bit field thus
 * depends on the coefficient and mask table supplied by the caller,
 * which is not visible in this file.
 *
 * NOTE(review): lc0 is loaded with dW>>2 without a zero check; confirm
 * in the Blackfin reference what LSETUP does with a zero count before
 * calling this with dW < 4.
 */
DEFUN(yuv2rgb555_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];

        i0 = r0;
        i2 = r1;
        i3 = r2;

        r0 = [fp+ARG_COEFF];
        i1 = r0;
        b1 = i1;                // circular buffer base = coeffs
        l1 = COEFF_LEN;         // circular buffer length
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        r0   = [i0++];         // 4Y
        r1.l = w[i2++];        // 2u
        r1.h = w[i3++];        // 2v
        p0 = p0>>2;            // one pass per 4 Y samples

        lsetup (.L0555, .L1555) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
                                                              r2=[i1++]; // oy
.L0555:
        /*
        rrrrrrrr gggggggg bbbbbbbb
         5432109876543210
                    bbbbb >>3
               gggggggg   <<2
          rrrrrrrr        <<7
         xrrrrrgggggbbbbb
        */

        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);                                                // y1,y0
        r4 = r4 << 2 (v);                                                // y3,y2
        r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
        r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy

        /* ---- first pair: y1,y0 with the low halves of r6/r7 ---- */

        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        /* take the chroma term out again so a1/a0 hold Y' for the next component */
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 7 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        /* the extra m0 step makes the next i1 load fetch cy again */
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask
        r2 = r2 << 2 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1=[i1++]; // cy

        /* ---- second pair: y3,y2 with the high halves of r6/r7 ---- */

        /* Y' = y*cy */

        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 7 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
        /* fetch the input for the next pass while the last pixel pair is packed */
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r0=[i0++];     // 4Y
        r2 = r2 << 2 (v)                                   || r1.l=w[i2++];  // 2u
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1.h=w[i3++]; // 2v

.L1555:                                                       r2=[i1++]; // oy

        l1 = 0;                 // leave i1 in linear addressing mode

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuv2rgb555_line)

/*
 * ff_bfin_yuv2rgb24_line
 *
 * Converts one line of planar YUV into packed pixels of three bytes
 * each.
 *
 * In:   r0 = Y, r1 = U, r2 = V (moved to i0, i2, i3);
 *       out, dW and coeffs are read from fp+ARG_OUT, fp+ARG_W and
 *       fp+ARG_COEFF.
 * Out:  p1 starts at out and p2 at out+3. Every component is computed
 *       for two pixels at once; the low byte goes through p1 and, after
 *       a shift by 16, the other one through p2. After three byte stores
 *       both pointers skip 3 bytes, so one pass writes 12 bytes.
 *       Per pixel the bytes are stored in the order: term labelled R,
 *       then G, then the term labelled B (held in r3 until G is done).
 * Loop: hardware loop 0 runs dW>>2 times. Each pass consumes 4 bytes of
 *       Y and 2 bytes each of U and V.
 * Coefficients: i1 walks the table as a circular buffer (b1 = coeffs,
 *       l1 = COEFF_LEN). The mask words are still loaded into r5 but
 *       the loaded values are not used here; the loads keep i1 in step
 *       with the table layout.
 * Regs: r7:4 are saved and restored. r0-r3, p0-p2, i0-i3, b1, m0, a0,
 *       a1 and the loop 0 registers are overwritten. l1 is reset to 0
 *       before returning.
 *
 * The Y/U/V loads are issued before the loop and again at the end of
 * every pass, so one group beyond the last converted one is read.
 *
 * NOTE(review): lc0 is loaded with dW>>2 without a zero check; confirm
 * in the Blackfin reference what LSETUP does with a zero count before
 * calling this with dW < 4.
 */
DEFUN(yuv2rgb24_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];
        p2 = p1;
        p2 += 3;               // p2 addresses the pixel following the one at p1

        i0 = r0;
        i2 = r1;
        i3 = r2;

        r0 = [fp+ARG_COEFF]; // coeff buffer
        i1 = r0;
        b1 = i1;                // circular buffer base = coeffs
        l1 = COEFF_LEN;         // circular buffer length
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        r0   = [i0++];         // 4Y
        r1.l = w[i2++];        // 2u
        r1.h = w[i3++];        // 2v
        p0 = p0>>2;            // one pass per 4 Y samples

        lsetup (.L0888, .L1888) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
                                                              r2=[i1++]; // oy
.L0888:
        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);               // y1,y0
        r4 = r4 << 2 (v);               // y3,y2
        r6 = r6 << 2 (v) || r0=[i1++];  // u1,u0, r0=zero
        r7 = r7 << 2 (v) || r1=[i1++];  // v1,v0  r1=cy

        /* ---- first pair: y1,y0 with the low halves of r6/r7 ---- */

        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        /* take the chroma term out again so a1/a0 hold Y' for the next component */
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2=r2>>16 || B[p1++]=r2;
                     B[p2++]=r2;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
        /* result kept in r3; it is stored after G */
        r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        /* the extra m0 step makes the next i1 load fetch cy again */
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask, oy,cy,zero

        r2=r2>>16 || B[p1++]=r2;
                     B[p2++]=r2;

        r3=r3>>16 || B[p1++]=r3;
                     B[p2++]=r3                            || r1=[i1++]; // cy

        /* skip the pixel written through the other pointer */
        p1+=3;
        p2+=3;

        /* ---- second pair: y3,y2 with the high halves of r6/r7 ---- */

        /* Y' = y*cy */
        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2=r2>>16 || B[p1++]=r2;
        B[p2++]=r2;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
        r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++]; // gmask
        /* fetch the input for the next pass while the last bytes are stored */
        r2=r2>>16 || B[p1++]=r2 || r0 = [i0++];    // 4y
                     B[p2++]=r2 || r1.l = w[i2++]; // 2u
        r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
                     B[p2++]=r3 || r2=[i1++];      // oy

        p1+=3;
.L1888: p2+=3;

        l1 = 0;                 // leave i1 in linear addressing mode

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuv2rgb24_line)



/* Offsets from fp (after "link 0") of the arguments that the
   packed-to-planar routines fetch from the stack; src, ydst and udst
   arrive in r0, r1 and r2. */
#define ARG_vdst        20
#define ARG_width       24
#define ARG_height      28
#define ARG_lumStride   32
#define ARG_chromStride 36
#define ARG_srcStride   40

/*
 * ff_bfin_uyvytoyv12
 *
 * Splits packed UYVY input into separate Y, U and V planes, handling
 * two source lines per outer pass. Luma of both lines is written out
 * (p0 = upper line, p1 = lower line); chroma of the two lines is
 * combined with byteop1p (byte-wise average per the Blackfin reference;
 * confirm) into a single U and a single V line.
 *
 * In:   r0 = src, r1 = ydst, r2 = udst; vdst, width, height, lumStride,
 *       chromStride and srcStride are read from the frame at fp+ARG_*.
 * Loop: outer hardware loop 1 runs height>>1 times, inner hardware
 *       loop 0 runs width>>2 times; remainders are not processed.
 *       One inner pass reads 8 bytes from each source line and writes
 *       4 luma bytes per line plus 2 bytes each of U and V.
 * Row advance after each outer pass:
 *       i0/i1 += 2*srcStride - 2*width - 8  (the inner loop moved them
 *                by 2*width plus the 8 pre-read bytes)
 *       p0/p1 += 2*lumStride - width        (the inner loop moved them
 *                by width)
 *       i2/i3 += chromStride - width/2
 *       Earlier revisions added srcStride-8 and lumStride, which only
 *       reaches the next row pair when srcStride == 2*width and
 *       lumStride == width; the values are identical in that case.
 * Regs: r7:4 and p5:4 are saved and restored. r0-r3, p0-p2, i0-i3, m0,
 *       m1 and both hardware loop register sets are overwritten.
 *
 * The source is pre-read before the inner loop and again at the end of
 * every inner pass, so 8 bytes beyond the converted span of each source
 * line are read.
 *
 * NOTE(review): l0-l3 are not written here, so i0-i3 only behave as
 * plain pointers if the caller leaves those registers at 0; confirm
 * against the ABI. Also confirm what LSETUP does with a zero count
 * before calling this with height < 2 or width < 4.
 */
DEFUN(uyvytoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                         long width, long height,
                         long lumStride, long chromStride, long srcStride)):
        link 0;
        [--sp] = (r7:4,p5:4);

        p0 = r1;       // Y top even

        i2 = r2; // *u
        r2 = [fp + ARG_vdst];
        i3 = r2; // *v

        r1 = [fp + ARG_srcStride];
        r2 = r0 + r1;

        i0 = r0;  // uyvy_T even
        i1 = r2;  // uyvy_B odd

        p2 = [fp + ARG_lumStride];
        p1 = p0 + p2;  // Y bot odd

        p5 = [fp + ARG_width];
        p4 = [fp + ARG_height];
        r0 = p5;
        p4 = p4 >> 1;
        p5 = p5 >> 2;

        r2 = r0 << 1;  // 2*width = source bytes consumed per line
        r1 = r1 << 1;  // 2*srcStride
        r1 = r1 - r2;  // srcStride + (srcStride - 2*width)
        r1 += -8;      // i0,i1 are pre-read, need to correct
        m0 = r1;

        r3 = p2;
        r3 = r3 << 1;  // 2*lumStride
        r3 = r3 - r0;  // minus the width bytes written by the inner loop
        p2 = r3;

        r2 = [fp + ARG_chromStride];
        r0 = r0 >> 1;
        r2 = r2 - r0;  // chromStride - width/2
        m1 = r2;

        /*   I0,I1 - src input line pointers
         *   p0,p1 - luma output line pointers
         *   I2    - dstU
         *   I3    - dstV
         */

        lsetup (0f, 1f) lc1 = p4;   // H/2
0:        r0 = [i0++] || r2 = [i1++];
          r1 = [i0++] || r3 = [i1++];
          r4 = byteop1p(r1:0, r3:2);
          r5 = byteop1p(r1:0, r3:2) (r);
          lsetup (2f, 3f) lc0 = p5; // W/4
2:          r0 = r0 >> 8(v);
            r1 = r1 >> 8(v);
            r2 = r2 >> 8(v);
            r3 = r3 >> 8(v);
            r0 = bytepack(r0, r1);
            r2 = bytepack(r2, r3)         ||  [p0++] = r0;    // yyyy
            r6 = pack(r5.l, r4.l)         ||  [p1++] = r2;    // yyyy
            r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
            r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
            r4 = byteop1p(r1:0, r3:2)     ||  w[i2++] = r6.l; // uu
3:          r5 = byteop1p(r1:0, r3:2) (r) ||  w[i3++] = r6.h; // vv

          /* step every pointer to the start of the next row pair */
          i0 += m0;
          i1 += m0;
          i2 += m1;
          i3 += m1;
          p0 = p0 + p2;
1:        p1 = p1 + p2;

        (r7:4,p5:4) = [sp++];
        unlink;
        rts;
DEFUN_END(uyvytoyv12)

/*
 * ff_bfin_yuyvtoyv12
 *
 * Splits packed YUYV input into separate Y, U and V planes, handling
 * two source lines per outer pass. Luma of both lines is written out
 * (p0 = upper line, p1 = lower line); chroma of the two lines is
 * combined with byteop1p (byte-wise average per the Blackfin reference;
 * confirm) into a single U and a single V line.
 *
 * In:   r0 = src, r1 = ydst, r2 = udst; vdst, width, height, lumStride,
 *       chromStride and srcStride are read from the frame at fp+ARG_*.
 * Loop: outer hardware loop 1 runs height>>1 times, inner hardware
 *       loop 0 runs width>>2 times; remainders are not processed.
 *       One inner pass reads 8 bytes from each source line and writes
 *       4 luma bytes per line plus 2 bytes each of U and V.
 * Row advance after each outer pass:
 *       i0/i1 += 2*srcStride - 2*width - 8  (the inner loop moved them
 *                by 2*width plus the 8 pre-read bytes)
 *       p0/p1 += 2*lumStride - width        (the inner loop moved them
 *                by width)
 *       i2/i3 += chromStride - width/2
 *       Earlier revisions added srcStride-8 and lumStride, which only
 *       reaches the next row pair when srcStride == 2*width and
 *       lumStride == width; the values are identical in that case.
 * Regs: r7:4 and p5:4 are saved and restored. r0-r3, p0-p2, i0-i3, m0,
 *       m1 and both hardware loop register sets are overwritten.
 *
 * The source is pre-read before the inner loop and again at the end of
 * every inner pass, so 8 bytes beyond the converted span of each source
 * line are read.
 *
 * NOTE(review): l0-l3 are not written here, so i0-i3 only behave as
 * plain pointers if the caller leaves those registers at 0; confirm
 * against the ABI. Also confirm what LSETUP does with a zero count
 * before calling this with height < 2 or width < 4.
 */
DEFUN(yuyvtoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                         long width, long height,
                         long lumStride, long chromStride, long srcStride)):
        link 0;
        [--sp] = (r7:4,p5:4);

        p0 = r1;       // Y top even

        i2 = r2; // *u
        r2 = [fp + ARG_vdst];
        i3 = r2; // *v

        r1 = [fp + ARG_srcStride];
        r2 = r0 + r1;

        i0 = r0;  // yuyv_T even
        i1 = r2;  // yuyv_B odd

        p2 = [fp + ARG_lumStride];
        p1 = p0 + p2;  // Y bot odd

        p5 = [fp + ARG_width];
        p4 = [fp + ARG_height];
        r0 = p5;
        p4 = p4 >> 1;
        p5 = p5 >> 2;

        r2 = r0 << 1;  // 2*width = source bytes consumed per line
        r1 = r1 << 1;  // 2*srcStride
        r1 = r1 - r2;  // srcStride + (srcStride - 2*width)
        r1 += -8;      // i0,i1 are pre-read, need to correct
        m0 = r1;

        r3 = p2;
        r3 = r3 << 1;  // 2*lumStride
        r3 = r3 - r0;  // minus the width bytes written by the inner loop
        p2 = r3;

        r2 = [fp + ARG_chromStride];
        r0 = r0 >> 1;
        r2 = r2 - r0;  // chromStride - width/2
        m1 = r2;

        /*   I0,I1 - src input line pointers
         *   p0,p1 - luma output line pointers
         *   I2    - dstU
         *   I3    - dstV
         */

        lsetup (0f, 1f) lc1 = p4;   // H/2
0:        r0 = [i0++] || r2 = [i1++];
          r1 = [i0++] || r3 = [i1++];
          r4 = bytepack(r0, r1);
          r5 = bytepack(r2, r3);
          lsetup (2f, 3f) lc0 = p5; // W/4
2:          r0 = r0 >> 8(v) || [p0++] = r4;  // yyyy-even
            r1 = r1 >> 8(v) || [p1++] = r5;  // yyyy-odd
            r2 = r2 >> 8(v);
            r3 = r3 >> 8(v);
            r4 = byteop1p(r1:0, r3:2);
            r5 = byteop1p(r1:0, r3:2) (r);
            r6 = pack(r5.l, r4.l);
            r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
            r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
            r4 = bytepack(r0, r1)         ||  w[i2++] = r6.l; // uu
3:          r5 = bytepack(r2, r3)         ||  w[i3++] = r6.h; // vv

          /* step every pointer to the start of the next row pair */
          i0 += m0;
          i1 += m0;
          i2 += m1;
          i3 += m1;
          p0 = p0 + p2;
1:        p1 = p1 + p2;

        (r7:4,p5:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuyvtoyv12)