Statistics
| Branch: | Revision:

ffmpeg / libswscale / internal_bfin.S @ bf4a90fc

History | View | Annotate | Download (17.5 KB)

1
/*
2
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
3
 *                    April 20, 2007
4
 *
5
 * Blackfin Video Color Space Converters Operations
6
 *  convert I420 YV12 to RGB in various formats,
7
 *
8
 * This file is part of FFmpeg.
9
 *
10
 * FFmpeg is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU Lesser General Public
12
 * License as published by the Free Software Foundation; either
13
 * version 2.1 of the License, or (at your option) any later version.
14
 *
15
 * FFmpeg is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * Lesser General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU Lesser General Public
21
 * License along with FFmpeg; if not, write to the Free Software
22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
 */
24

    
25

    
26
/*
27
    YUV420 to RGB565 conversion.  This routine takes a YUV 420 planar macroblock
28
    and converts it to RGB565.  R:5 bits, G:6 bits, B:5 bits.. packed into shorts
29

    
30

    
31
    The following calculation is used for the conversion:
32

    
33
      r = clipz((y-oy)*cy  + crv*(v-128))
34
      g = clipz((y-oy)*cy  + cgv*(v-128) + cgu*(u-128))
35
      b = clipz((y-oy)*cy  + cbu*(u-128))
36

    
37
    y,u,v are pre scaled by a factor of 4 i.e. left shifted to gain precision.
38

    
39

    
40
    New factorization to eliminate the truncation error which was
41
    occurring due to the byteop3p.
42

    
43

    
44
  1) use the byteop16m to subtract quad bytes we use this in U8 this
45
   then so the offsets need to be renormalized to 8bits.
46

    
47
  2) scale operands up by a factor of 4 not 8 because Blackfin
48
     multiplies include a shift.
49

    
50
  3) compute into the accumulators cy*yx0, cy*yx1
51

    
52
  4) compute each of the linear equations
53
      r = clipz((y-oy)*cy  + crv*(v-128))
54

    
55
      g = clipz((y-oy)*cy  + cgv*(v-128) + cgu*(u-128))
56

    
57
      b = clipz((y-oy)*cy  + cbu*(u-128))
58

    
59
     reuse of the accumulators requires that we actually multiply
60
     twice, once with addition and the second time with a subtraction.
61

    
62
     because of this we need to compute the equations in the order R B
63
     then G saving the writes for B in the case of 24/32 bit color
64
     formats.
65

    
66
    api: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
67
                       int dW, uint32_t *coeffs);
68

    
69
        A          B
70
        ---        ---
71
        i2 = cb    i3 = cr
72
        i1 = coeff i0 = y
73

    
74
  Where coeffs have the following layout in memory.
75

    
76
  uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask;
77

    
78
  coeffs is a pointer to oy.
79

    
80
  the {rgb} masks are only utilized by the 565 packing algorithm. Note the data
81
  replication is used to simplify the internal algorithms for the dual mac architecture
82
  of BlackFin.
83

    
84
  All routines are exported with _ff_bfin_ as a symbol prefix
85

    
86
  rough performance gain compared against -O3:
87

    
88
  2779809/1484290 187.28%
89

    
90
  which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
91
  c/pel for the optimized implementations. Not sure why there is such a
92
  huge variation on the reference codes on Blackfin I guess it must have
93
  to do with the memory system.
94

    
95
*/
96

    
97
/* Section selectors: mL1 names the .l1.text section, mL3 the regular .text
   section.  MEM is the section passed to DEFUN by the yuv2rgb*_line
   routines. */
#define mL1 .l1.text
#define mL3 .text
#define MEM mL1
100

    
101
/* DEFUN(fname, where, interface): open the global function _ff_bfin_<fname>
   in section <where>.  <interface> is documentation only; it is not expanded.
   The macro ends on the bare symbol name, so the use site supplies the
   trailing ':' that turns it into a label. */
#define DEFUN(fname,where,interface) \
        .section where;              \
        .global _ff_bfin_ ## fname;  \
        .type _ff_bfin_ ## fname, STT_FUNC; \
        .align 8;                    \
        _ff_bfin_ ## fname
107

    
108
/* DEFUN_END(fname): record the size of _ff_bfin_<fname> as the distance from
   its label to the current location.  fname must match the DEFUN it closes. */
#define DEFUN_END(fname) \
        .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname
110

    
111

    
112
.text
113

    
114
/* COEFF_LEN:        byte length of the coefficient table (11 words of 4 bytes);
                     loaded into l1 as the circular-buffer length.
   COEFF_REL_CY_OFF: byte step (4 words) loaded into m0 and used as a
                     post-modify offset by the conversion loops. */
#define COEFF_LEN        11*4
#define COEFF_REL_CY_OFF 4*4
116

    
117
/* fp-relative byte offsets (after "link 0") of the arguments the
   yuv2rgb*_line routines read from the stack: out, dW and coeffs. */
#define ARG_OUT   20
#define ARG_W     24
#define ARG_COEFF 28
120

    
121
/*
 * yuv2rgb565_line (Y, U, V, out, dW, coeffs)
 *
 * Converts dW/4 groups of four pixels to 16-bit packed pixels.
 *
 *   In:    r0 = Y, r1 = U, r2 = V; out, dW and coeffs are read from the
 *          stack at fp+ARG_OUT, fp+ARG_W and fp+ARG_COEFF.
 *   Setup: i0 = Y, i2 = U, i3 = V, p1 = out, p0 = dW >> 2 (loop count),
 *          i1 = coeffs as a circular buffer (b1 = base, l1 = COEFF_LEN).
 *   Saves and restores r7:4; l1 is reset to 0 before returning.
 *
 * Each iteration consumes one 32-bit word of Y (4 samples) and one 16-bit
 * word each of U and V (2 samples each), and stores two 32-bit words to out.
 * The Y/U/V words for the next iteration are loaded at the bottom of the
 * loop, so the final iteration also reads one group beyond the data it
 * converts.
 */
DEFUN(yuv2rgb565_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];

        i0 = r0;
        i2 = r1;
        i3 = r2;

        r0 = [fp+ARG_COEFF];
        i1 = r0;
        b1 = i1;
        l1 = COEFF_LEN;
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        r0   = [i0++];         // 4Y
        r1.l = w[i2++];        // 2u
        r1.h = w[i3++];        // 2v
        p0 = p0>>2;

        lsetup (.L0565, .L1565) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
                                                              r2=[i1++]; // oy
.L0565:
        /*
        rrrrrrrr gggggggg bbbbbbbb
         5432109876543210
                    bbbbb >>3
              gggggggg    <<3
         rrrrrrrr         <<8
         rrrrrggggggbbbbb
        */
        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);                                                // y1,y0
        r4 = r4 << 2 (v);                                                // y3,y2
        r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
        r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy

        /* ---- first pixel pair: y1,y0 with u0,v0 ---- */
        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask (accumulators back to Y')
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask (accumulators back to Y')
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 8 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        /* the ++m0 post-modify wraps i1 from gmask round to cy, skipping
           oy/oc/zero which are still live in registers */
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask
        r2 = r2 << 3 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1=[i1++]; // cy

        /* ---- second pixel pair: y3,y2 with u1,v1 ---- */
        /* Y' = y*cy */
        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 8 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask (i1 wraps to oy)
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r0   =  [i0++];        // 4Y (next group)
        r2 = r2 << 3 (v)                                   || r1.l = w[i2++];        // 2u
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1.h = w[i3++];        // 2v
.L1565:                                                       r2=[i1++]; // oy

        l1 = 0;                 // disable circular addressing on i1 again

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuv2rgb565_line)
233

    
234
/*
 * yuv2rgb555_line (Y, U, V, out, dW, coeffs)
 *
 * Same structure as the 565 routine, but the second and third components
 * are shifted by 7 and 2 bits (instead of 8 and 3) before masking, giving
 * the x-5-5-5 layout sketched inside the loop.
 *
 *   In:    r0 = Y, r1 = U, r2 = V; out, dW and coeffs are read from the
 *          stack at fp+ARG_OUT, fp+ARG_W and fp+ARG_COEFF.
 *   Setup: i0 = Y, i2 = U, i3 = V, p1 = out, p0 = dW >> 2 (loop count),
 *          i1 = coeffs as a circular buffer (b1 = base, l1 = COEFF_LEN).
 *   Saves and restores r7:4; l1 is reset to 0 before returning.
 *
 * Each iteration consumes one 32-bit word of Y and one 16-bit word each of
 * U and V, and stores two 32-bit words to out.  The next group's Y/U/V are
 * loaded at the bottom of the loop, so the final iteration also reads one
 * group beyond the data it converts.
 */
DEFUN(yuv2rgb555_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];

        i0 = r0;
        i2 = r1;
        i3 = r2;

        r0 = [fp+ARG_COEFF];
        i1 = r0;
        b1 = i1;
        l1 = COEFF_LEN;
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        r0   = [i0++];         // 4Y
        r1.l = w[i2++];        // 2u
        r1.h = w[i3++];        // 2v
        p0 = p0>>2;

        lsetup (.L0555, .L1555) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
                                                              r2=[i1++]; // oy
.L0555:
        /*
        rrrrrrrr gggggggg bbbbbbbb
         5432109876543210
                    bbbbb >>3
               gggggggg   <<2
          rrrrrrrr        <<7
         xrrrrrgggggbbbbb
        */

        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);                                                // y1,y0
        r4 = r4 << 2 (v);                                                // y3,y2
        r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
        r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy

        /* ---- first pixel pair: y1,y0 with u0,v0 ---- */
        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask (accumulators back to Y')
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask (accumulators back to Y')
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 7 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        /* the ++m0 post-modify wraps i1 from gmask round to cy, skipping
           oy/oc/zero which are still live in registers */
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask
        r2 = r2 << 2 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1=[i1++]; // cy

        /* ---- second pixel pair: y3,y2 with u1,v1 ---- */
        /* Y' = y*cy */
        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 7 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask (i1 wraps to oy)
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r0=[i0++];     // 4Y (next group)
        r2 = r2 << 2 (v)                                   || r1.l=w[i2++];  // 2u
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1.h=w[i3++]; // 2v

.L1555:                                                       r2=[i1++]; // oy

        l1 = 0;                 // disable circular addressing on i1 again

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuv2rgb555_line)
348

    
349
/*
 * yuv2rgb24_line (Y, U, V, out, dW, coeffs)
 *
 * Converts dW/4 groups of four pixels to 3 bytes per pixel.
 *
 *   In:    r0 = Y, r1 = U, r2 = V; out, dW and coeffs are read from the
 *          stack at fp+ARG_OUT, fp+ARG_W and fp+ARG_COEFF.
 *   Setup: i0 = Y, i2 = U, i3 = V, p0 = dW >> 2 (loop count),
 *          i1 = coeffs as a circular buffer (b1 = base, l1 = COEFF_LEN),
 *          p1 = out and p2 = out + 3: two pixels are produced at a time,
 *          the low halves of r2/r3 go to the pixel at p1 and the high
 *          halves to the pixel at p2.
 *   Saves and restores r7:4; l1 is reset to 0 before returning.
 *
 * The second component (B) is kept in r3 while the third (G) is computed,
 * so the bytes are stored in the order R, G, B.  Each iteration writes
 * 12 bytes.  The next group's Y/U/V are loaded at the bottom of the loop,
 * so the final iteration also reads one group beyond the data it converts.
 */
DEFUN(yuv2rgb24_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];
        p2 = p1;
        p2 += 3;

        i0 = r0;
        i2 = r1;
        i3 = r2;

        r0 = [fp+ARG_COEFF]; // coeff buffer
        i1 = r0;
        b1 = i1;
        l1 = COEFF_LEN;
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        r0   = [i0++];         // 4Y
        r1.l = w[i2++];        // 2u
        r1.h = w[i3++];        // 2v
        p0 = p0>>2;

        lsetup (.L0888, .L1888) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
                                                              r2=[i1++]; // oy
.L0888:
        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);               // y1,y0
        r4 = r4 << 2 (v);               // y3,y2
        r6 = r6 << 2 (v) || r0=[i1++];  // u1,u0, r0=zero
        r7 = r7 << 2 (v) || r1=[i1++];  // v1,v0  r1=cy

        /* ---- first pixel pair: y1,y0 with u0,v0 ---- */
        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask (loaded, unused here)
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2=r2>>16 || B[p1++]=r2;
                     B[p2++]=r2;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask (loaded, unused here)
        r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        /* the ++m0 post-modify wraps i1 from gmask round to cy */
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask, oy,cy,zero

        r2=r2>>16 || B[p1++]=r2;
                     B[p2++]=r2;

        r3=r3>>16 || B[p1++]=r3;
                     B[p2++]=r3                            || r1=[i1++]; // cy

        p1+=3;                  // skip the pixel just written through p2
        p2+=3;

        /* ---- second pixel pair: y3,y2 with u1,v1 ---- */
        /* Y' = y*cy */
        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2=r2>>16 || B[p1++]=r2;
        B[p2++]=r2;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
        r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++]; // gmask (i1 wraps to oy)
        r2=r2>>16 || B[p1++]=r2 || r0 = [i0++];    // 4y (next group)
                     B[p2++]=r2 || r1.l = w[i2++]; // 2u
        r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
                     B[p2++]=r3 || r2=[i1++];      // oy

        p1+=3;
.L1888: p2+=3;

        l1 = 0;                 // disable circular addressing on i1 again

        (r7:4) = [sp++];
        unlink;
        rts;
/* must name the same symbol as the DEFUN above so that .size is attached
   to _ff_bfin_yuv2rgb24_line */
DEFUN_END(yuv2rgb24_line)
455

    
456

    
457

    
458
/* fp-relative byte offsets (after "link 0") of the uyvytoyv12 arguments
   that are read from the stack. */
#define ARG_vdst        20
#define ARG_width       24
#define ARG_height      28
#define ARG_lumStride   32
#define ARG_chromStride 36
#define ARG_srcStride   40
464

    
465
/*
 * uyvytoyv12 (src, ydst, udst, vdst, width, height,
 *             lumStride, chromStride, srcStride)
 *
 * Splits packed UYVY into separate Y, U and V planes, two source rows at a
 * time; chroma is taken from byteop1p (byte-wise average) of the two rows.
 *
 *   In:    r0 = src, r1 = ydst, r2 = udst; the remaining arguments are read
 *          from the stack via the ARG_* offsets.
 *   Setup: i0 = src (top row), i1 = src + srcStride (bottom row),
 *          p0 = ydst (top row), p1 = ydst + lumStride (bottom row),
 *          i2 = udst, i3 = vdst,
 *          outer loop lc0 = height >> 1, inner loop lc1 = width >> 2.
 *   Saves and restores r7:4 and p5:4.
 *
 * The chroma stores are software pipelined: every inner iteration first
 * stores the U/V pair packed by the previous one.  To prime this, i2/i3 are
 * stepped back one halfword and r6 is loaded from there, so the first store
 * rewrites the halfword just below udst/vdst with its own value.  The last
 * pair is flushed after the loops.
 *
 * NOTE(review): stride handling looks incomplete, verify against callers:
 *   - m1 is loaded with chromStride but never used; i2/i3 only advance
 *     through the halfword stores, i.e. chroma is written contiguously.
 *   - per row pair p0/p1 advance by `width` bytes plus p2 = lumStride - 4,
 *     and i0/i1 by 8 + 2*width bytes plus m0 = srcStride - 16; confirm that
 *     these land on the intended next rows.
 *   - the loads at the bottom of the inner loop read 8 bytes past the
 *     pixels converted in each row.
 */
DEFUN(uyvytoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                         long width, long height,
                         long lumStride, long chromStride, long srcStride)):
        link 0;
        [--sp] = (r7:4,p5:4);

        p0 = r1;       // Y top even

        i2 = r2; // *u
        r2 = [fp + ARG_vdst];
        i3 = r2; // *v

        r1 = [fp + ARG_srcStride];
        r2 = r0 + r1;
        r1 += -16;
        m0 = r1;

        i0 = r0;  // uyvy_T even
        i1 = r2;  // uyvy_B odd

        r2 = [fp + ARG_chromStride];
        m1 = r2;

        p2 = [fp + ARG_lumStride];
        p1 = p0 + p2;  // Y bot odd
        p2 += -4;

        p5 = [fp + ARG_width];
        p4 = [fp + ARG_height];
        p4 = p4 >> 1;
        p5 = p5 >> 2;

        // prime the chroma pipeline: step i2/i3 back one halfword and load it
        r6.l = w[i2--];  r6.l = w[i2];
        r6.h = w[i3--];  r6.h = w[i3];
        /*   I0,I1 - src input line pointers
         *   p0,p1 - luma output line pointers
         *   I2    - dstU
         *   I3    - dstV
         */

        lsetup (0f, 1f) lc0 = p4;

0:        lsetup (2f, 3f) lc1 = p5;
            r0 = [i0++] || r2 = [i1++];
            r1 = [i0++] || r3 = [i1++];
2:          r4 = byteop1p(r1:0, r3:2)     ||   w[i2++] = r6.l;   // store previous uu
            r5 = byteop1p(r1:0, r3:2) (r) ||   w[i3++] = r6.h;   // store previous vv
            r0 = r0 >> 8(v);              // keep the Y byte of each halfword
            r1 = r1 >> 8(v);
            r2 = r2 >> 8(v);
            r3 = r3 >> 8(v);
            r0 = bytepack(r0, r1);
            r2 = bytepack(r2, r3)         ||  [p0++] = r0;       // yyyy top
            r6 = pack(r5.l, r4.l)         ||  [p1++] = r2;       // yyyy bottom
            r7 = pack(r5.h, r4.h)         ||  r0 = [i0++]   || r2 = [i1++];
3:          r6 = bytepack(r6, r7)         ||  r1 = [i0++]   || r3 = [i1++];   // r6.l = uu, r6.h = vv

          i0 += m0;
          i1 += m0;
          p0 = p0 + p2;
1:        p1 = p1 + p2;

        // flush the last pending chroma pair
        w[i2++] = r6.l;
        w[i3++] = r6.h;

        (r7:4,p5:4) = [sp++];
        unlink;
        rts;
DEFUN_END(uyvytoyv12)