ffmpeg / libswscale / yuv2rgb_altivec.c @ 8a322796
History | View | Annotate | Download (38.4 KB)
1 |
/*
|
---|---|
2 |
* AltiVec acceleration for colorspace conversion
|
3 |
*
|
4 |
* copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
|
5 |
*
|
6 |
* This file is part of FFmpeg.
|
7 |
*
|
8 |
* FFmpeg is free software; you can redistribute it and/or modify
|
9 |
* it under the terms of the GNU General Public License as published by
|
10 |
* the Free Software Foundation; either version 2 of the License, or
|
11 |
* (at your option) any later version.
|
12 |
*
|
13 |
* FFmpeg is distributed in the hope that it will be useful,
|
14 |
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
15 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
16 |
* GNU General Public License for more details.
|
17 |
*
|
18 |
* You should have received a copy of the GNU General Public License
|
19 |
* along with FFmpeg; if not, write to the Free Software
|
20 |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
21 |
*/
|
22 |
|
23 |
/*
|
24 |
Convert I420 YV12 to RGB in various formats,
|
25 |
it rejects images that are not in 420 formats,
|
26 |
it rejects images that don't have widths of multiples of 16,
|
27 |
it rejects images that don't have heights of multiples of 2.
|
28 |
Reject defers to C simulation code.
|
29 |
|
30 |
Lots of optimizations to be done here.
|
31 |
|
32 |
1. Need to fix saturation code. I just couldn't get it to fly with packs
|
33 |
and adds, so we currently use max/min to clip.
|
34 |
|
35 |
2. The inefficient use of chroma loading needs a bit of brushing up.
|
36 |
|
37 |
3. Analysis of pipeline stalls needs to be done. Use shark to identify
|
38 |
pipeline stalls.
|
39 |
|
40 |
|
41 |
MODIFIED to calculate coeffs from currently selected color space.
|
42 |
MODIFIED core to be a macro where you specify the output format.
|
43 |
ADDED UYVY conversion which is never called due to some thing in swscale.
|
44 |
CORRECTED algorithm selection to be strict on input formats.
|
45 |
ADDED runtime detection of AltiVec.
|
46 |
|
47 |
ADDED altivec_yuv2packedX vertical scl + RGB converter
|
48 |
|
49 |
March 27,2004
|
50 |
PERFORMANCE ANALYSIS
|
51 |
|
52 |
The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
|
53 |
used as test.
|
54 |
The AltiVec version uses 10% of the processor or ~100Mips for D1 video
|
55 |
same sequence.
|
56 |
|
57 |
720 * 480 * 30 ~10MPS
|
58 |
|
59 |
so we have roughly 10 clocks per pixel. This is too high, something has
|
60 |
to be wrong.
|
61 |
|
62 |
OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
|
63 |
need for vec_min.
|
64 |
|
65 |
OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
|
66 |
the input video frame, it was just decompressed so it probably resides in L1
|
67 |
caches. However, we are creating the output video stream. This needs to use the
|
68 |
DSTST instruction to optimize for the cache. We couple this with the fact that
|
69 |
we are not going to be visiting the input buffer again so we mark it Least
|
70 |
Recently Used. This shaves 25% of the processor cycles off.
|
71 |
|
72 |
Now memcpy is the largest mips consumer in the system, probably due
|
73 |
to the inefficient X11 stuff.
|
74 |
|
75 |
GL libraries seem to be very slow on this machine 1.33Ghz PB running
|
76 |
Jaguar, this is not the case for my 1Ghz PB. I thought it might be
|
77 |
a versioning issue, however I have libGL.1.2.dylib for both
|
78 |
machines. (We need to figure this out now.)
|
79 |
|
80 |
GL2 libraries work now with patch for RGB32.
|
81 |
|
82 |
NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
|
83 |
|
84 |
Integrated luma prescaling adjustment for saturation/contrast/brightness
|
85 |
adjustment.
|
86 |
*/
|
87 |
|
88 |
#include <stdio.h> |
89 |
#include <stdlib.h> |
90 |
#include <string.h> |
91 |
#include <inttypes.h> |
92 |
#include <assert.h> |
93 |
#include "config.h" |
94 |
#ifdef HAVE_MALLOC_H
|
95 |
#include <malloc.h> |
96 |
#endif
|
97 |
#include "rgb2rgb.h" |
98 |
#include "swscale.h" |
99 |
#include "swscale_internal.h" |
100 |
|
101 |
#undef PROFILE_THE_BEAST
|
102 |
#undef INC_SCALING
|
103 |
|
104 |
typedef unsigned char ubyte; |
105 |
typedef signed char sbyte; |
106 |
|
107 |
|
108 |
/* RGB interleaver, 16 planar pels 8-bit samples per channel in
|
109 |
homogeneous vector registers x0,x1,x2 are interleaved with the
|
110 |
following technique:
|
111 |
|
112 |
o0 = vec_mergeh (x0,x1);
|
113 |
o1 = vec_perm (o0, x2, perm_rgb_0);
|
114 |
o2 = vec_perm (o0, x2, perm_rgb_1);
|
115 |
o3 = vec_mergel (x0,x1);
|
116 |
o4 = vec_perm (o3,o2,perm_rgb_2);
|
117 |
o5 = vec_perm (o3,o2,perm_rgb_3);
|
118 |
|
119 |
perm_rgb_0: o0(RG).h v1(B) --> o1*
|
120 |
0 1 2 3 4
|
121 |
rgbr|gbrg|brgb|rgbr
|
122 |
0010 0100 1001 0010
|
123 |
0102 3145 2673 894A
|
124 |
|
125 |
perm_rgb_1: o0(RG).h v1(B) --> o2
|
126 |
0 1 2 3 4
|
127 |
gbrg|brgb|bbbb|bbbb
|
128 |
0100 1001 1111 1111
|
129 |
B5CD 6EF7 89AB CDEF
|
130 |
|
131 |
perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
|
132 |
0 1 2 3 4
|
133 |
gbrg|brgb|rgbr|gbrg
|
134 |
1111 1111 0010 0100
|
135 |
89AB CDEF 0182 3945
|
136 |
|
137 |
perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5*
|
138 |
0 1 2 3 4
|
139 |
brgb|rgbr|gbrg|brgb
|
140 |
1001 0010 0100 1001
|
141 |
a67b 89cA BdCD eEFf
|
142 |
|
143 |
*/
|
144 |
static
|
145 |
const vector unsigned char |
146 |
perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05, |
147 |
0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a), |
148 |
perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17, |
149 |
0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f), |
150 |
perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, |
151 |
0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05), |
152 |
perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a, |
153 |
0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f); |
154 |
|
155 |
/* Interleave three 16-byte channel vectors into 48 bytes of packed 3-byte
 * pixels, written to the lvalues y0, y1, y2 (in that order).
 * Note the parameter order: within each pixel the first byte comes from
 * the THIRD argument (x0), the second from x1 and the third from the
 * FIRST argument (x2). */
#define vec_merge3(x2,x1,x0,y0,y1,y2) \
do { \
    typeof(x0) m3_pair_h, m3_spill, m3_pair_l; \
    /* (x0,x1) byte pairs of pixels 0-7 */ \
    m3_pair_h = vec_mergeh (x0,x1); \
    y0        = vec_perm (m3_pair_h, x2, perm_rgb_0); \
    /* leftovers of the high pairs plus the upper half of x2 */ \
    m3_spill  = vec_perm (m3_pair_h, x2, perm_rgb_1); \
    /* (x0,x1) byte pairs of pixels 8-15 */ \
    m3_pair_l = vec_mergel (x0,x1); \
    y1        = vec_perm (m3_pair_l, m3_spill, perm_rgb_2); \
    y2        = vec_perm (m3_pair_l, m3_spill, perm_rgb_3); \
} while(0)
165 |
|
166 |
/* Store 16 pixels as packed 24-bit data whose bytes are, in memory order,
 * x2, x1, x0 (so called with (R,G,B) it writes B,G,R).
 * ptr must be a modifiable pointer lvalue; it is post-incremented once per
 * stored vector, three times in total.
 * The expansion deliberately has no trailing semicolon so the macro behaves
 * as a single statement (safe inside an unbraced if/else). */
#define vec_mstbgr24(x0,x1,x2,ptr) \
do { \
    typeof(x0) _0,_1,_2; \
    vec_merge3 (x0,x1,x2,_0,_1,_2); \
    vec_st (_0, 0, ptr++); \
    vec_st (_1, 0, ptr++); \
    vec_st (_2, 0, ptr++); \
} while (0)
174 |
|
175 |
/* Store 16 pixels as packed 24-bit data whose bytes are, in memory order,
 * x0, x1, x2 (so called with (R,G,B) it writes R,G,B).
 * ptr must be a modifiable pointer lvalue; it is post-incremented once per
 * stored vector, three times in total.
 * The expansion deliberately has no trailing semicolon so the macro behaves
 * as a single statement (safe inside an unbraced if/else). */
#define vec_mstrgb24(x0,x1,x2,ptr) \
do { \
    typeof(x0) _0,_1,_2; \
    vec_merge3 (x2,x1,x0,_0,_1,_2); \
    vec_st (_0, 0, ptr++); \
    vec_st (_1, 0, ptr++); \
    vec_st (_2, 0, ptr++); \
} while (0)
183 |
|
184 |
/* Pack 16 pixels into 32-bit units and store them at ptr.
 * T is the vector type of the four channel vectors.  Each pixel's bytes
 * are, in memory order, x0, x1, x2, x3 (e.g. R,G,B,0 for "rgb0": R in the
 * most significant byte, 0 in the least significant one).
 * The four result vectors are stored at byte offsets 0, 16, 32 and 48 from
 * ptr, then ptr (a modifiable pointer lvalue) is advanced by 4 elements.
 * The expansion deliberately has no trailing semicolon so the macro behaves
 * as a single statement (safe inside an unbraced if/else). */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \
do { \
    T _0,_1,_2,_3; \
    /* pixels 0-7: byte pairs (x0,x1) and (x2,x3), then merged as 16-bit units */ \
    _0 = vec_mergeh (x0,x1); \
    _1 = vec_mergeh (x2,x3); \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 0*16, (T *)ptr); \
    vec_st (_3, 1*16, (T *)ptr); \
    /* pixels 8-15 */ \
    _0 = vec_mergel (x0,x1); \
    _1 = vec_mergel (x2,x3); \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 2*16, (T *)ptr); \
    vec_st (_3, 3*16, (T *)ptr); \
    ptr += 4; \
} while (0)
205 |
|
206 |
/*
|
207 |
|
208 |
| 1 0 1.4021 | | Y |
|
209 |
| 1 -0.3441 -0.7142 |x| Cb|
|
210 |
| 1 1.7718 0 | | Cr|
|
211 |
|
212 |
|
213 |
Y: [-128 127]
|
214 |
Cb/Cr : [-128 127]
|
215 |
|
216 |
typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
|
217 |
|
218 |
*/
|
219 |
|
220 |
|
221 |
|
222 |
|
223 |
/* Widen bytes 0-7 of the 16-byte vector x to eight 16-bit lanes.
 * Each lane is built from byte 0x10 (the first byte of the all-zero second
 * operand) followed by one byte of x, i.e. the byte is zero-extended rather
 * than sign-extended.  The whole expansion is parenthesized so it can be
 * used inside larger expressions. */
#define vec_unh(x) \
    ((vector signed short) \
     vec_perm((x),(typeof(x))AVV(0), \
              (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03, \
                                        0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07)))
228 |
/* Widen bytes 8-15 of the 16-byte vector x to eight 16-bit lanes.
 * Each lane is built from byte 0x10 (the first byte of the all-zero second
 * operand) followed by one byte of x, i.e. the byte is zero-extended rather
 * than sign-extended.  The whole expansion is parenthesized so it can be
 * used inside larger expressions. */
#define vec_unl(x) \
    ((vector signed short) \
     vec_perm((x),(typeof(x))AVV(0), \
              (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B, \
                                        0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F)))
233 |
|
234 |
/* Clamp every signed 16-bit lane of x to the range [16, 235]:
 * first limit from above with vec_min, then from below with vec_max.
 * NOTE(review): the same limits are applied to chroma by the caller in
 * this file; confirm that 235 (not 240) is intended there. */
#define vec_clip_s16(x) \
    vec_max (vec_min (x, \
                      (vector signed short)AVV(235,235,235,235,235,235,235,235)), \
             (vector signed short)AVV( 16, 16, 16, 16, 16, 16, 16, 16))
237 |
|
238 |
/* Pack two vectors of eight signed 16-bit lanes into one vector of sixteen
 * unsigned bytes (x supplies the first eight, y the last eight).
 * Negative lanes are first raised to 0 with vec_max; vec_packs on the
 * unsigned-short view then handles the upper bound (saturating pack per
 * the AltiVec definition of vec_packs).
 * Arguments and the whole expansion are parenthesized so the macro can be
 * used inside larger expressions. */
#define vec_packclp(x,y) \
    ((vector unsigned char)vec_packs \
     ((vector unsigned short)vec_max ((x),(vector signed short) AVV(0)), \
      (vector unsigned short)vec_max ((y),(vector signed short) AVV(0))))
242 |
|
243 |
//#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr)
|
244 |
|
245 |
|
246 |
/* Convert eight Y/U/V samples (one per signed 16-bit lane) to R, G and B
 * lanes using the coefficient vectors stored in the context
 * (CY, OY, CRV, CBU, CGU, CGV, CSHIFT).
 *
 * U and V are expected with a +128 bias, which is removed here.
 * The results are written through R, G and B and are not clipped to
 * 0..255; callers in this file pack them with vec_packclp. */
static inline void cvtyuvtoRGB (SwsContext *c,
                                vector signed short Y, vector signed short U, vector signed short V,
                                vector signed short *R, vector signed short *G, vector signed short *B)
{
    const vector signed short chroma_bias =
        vec_splat((vector signed short)AVV(128),0);
    vector signed short luma, cb, cr, cb_scaled, cr_scaled, green_acc;

    /* luma = Y*CY (rounded, >>15) + OY */
    luma = vec_mradds (Y, c->CY, c->OY);
    cb   = vec_sub (U, chroma_bias);
    cr   = vec_sub (V, chroma_bias);

    /* B = luma + (CBU*(cb<<CSHIFT)+0x4000)>>15 */
    cb_scaled = vec_sl (cb, c->CSHIFT);
    *B = vec_mradds (cb_scaled, c->CBU, luma);

    /* R = luma + (CRV*(cr<<CSHIFT)+0x4000)>>15 */
    cr_scaled = vec_sl (cr, c->CSHIFT);
    *R = vec_mradds (cr_scaled, c->CRV, luma);

    /* G = luma + CGU*cb + CGV*cr, accumulated in two multiply-add steps */
    green_acc = vec_mradds (cb, c->CGU, luma);
    *G = vec_mradds (cr, c->CGV, green_acc);
}
270 |
|
271 |
|
272 |
/*
|
273 |
------------------------------------------------------------------------------
|
274 |
CS converters
|
275 |
------------------------------------------------------------------------------
|
276 |
*/
|
277 |
|
278 |
|
279 |
/*
 * DEFCSP420_CVT(name, out_pixels) expands to the definition of
 * altivec_<name>(), which converts a slice of planar YUV (in[0]=Y, in[1]=U,
 * in[2]=V, one chroma sample per 2x2 luma block) to packed RGB written to
 * oplanes[0] starting at row srcSliceY.
 *
 * out_pixels(R,G,B,ptr) must store 16 pixels at ptr and advance ptr past
 * them (see the out_* macros in this file).
 *
 * Each inner iteration converts 16 pixels of two adjacent rows, which share
 * 8 chroma samples.  Because of the integer divisions w/16 and h/2, any
 * remainder columns or a trailing odd row are not converted.
 * Source pointers may be unaligned: data is fetched as two vectors and
 * realigned with vec_lvsl/vec_perm.
 *
 * The two output row pointers are recomputed from outstrides[0] for every
 * row pair, so destination buffers whose stride is larger than the packed
 * row size are written correctly.
 *
 * Returns srcSliceH.
 */
#define DEFCSP420_CVT(name,out_pixels) \
static int altivec_##name (SwsContext *c, \
                           unsigned char **in, int *instrides, \
                           int srcSliceY, int srcSliceH, \
                           unsigned char **oplanes, int *outstrides) \
{ \
    int w = c->srcW; \
    int h = srcSliceH; \
    int i,j; \
    int instrides_scl[3]; \
    vector unsigned char y0,y1; \
    \
    vector signed char u,v; \
    \
    vector signed short Y0,Y1,Y2,Y3; \
    vector signed short U,V; \
    vector signed short vx,ux,uvx; \
    vector signed short vx0,ux0,uvx0; \
    vector signed short vx1,ux1,uvx1; \
    vector signed short R0,G0,B0; \
    vector signed short R1,G1,B1; \
    vector unsigned char R,G,B; \
    \
    vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP; \
    vector unsigned char align_perm; \
    \
    /* local copies of the conversion coefficients */ \
    vector signed short \
        lCY  = c->CY, \
        lOY  = c->OY, \
        lCRV = c->CRV, \
        lCBU = c->CBU, \
        lCGU = c->CGU, \
        lCGV = c->CGV; \
    \
    vector unsigned short lCSHIFT = c->CSHIFT; \
    \
    ubyte *y1i = in[0]; \
    ubyte *y2i = in[0]+instrides[0]; \
    ubyte *ui  = in[1]; \
    ubyte *vi  = in[2]; \
    \
    /* start of the current even output row; the odd row is one stride below */ \
    ubyte *outrow = oplanes[0]+srcSliceY*outstrides[0]; \
    vector unsigned char *oute; \
    vector unsigned char *outo; \
    \
    instrides_scl[0] = instrides[0]*2-w; /* the loop moves y{1,2}i by w */ \
    instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ \
    instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ \
    \
    for (i=0;i<h/2;i++) { \
        oute = (vector unsigned char *)outrow; \
        outo = (vector unsigned char *)(outrow+outstrides[0]); \
        \
        /* data-stream-touch-for-store hints for the two output rows */ \
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); \
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); \
        \
        for (j=0;j<w/16;j++) { \
            \
            y1ivP = (vector unsigned char *)y1i; \
            y2ivP = (vector unsigned char *)y2i; \
            uivP  = (vector unsigned char *)ui; \
            vivP  = (vector unsigned char *)vi; \
            \
            /* unaligned loads: fetch two vectors and realign */ \
            align_perm = vec_lvsl (0, y1i); \
            y0 = (vector unsigned char) \
                 vec_perm (y1ivP[0], y1ivP[1], align_perm); \
            \
            align_perm = vec_lvsl (0, y2i); \
            y1 = (vector unsigned char) \
                 vec_perm (y2ivP[0], y2ivP[1], align_perm); \
            \
            align_perm = vec_lvsl (0, ui); \
            u = (vector signed char) \
                vec_perm (uivP[0], uivP[1], align_perm); \
            \
            align_perm = vec_lvsl (0, vi); \
            v = (vector signed char) \
                vec_perm (vivP[0], vivP[1], align_perm); \
            \
            /* remove the 128 chroma bias, then widen the first 8 samples */ \
            u = (vector signed char) \
                vec_sub (u,(vector signed char) \
                         vec_splat((vector signed char)AVV(128),0)); \
            v = (vector signed char) \
                vec_sub (v,(vector signed char) \
                         vec_splat((vector signed char)AVV(128),0)); \
            \
            U = vec_unpackh (u); \
            V = vec_unpackh (v); \
            \
            Y0 = vec_unh (y0); \
            Y1 = vec_unl (y0); \
            Y2 = vec_unh (y1); \
            Y3 = vec_unl (y1); \
            \
            Y0 = vec_mradds (Y0, lCY, lOY); \
            Y1 = vec_mradds (Y1, lCY, lOY); \
            Y2 = vec_mradds (Y2, lCY, lOY); \
            Y3 = vec_mradds (Y3, lCY, lOY); \
            \
            /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ \
            ux = vec_sl (U, lCSHIFT); \
            ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0)); \
            ux0 = vec_mergeh (ux,ux); \
            ux1 = vec_mergel (ux,ux); \
            \
            /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ \
            vx = vec_sl (V, lCSHIFT); \
            vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0)); \
            vx0 = vec_mergeh (vx,vx); \
            vx1 = vec_mergel (vx,vx); \
            \
            /* uvx = ((CGU*u) + (CGV*v))>>15 */ \
            uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0)); \
            uvx = vec_mradds (V, lCGV, uvx); \
            uvx0 = vec_mergeh (uvx,uvx); \
            uvx1 = vec_mergel (uvx,uvx); \
            \
            /* even row */ \
            R0 = vec_add (Y0,vx0); \
            G0 = vec_add (Y0,uvx0); \
            B0 = vec_add (Y0,ux0); \
            R1 = vec_add (Y1,vx1); \
            G1 = vec_add (Y1,uvx1); \
            B1 = vec_add (Y1,ux1); \
            \
            R = vec_packclp (R0,R1); \
            G = vec_packclp (G0,G1); \
            B = vec_packclp (B0,B1); \
            \
            out_pixels(R,G,B,oute); \
            \
            /* odd row, same chroma terms */ \
            R0 = vec_add (Y2,vx0); \
            G0 = vec_add (Y2,uvx0); \
            B0 = vec_add (Y2,ux0); \
            R1 = vec_add (Y3,vx1); \
            G1 = vec_add (Y3,uvx1); \
            B1 = vec_add (Y3,ux1); \
            R = vec_packclp (R0,R1); \
            G = vec_packclp (G0,G1); \
            B = vec_packclp (B0,B1); \
            \
            out_pixels(R,G,B,outo); \
            \
            y1i += 16; \
            y2i += 16; \
            ui  += 8; \
            vi  += 8; \
            \
        } \
        \
        /* next pair of output rows */ \
        outrow += 2*outstrides[0]; \
        \
        ui  += instrides_scl[1]; \
        vi  += instrides_scl[2]; \
        y1i += instrides_scl[0]; \
        y2i += instrides_scl[0]; \
    } \
    return srcSliceH; \
}
442 |
|
443 |
|
444 |
/* Output adapters: store 16 pixels given as three planar byte vectors
 * (r,g,b) at ptr in the byte order spelled by the macro name, then advance
 * ptr.  The 32-bit variants fill the fourth byte with zero. */
#define out_abgr(r,g,b,ptr)  vec_mstrgb32(typeof(r),((typeof (r))AVV(0)),b,g,r,ptr)
#define out_bgra(r,g,b,ptr)  vec_mstrgb32(typeof(r),b,g,r,((typeof (r))AVV(0)),ptr)
#define out_rgba(r,g,b,ptr)  vec_mstrgb32(typeof(r),r,g,b,((typeof (r))AVV(0)),ptr)
#define out_argb(r,g,b,ptr)  vec_mstrgb32(typeof(r),((typeof (r))AVV(0)),r,g,b,ptr)
#define out_rgb24(r,g,b,ptr) vec_mstrgb24(r,g,b,ptr)
#define out_bgr24(r,g,b,ptr) vec_mstbgr24(r,g,b,ptr)
|
450 |
|
451 |
DEFCSP420_CVT (yuv2_abgr, out_abgr) |
452 |
#if 1 |
453 |
DEFCSP420_CVT (yuv2_bgra, out_bgra) |
454 |
#else
|
455 |
static int altivec_yuv2_bgra32 (SwsContext *c, |
456 |
unsigned char **in, int *instrides, |
457 |
int srcSliceY, int srcSliceH, |
458 |
unsigned char **oplanes, int *outstrides) |
459 |
{ |
460 |
int w = c->srcW;
|
461 |
int h = srcSliceH;
|
462 |
int i,j;
|
463 |
int instrides_scl[3]; |
464 |
vector unsigned char y0,y1; |
465 |
|
466 |
vector signed char u,v; |
467 |
|
468 |
vector signed short Y0,Y1,Y2,Y3; |
469 |
vector signed short U,V; |
470 |
vector signed short vx,ux,uvx; |
471 |
vector signed short vx0,ux0,uvx0; |
472 |
vector signed short vx1,ux1,uvx1; |
473 |
vector signed short R0,G0,B0; |
474 |
vector signed short R1,G1,B1; |
475 |
vector unsigned char R,G,B; |
476 |
|
477 |
vector unsigned char *uivP, *vivP; |
478 |
vector unsigned char align_perm; |
479 |
|
480 |
vector signed short |
481 |
lCY = c->CY, |
482 |
lOY = c->OY, |
483 |
lCRV = c->CRV, |
484 |
lCBU = c->CBU, |
485 |
lCGU = c->CGU, |
486 |
lCGV = c->CGV; |
487 |
|
488 |
vector unsigned short lCSHIFT = c->CSHIFT; |
489 |
|
490 |
ubyte *y1i = in[0];
|
491 |
ubyte *y2i = in[0]+w;
|
492 |
ubyte *ui = in[1];
|
493 |
ubyte *vi = in[2];
|
494 |
|
495 |
vector unsigned char *oute |
496 |
= (vector unsigned char *) |
497 |
(oplanes[0]+srcSliceY*outstrides[0]); |
498 |
vector unsigned char *outo |
499 |
= (vector unsigned char *) |
500 |
(oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); |
501 |
|
502 |
|
503 |
instrides_scl[0] = instrides[0]; |
504 |
instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ |
505 |
instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ |
506 |
|
507 |
|
508 |
for (i=0;i<h/2;i++) { |
509 |
vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); |
510 |
vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); |
511 |
|
512 |
for (j=0;j<w/16;j++) { |
513 |
|
514 |
y0 = vec_ldl (0,y1i);
|
515 |
y1 = vec_ldl (0,y2i);
|
516 |
uivP = (vector unsigned char *)ui; |
517 |
vivP = (vector unsigned char *)vi; |
518 |
|
519 |
align_perm = vec_lvsl (0, ui);
|
520 |
u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm); |
521 |
|
522 |
align_perm = vec_lvsl (0, vi);
|
523 |
v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm); |
524 |
u = (vector signed char) |
525 |
vec_sub (u,(vector signed char) |
526 |
vec_splat((vector signed char)AVV(128),0)); |
527 |
|
528 |
v = (vector signed char) |
529 |
vec_sub (v, (vector signed char) |
530 |
vec_splat((vector signed char)AVV(128),0)); |
531 |
|
532 |
U = vec_unpackh (u); |
533 |
V = vec_unpackh (v); |
534 |
|
535 |
|
536 |
Y0 = vec_unh (y0); |
537 |
Y1 = vec_unl (y0); |
538 |
Y2 = vec_unh (y1); |
539 |
Y3 = vec_unl (y1); |
540 |
|
541 |
Y0 = vec_mradds (Y0, lCY, lOY); |
542 |
Y1 = vec_mradds (Y1, lCY, lOY); |
543 |
Y2 = vec_mradds (Y2, lCY, lOY); |
544 |
Y3 = vec_mradds (Y3, lCY, lOY); |
545 |
|
546 |
/* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */
|
547 |
ux = vec_sl (U, lCSHIFT); |
548 |
ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0)); |
549 |
ux0 = vec_mergeh (ux,ux); |
550 |
ux1 = vec_mergel (ux,ux); |
551 |
|
552 |
/* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */
|
553 |
vx = vec_sl (V, lCSHIFT); |
554 |
vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0)); |
555 |
vx0 = vec_mergeh (vx,vx); |
556 |
vx1 = vec_mergel (vx,vx); |
557 |
/* uvx = ((CGU*u) + (CGV*v))>>15 */
|
558 |
uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0)); |
559 |
uvx = vec_mradds (V, lCGV, uvx); |
560 |
uvx0 = vec_mergeh (uvx,uvx); |
561 |
uvx1 = vec_mergel (uvx,uvx); |
562 |
R0 = vec_add (Y0,vx0); |
563 |
G0 = vec_add (Y0,uvx0); |
564 |
B0 = vec_add (Y0,ux0); |
565 |
R1 = vec_add (Y1,vx1); |
566 |
G1 = vec_add (Y1,uvx1); |
567 |
B1 = vec_add (Y1,ux1); |
568 |
R = vec_packclp (R0,R1); |
569 |
G = vec_packclp (G0,G1); |
570 |
B = vec_packclp (B0,B1); |
571 |
|
572 |
out_argb(R,G,B,oute); |
573 |
R0 = vec_add (Y2,vx0); |
574 |
G0 = vec_add (Y2,uvx0); |
575 |
B0 = vec_add (Y2,ux0); |
576 |
R1 = vec_add (Y3,vx1); |
577 |
G1 = vec_add (Y3,uvx1); |
578 |
B1 = vec_add (Y3,ux1); |
579 |
R = vec_packclp (R0,R1); |
580 |
G = vec_packclp (G0,G1); |
581 |
B = vec_packclp (B0,B1); |
582 |
|
583 |
out_argb(R,G,B,outo); |
584 |
y1i += 16;
|
585 |
y2i += 16;
|
586 |
ui += 8;
|
587 |
vi += 8;
|
588 |
|
589 |
} |
590 |
|
591 |
outo += (outstrides[0])>>4; |
592 |
oute += (outstrides[0])>>4; |
593 |
|
594 |
ui += instrides_scl[1];
|
595 |
vi += instrides_scl[2];
|
596 |
y1i += instrides_scl[0];
|
597 |
y2i += instrides_scl[0];
|
598 |
} |
599 |
return srcSliceH;
|
600 |
} |
601 |
|
602 |
#endif
|
603 |
|
604 |
|
605 |
DEFCSP420_CVT (yuv2_rgba, out_rgba) |
606 |
DEFCSP420_CVT (yuv2_argb, out_argb) |
607 |
DEFCSP420_CVT (yuv2_rgb24, out_rgb24) |
608 |
DEFCSP420_CVT (yuv2_bgr24, out_bgr24) |
609 |
|
610 |
|
611 |
// uyvy|uyvy|uyvy|uyvy
|
612 |
// 0123 4567 89ab cdef
|
613 |
static
|
614 |
const vector unsigned char |
615 |
demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00, |
616 |
0x10,0x04,0x10,0x04, |
617 |
0x10,0x08,0x10,0x08, |
618 |
0x10,0x0c,0x10,0x0c), |
619 |
demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02, |
620 |
0x10,0x06,0x10,0x06, |
621 |
0x10,0x0A,0x10,0x0A, |
622 |
0x10,0x0E,0x10,0x0E), |
623 |
demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03, |
624 |
0x10,0x05,0x10,0x07, |
625 |
0x10,0x09,0x10,0x0B, |
626 |
0x10,0x0D,0x10,0x0F); |
627 |
|
628 |
/*
|
629 |
this is so I can play live CCIR raw video
|
630 |
*/
|
631 |
/*
 * Convert a slice of packed UYVY (in[0]) to 32-bit pixels stored with
 * out_rgba at oplanes[0], starting at output row srcSliceY.
 *
 * Every row is addressed through instrides[0] / outstrides[0], so buffers
 * with padded rows are handled; for tightly packed buffers the result is
 * the same as walking both buffers linearly.
 *
 * 16 pixels (32 source bytes) are converted per inner iteration; remainder
 * columns (w % 16) are not converted.  Source rows are read with vec_ld
 * and destination rows written with vec_st.
 *
 * Returns srcSliceH.
 */
static int altivec_uyvy_rgb32 (SwsContext *c,
                               unsigned char **in, int *instrides,
                               int srcSliceY, int srcSliceH,
                               unsigned char **oplanes, int *outstrides)
{
    int w = c->srcW;
    int h = srcSliceH;
    int i,j;
    vector unsigned char uyvy;
    vector signed short Y,U,V;
    vector signed short R0,G0,B0,R1,G1,B1;
    vector unsigned char R,G,B;
    vector unsigned char *out;
    ubyte *img;

    for (i=0;i<h;i++) {
        /* start of this row in source and destination */
        img = in[0]+i*instrides[0];
        out = (vector unsigned char *)(oplanes[0]+(srcSliceY+i)*outstrides[0]);

        for (j=0;j<w/16;j++) {
            /* first 8 pixels */
            uyvy = vec_ld (0, img);
            U = (vector signed short)
                vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
            V = (vector signed short)
                vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
            Y = (vector signed short)
                vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);

            cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);

            /* next 8 pixels */
            uyvy = vec_ld (16, img);
            U = (vector signed short)
                vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
            V = (vector signed short)
                vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
            Y = (vector signed short)
                vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);

            cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);

            R = vec_packclp (R0,R1);
            G = vec_packclp (G0,G1);
            B = vec_packclp (B0,B1);

            out_rgba (R,G,B,out);

            img += 32;
        }
    }
    return srcSliceH;
}
687 |
|
688 |
|
689 |
|
690 |
/* Ok currently the acceleration routine only supports
|
691 |
inputs of widths a multiple of 16
|
692 |
and heights a multiple 2
|
693 |
|
694 |
So we just fall back to the C codes for this.
|
695 |
*/
|
696 |
/*
 * Pick an AltiVec converter for the source/destination formats of c.
 *
 * Returns NULL when no AltiVec routine applies (AltiVec flag not set,
 * source width not a multiple of 16, odd source height for the planar
 * path, or an unhandled format pair).  Per the comment preceding this
 * function, the caller is then expected to fall back to the C code.
 */
SwsFunc yuv2rgb_init_altivec (SwsContext *c)
{
    if ((c->flags & SWS_CPU_CAPS_ALTIVEC) == 0)
        return NULL;

    /* the vector loops consume 16 pixels per iteration */
    if (c->srcW & 0xf)
        return NULL;

    /* packed UYVY input: a single converter */
    if (c->srcFormat == PIX_FMT_UYVY422) {
        if (c->dstFormat != PIX_FMT_BGR32)
            return NULL;
        av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
        return altivec_uyvy_rgb32;
    }

    /* NOTE(review): the planar converters read in[0..2] as Y/U/V planes with
       one chroma sample per 2x2 luma block; confirm that YUV410P, GRAY8,
       NV12 and NV21 can really be routed through them. */
    switch (c->srcFormat) {
    case PIX_FMT_YUV410P:
    case PIX_FMT_YUV420P:
    /*case IMGFMT_CLPL:        ??? */
    case PIX_FMT_GRAY8:
    case PIX_FMT_NV12:
    case PIX_FMT_NV21:
        break;
    default:
        return NULL;
    }

    /* rows are converted in pairs */
    if (c->srcH & 0x1)
        return NULL;

    switch (c->dstFormat) {
    case PIX_FMT_RGB24:
        av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
        return altivec_yuv2_rgb24;
    case PIX_FMT_BGR24:
        av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
        return altivec_yuv2_bgr24;
    case PIX_FMT_ARGB:
        av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
        return altivec_yuv2_argb;
    case PIX_FMT_ABGR:
        av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
        return altivec_yuv2_abgr;
    case PIX_FMT_RGBA:
        av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
        return altivec_yuv2_rgba;
    case PIX_FMT_BGRA:
        av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
        return altivec_yuv2_bgra;
    default:
        return NULL;
    }
}
755 |
|
756 |
/*
 * Compute the conversion coefficients used by the AltiVec converters and
 * store each one, replicated across all eight 16-bit lanes, in the context:
 * CY (luma gain), OY (luma offset), CRV, CBU, CGU, CGV, plus the constant
 * chroma pre-shift CSHIFT = 2.
 *
 * inv_table[0..3] supply the Cr->R, Cb->B, Cb->G and Cr->G terms in that
 * order.  brightness, contrast and saturation are used as fixed-point
 * values; the chroma terms use (contrast>>16) and (saturation>>16).
 * NOTE(review): presumably 16.16 fixed point - confirm against the caller.
 */
void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
{
    union {
        signed short tmp[8] __attribute__ ((aligned(16)));
        vector signed short vec;
    } buf;

    buf.tmp[0] =  ((0xffffLL) * contrast>>8)>>9;                        //cy
    buf.tmp[1] =  -256*brightness;                                      //oy
    buf.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);   //crv
    buf.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);   //cbu
    buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));  //cgu
    buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));  //cgv
    /* lanes 6 and 7 are never splatted, but the whole vector is read below,
       so give them a defined value */
    buf.tmp[6] = 0;
    buf.tmp[7] = 0;

    c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
    c->CY   = vec_splat ((vector signed short)buf.vec, 0);
    c->OY   = vec_splat ((vector signed short)buf.vec, 1);
    c->CRV  = vec_splat ((vector signed short)buf.vec, 2);
    c->CBU  = vec_splat ((vector signed short)buf.vec, 3);
    c->CGU  = vec_splat ((vector signed short)buf.vec, 4);
    c->CGV  = vec_splat ((vector signed short)buf.vec, 5);
}
789 |
|
790 |
|
791 |
void
|
792 |
altivec_yuv2packedX (SwsContext *c, |
793 |
int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
|
794 |
int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
|
795 |
uint8_t *dest, int dstW, int dstY) |
796 |
{ |
797 |
int i,j;
|
798 |
vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V; |
799 |
vector signed short R0,G0,B0,R1,G1,B1; |
800 |
|
801 |
vector unsigned char R,G,B; |
802 |
vector unsigned char *out,*nout; |
803 |
|
804 |
vector signed short RND = vec_splat_s16(1<<3); |
805 |
vector unsigned short SCL = vec_splat_u16(4); |
806 |
unsigned long scratch[16] __attribute__ ((aligned (16))); |
807 |
|
808 |
vector signed short *YCoeffs, *CCoeffs; |
809 |
|
810 |
YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize; |
811 |
CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize; |
812 |
|
813 |
out = (vector unsigned char *)dest; |
814 |
|
815 |
for (i=0; i<dstW; i+=16){ |
816 |
Y0 = RND; |
817 |
Y1 = RND; |
818 |
/* extract 16 coeffs from lumSrc */
|
819 |
for (j=0; j<lumFilterSize; j++) { |
820 |
X0 = vec_ld (0, &lumSrc[j][i]);
|
821 |
X1 = vec_ld (16, &lumSrc[j][i]);
|
822 |
Y0 = vec_mradds (X0, YCoeffs[j], Y0); |
823 |
Y1 = vec_mradds (X1, YCoeffs[j], Y1); |
824 |
} |
825 |
|
826 |
U = RND; |
827 |
V = RND; |
828 |
/* extract 8 coeffs from U,V */
|
829 |
for (j=0; j<chrFilterSize; j++) { |
830 |
X = vec_ld (0, &chrSrc[j][i/2]); |
831 |
U = vec_mradds (X, CCoeffs[j], U); |
832 |
X = vec_ld (0, &chrSrc[j][i/2+2048]); |
833 |
V = vec_mradds (X, CCoeffs[j], V); |
834 |
} |
835 |
|
836 |
/* scale and clip signals */
|
837 |
Y0 = vec_sra (Y0, SCL); |
838 |
Y1 = vec_sra (Y1, SCL); |
839 |
U = vec_sra (U, SCL); |
840 |
V = vec_sra (V, SCL); |
841 |
|
842 |
Y0 = vec_clip_s16 (Y0); |
843 |
Y1 = vec_clip_s16 (Y1); |
844 |
U = vec_clip_s16 (U); |
845 |
V = vec_clip_s16 (V); |
846 |
|
847 |
/* now we have
|
848 |
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
|
849 |
U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
|
850 |
|
851 |
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
|
852 |
U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
|
853 |
V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
|
854 |
*/
|
855 |
|
856 |
U0 = vec_mergeh (U,U); |
857 |
V0 = vec_mergeh (V,V); |
858 |
|
859 |
U1 = vec_mergel (U,U); |
860 |
V1 = vec_mergel (V,V); |
861 |
|
862 |
cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0); |
863 |
cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1); |
864 |
|
865 |
R = vec_packclp (R0,R1); |
866 |
G = vec_packclp (G0,G1); |
867 |
B = vec_packclp (B0,B1); |
868 |
|
869 |
switch(c->dstFormat) {
|
870 |
case PIX_FMT_ABGR: out_abgr (R,G,B,out); break; |
871 |
case PIX_FMT_BGRA: out_bgra (R,G,B,out); break; |
872 |
case PIX_FMT_RGBA: out_rgba (R,G,B,out); break; |
873 |
case PIX_FMT_ARGB: out_argb (R,G,B,out); break; |
874 |
case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break; |
875 |
case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break; |
876 |
default:
|
877 |
{ |
878 |
/* If this is reached, the caller should have called yuv2packedXinC
|
879 |
instead. */
|
880 |
static int printed_error_message; |
881 |
if (!printed_error_message) {
|
882 |
av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
|
883 |
sws_format_name(c->dstFormat)); |
884 |
printed_error_message=1;
|
885 |
} |
886 |
return;
|
887 |
} |
888 |
} |
889 |
} |
890 |
|
891 |
if (i < dstW) {
|
892 |
i -= 16;
|
893 |
|
894 |
Y0 = RND; |
895 |
Y1 = RND; |
896 |
/* extract 16 coeffs from lumSrc */
|
897 |
for (j=0; j<lumFilterSize; j++) { |
898 |
X0 = vec_ld (0, &lumSrc[j][i]);
|
899 |
X1 = vec_ld (16, &lumSrc[j][i]);
|
900 |
Y0 = vec_mradds (X0, YCoeffs[j], Y0); |
901 |
Y1 = vec_mradds (X1, YCoeffs[j], Y1); |
902 |
} |
903 |
|
904 |
U = RND; |
905 |
V = RND; |
906 |
/* extract 8 coeffs from U,V */
|
907 |
for (j=0; j<chrFilterSize; j++) { |
908 |
X = vec_ld (0, &chrSrc[j][i/2]); |
909 |
U = vec_mradds (X, CCoeffs[j], U); |
910 |
X = vec_ld (0, &chrSrc[j][i/2+2048]); |
911 |
V = vec_mradds (X, CCoeffs[j], V); |
912 |
} |
913 |
|
914 |
/* scale and clip signals */
|
915 |
Y0 = vec_sra (Y0, SCL); |
916 |
Y1 = vec_sra (Y1, SCL); |
917 |
U = vec_sra (U, SCL); |
918 |
V = vec_sra (V, SCL); |
919 |
|
920 |
Y0 = vec_clip_s16 (Y0); |
921 |
Y1 = vec_clip_s16 (Y1); |
922 |
U = vec_clip_s16 (U); |
923 |
V = vec_clip_s16 (V); |
924 |
|
925 |
/* now we have
|
926 |
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
|
927 |
U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
|
928 |
|
929 |
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
|
930 |
U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
|
931 |
V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
|
932 |
*/
|
933 |
|
934 |
U0 = vec_mergeh (U,U); |
935 |
V0 = vec_mergeh (V,V); |
936 |
|
937 |
U1 = vec_mergel (U,U); |
938 |
V1 = vec_mergel (V,V); |
939 |
|
940 |
cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0); |
941 |
cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1); |
942 |
|
943 |
R = vec_packclp (R0,R1); |
944 |
G = vec_packclp (G0,G1); |
945 |
B = vec_packclp (B0,B1); |
946 |
|
947 |
nout = (vector unsigned char *)scratch; |
948 |
switch(c->dstFormat) {
|
949 |
case PIX_FMT_ABGR: out_abgr (R,G,B,nout); break; |
950 |
case PIX_FMT_BGRA: out_bgra (R,G,B,nout); break; |
951 |
case PIX_FMT_RGBA: out_rgba (R,G,B,nout); break; |
952 |
case PIX_FMT_ARGB: out_argb (R,G,B,nout); break; |
953 |
case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break; |
954 |
case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break; |
955 |
default:
|
956 |
/* Unreachable, I think. */
|
957 |
av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
|
958 |
sws_format_name(c->dstFormat)); |
959 |
return;
|
960 |
} |
961 |
|
962 |
memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
|
963 |
} |
964 |
|
965 |
} |