Statistics
| Branch: | Revision:

ffmpeg / postproc / swscale.c @ b6654a54

History | View | Annotate | Download (50.3 KB)

1
/*
2
    Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
3

4
    This program is free software; you can redistribute it and/or modify
5
    it under the terms of the GNU General Public License as published by
6
    the Free Software Foundation; either version 2 of the License, or
7
    (at your option) any later version.
8

9
    This program is distributed in the hope that it will be useful,
10
    but WITHOUT ANY WARRANTY; without even the implied warranty of
11
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
    GNU General Public License for more details.
13

14
    You should have received a copy of the GNU General Public License
15
    along with this program; if not, write to the Free Software
16
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17
*/
18

    
19
/*
20
  supported Input formats: YV12, I420, IYUV, YUY2, BGR32, BGR24, Y8, Y800
21
  supported output formats: YV12, I420, IYUV, BGR15, BGR16, BGR24, BGR32 (grayscale soon too)
22
  BGR15/16 support dithering
23
*/
24

    
25
#include <inttypes.h>
26
#include <string.h>
27
#include <math.h>
28
#include <stdio.h>
29
#include "../config.h"
30
#include "../mangle.h"
31
#ifdef HAVE_MALLOC_H
32
#include <malloc.h>
33
#endif
34
#include "swscale.h"
35
#include "../cpudetect.h"
36
#include "../libvo/img_format.h"
37
#include "rgb2rgb.h"
38
#undef MOVNTQ
39
#undef PAVGB
40

    
41
//#undef HAVE_MMX2
42
//#define HAVE_3DNOW
43
//#undef HAVE_MMX
44
//#undef ARCH_X86
45
// When defined, the C BGR15/BGR16 writers add alternating dither offsets
// before quantizing (see the DITHER1XBPP sections in yuv2rgbXinC).
#define DITHER1XBPP

#define RET 0xC3 //near return opcode for X86

// ASSERT(x): in MP_DEBUG builds prints the failed expression and then
// deliberately writes through a null pointer to stop the program;
// otherwise it expands to an empty statement.
// Note that the debug form is a bare if-statement, so it must not be
// followed by an else.
#ifndef MP_DEBUG
#define ASSERT(x) ;
#else
#define ASSERT(x) if(!(x)) { printf("ASSERT " #x " failed\n"); *((int*)0)=0; }
#endif

// PI: take the value from <math.h> when it provides M_PI, else use a literal.
#ifndef M_PI
#define PI 3.14159265358979323846
#else
#define PI M_PI
#endif
60

    
61
//FIXME replace this with something faster
// Pixel format predicates on IMGFMT_* codes.
// Every macro may evaluate its argument several times, so never pass an
// expression with side effects.
#define isPlanarYUV(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
#define isYUV(x)       ((x)==IMGFMT_YUY2 || isPlanarYUV(x))
#define isGray(x)      ((x)==IMGFMT_Y800)
#define isBGR(x)       ((x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15)
#define isPacked(x)    ((x)==IMGFMT_YUY2 || (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24)

// The two "half chroma" predicates cover exactly the same format sets
// (in the same test order) as isPlanarYUV and isYUV respectively.
#define isHalfChrV(x)  isPlanarYUV(x)
#define isHalfChrH(x)  isYUV(x)

// Formats accepted as scaler input / output.
#define isSupportedIn(x)  (isPlanarYUV(x) || (x)==IMGFMT_YUY2 \
                        || (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24 \
                        || isGray(x))
#define isSupportedOut(x) (isPlanarYUV(x) || isBGR(x))
74

    
75
// RGB -> YUV conversion coefficients as fixed-point integers with
// RGB2YUV_SHIFT fractional bits. Naming: first letter is the source
// component (R/G/B), second letter the destination component (Y/U/V).
#define RGB2YUV_SHIFT 16

// contributions to Y
#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))

// contributions to U
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))

// contributions to V
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
85

    
86
extern int verbose; // defined in mplayer.c
87
/*
NOTES

known BUGS with known cause (no bug reports please!, but patches are welcome :) )
horizontal fast_bilinear MMX2 scaler reads 1-7 samples too many (might cause a sig11)

Special versions: fast Y 1:1 scaling (no interpolation in y direction)

TODO
more intelligent misalignment avoidance for the horizontal scaler
write special vertical cubic upscale version
Optimize C code (yv12 / minmax)
add support for packed pixel yuv input & output
add support for Y8 output
optimize bgr24 & bgr32
add BGR4 output support
write special BGR->BGR scaler
deglobalize yuv2rgb*.c
*/
106

    
107
// Small arithmetic helpers. They are macros, so each argument may be
// evaluated more than once - do not pass expressions with side effects.
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
#define MIN(a,b) ((b) < (a) ? (b) : (a))
#define MAX(a,b) ((b) > (a) ? (b) : (a))

// The x86 inline-asm code paths are only built when targeting x86.
#if defined(ARCH_X86)
#define CAN_COMPILE_X86_ASM
#endif
114

    
115
#ifdef CAN_COMPILE_X86_ASM
/*
 * 8-byte aligned 64-bit constants and scratch variables.
 * NOTE(review): presumably these are referenced by name from the inline
 * asm in swscale_template.c (hence in_asm_used_var_warning_killer below);
 * that file is not visible from here - confirm before renaming anything.
 */

// YUV->RGB coefficients, one 16-bit value replicated into all four words
// (0x2568, 0x3343 and 0x40cf also appear in the yuvtab_* table names)
static uint64_t __attribute__((aligned(8))) yCoeff=    0x2568256825682568LL;
static uint64_t __attribute__((aligned(8))) vrCoeff=   0x3343334333433343LL;
static uint64_t __attribute__((aligned(8))) ubCoeff=   0x40cf40cf40cf40cfLL;
static uint64_t __attribute__((aligned(8))) vgCoeff=   0xE5E2E5E2E5E2E5E2LL;
static uint64_t __attribute__((aligned(8))) ugCoeff=   0xF36EF36EF36EF36ELL;

// per-byte masks
static uint64_t __attribute__((aligned(8))) bF8=       0xF8F8F8F8F8F8F8F8LL;
static uint64_t __attribute__((aligned(8))) bFC=       0xFCFCFCFCFCFCFCFCLL;

// per-word constants
static uint64_t __attribute__((aligned(8))) w400=      0x0400040004000400LL;
static uint64_t __attribute__((aligned(8))) w80=       0x0080008000800080LL;
static uint64_t __attribute__((aligned(8))) w10=       0x0010001000100010LL;
static uint64_t __attribute__((aligned(8))) w02=       0x0002000200020002LL;

// byte-select masks
static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
static uint64_t __attribute__((aligned(8))) bm01010101=0x00FF00FF00FF00FFLL;

// dither values currently in use (written at run time, hence volatile)
static volatile uint64_t __attribute__((aligned(8))) b5Dither;
static volatile uint64_t __attribute__((aligned(8))) g5Dither;
static volatile uint64_t __attribute__((aligned(8))) g6Dither;
static volatile uint64_t __attribute__((aligned(8))) r5Dither;

// two alternating dither patterns each
static uint64_t __attribute__((aligned(8))) dither4[2]={
        0x0103010301030103LL,
        0x0200020002000200LL,};

static uint64_t __attribute__((aligned(8))) dither8[2]={
        0x0602060206020602LL,
        0x0004000400040004LL,};

// component masks for 16 bit (5-6-5) and 15 bit (5-5-5) pixels
static uint64_t __attribute__((aligned(8))) b16Mask=   0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g16Mask=   0x07E007E007E007E0LL;
static uint64_t __attribute__((aligned(8))) r16Mask=   0xF800F800F800F800LL;
static uint64_t __attribute__((aligned(8))) b15Mask=   0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g15Mask=   0x03E003E003E003E0LL;
static uint64_t __attribute__((aligned(8))) r15Mask=   0x7C007C007C007C00LL;

// masks selecting every third byte, at three different phases
static uint64_t __attribute__((aligned(8))) M24A=   0x00FF0000FF0000FFLL;
static uint64_t __attribute__((aligned(8))) M24B=   0xFF0000FF0000FF00LL;
static uint64_t __attribute__((aligned(8))) M24C=   0x0000FF0000FF0000LL;

// FIXME remove
static uint64_t __attribute__((aligned(8))) asm_yalpha1;
static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
#endif
160

    
161
// clipping helper table for C implementations:
// (768 entries; the C code indexes the clip_yuvtab_* variants with an
//  offset of +256 so that moderately out-of-range values stay in bounds)
static unsigned char clip_table[768];

// per-component clipping tables for 16 bit (BGR16) and 15 bit (BGR15) output;
// the three components of a pixel are ORed together by the writers
static unsigned short clip_table16b[768], clip_table16g[768], clip_table16r[768];
static unsigned short clip_table15b[768], clip_table15g[768], clip_table15r[768];

// yuv->rgb conversion tables:
static    int yuvtab_2568[256];
static    int yuvtab_3343[256];
static    int yuvtab_0c92[256];
static    int yuvtab_1a1e[256];
static    int yuvtab_40cf[256];

// Needed for cubic scaler to catch overflows
static    int clip_yuvtab_2568[768];
static    int clip_yuvtab_3343[768];
static    int clip_yuvtab_0c92[768];
static    int clip_yuvtab_1a1e[768];
static    int clip_yuvtab_40cf[768];
183

    
184
//global sws_flags from the command line
185
int sws_flags=2;
186

    
187
//global srcFilter
188
SwsFilter src_filter= {NULL, NULL, NULL, NULL};
189

    
190
float sws_lum_gblur= 0.0;
191
float sws_chr_gblur= 0.0;
192
int sws_chr_vshift= 0;
193
int sws_chr_hshift= 0;
194
float sws_chr_sharpen= 0.0;
195
float sws_lum_sharpen= 0.0;
196

    
197
/* cpuCaps combined from cpudetect and whats actually compiled in
198
   (if there is no support for something compiled in it wont appear here) */
199
static CpuCaps cpuCaps;
200

    
201
void (*swScale)(SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
202
             int srcSliceH, uint8_t* dst[], int dstStride[])=NULL;
203

    
204
static SwsVector *getConvVec(SwsVector *a, SwsVector *b);
205

    
206
#ifdef CAN_COMPILE_X86_ASM
/*
 * Dummy function: reads every file-scope asm constant once and stores the
 * sum into a volatile, so the compiler sees all of them as used.
 * The computed value itself is meaningless.
 * NOTE(review): presumably the variables are otherwise only referenced from
 * inline asm, which the compiler cannot see - confirm against the template.
 */
void in_asm_used_var_warning_killer()
{
        uint64_t sum= 0;
        volatile int sink;

        sum += yCoeff + vrCoeff + ubCoeff + vgCoeff + ugCoeff;
        sum += bF8 + bFC;
        sum += w400 + w80 + w10 + w02;
        sum += bm00001111 + bm00000111 + bm11111000 + bm01010101;
        sum += b16Mask + g16Mask + r16Mask;
        sum += b15Mask + g15Mask + r15Mask;
        sum += asm_yalpha1 + asm_uvalpha1;
        sum += M24A + M24B + M24C;
        sum += b5Dither + g5Dither + r5Dither + g6Dither;
        sum += dither4[0] + dither8[0];

        sink= (int)sum;
        if(sink) sink=0;
}
#endif
215

    
216
/*
 * Plain C vertical scaler for planar YUV output; produces one output line.
 *
 * Each output sample is the sum over the filter taps of
 * source[tap][x] * filter[tap], shifted right by 19 bits and clamped
 * to 0..255.
 *
 * lumFilter/lumSrc/lumFilterSize  luma coefficients, one source line per tap
 * chrFilter/chrSrc/chrFilterSize  same for chroma; every chroma line holds
 *                                 the U samples at [x] and the V samples
 *                                 at [x + 2048]
 * dest          luma output, dstW bytes
 * uDest, vDest  chroma output, dstW>>1 bytes each; if uDest is NULL no
 *               chroma is written at all
 */
static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
{
        //FIXME Optimize (just quickly written, not optimized)
        const int chrW= dstW>>1;
        int x;
        int tap;

        for(x=0; x<dstW; x++)
        {
                int acc=0;

                for(tap=0; tap<lumFilterSize; tap++)
                        acc += lumSrc[tap][x] * lumFilter[tap];

                acc >>= 19;
                if(acc<0)        acc=0;
                else if(acc>255) acc=255;
                dest[x]= acc;
        }

        // luma-only call: nothing more to do
        if(uDest == NULL) return;

        for(x=0; x<chrW; x++)
        {
                int accU=0;
                int accV=0;

                for(tap=0; tap<chrFilterSize; tap++)
                {
                        const int coeff= chrFilter[tap];
                        accU += chrSrc[tap][x] * coeff;
                        accV += chrSrc[tap][x + 2048] * coeff;
                }

                accU >>= 19;
                if(accU<0)        accU=0;
                else if(accU>255) accU=255;

                accV >>= 19;
                if(accV<0)        accV=0;
                else if(accV>255) accV=255;

                uDest[x]= accU;
                vDest[x]= accV;
        }
}
248

    
249
static inline void yuv2rgbXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
250
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
251
                                    uint8_t *dest, int dstW, int dstFormat)
252
{
253
        if(dstFormat==IMGFMT_BGR32)
254
        {
255
                int i;
256
                for(i=0; i<(dstW>>1); i++){
257
                        int j;
258
                        int Y1=0;
259
                        int Y2=0;
260
                        int U=0;
261
                        int V=0;
262
                        int Cb, Cr, Cg;
263
                        for(j=0; j<lumFilterSize; j++)
264
                        {
265
                                Y1 += lumSrc[j][2*i] * lumFilter[j];
266
                                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
267
                        }
268
                        for(j=0; j<chrFilterSize; j++)
269
                        {
270
                                U += chrSrc[j][i] * chrFilter[j];
271
                                V += chrSrc[j][i+2048] * chrFilter[j];
272
                        }
273
                        Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
274
                        Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
275
                        U >>= 19;
276
                        V >>= 19;
277

    
278
                        Cb= clip_yuvtab_40cf[U+ 256];
279
                        Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
280
                        Cr= clip_yuvtab_3343[V+ 256];
281

    
282
                        dest[8*i+0]=clip_table[((Y1 + Cb) >>13)];
283
                        dest[8*i+1]=clip_table[((Y1 + Cg) >>13)];
284
                        dest[8*i+2]=clip_table[((Y1 + Cr) >>13)];
285

    
286
                        dest[8*i+4]=clip_table[((Y2 + Cb) >>13)];
287
                        dest[8*i+5]=clip_table[((Y2 + Cg) >>13)];
288
                        dest[8*i+6]=clip_table[((Y2 + Cr) >>13)];
289
                }
290
        }
291
        else if(dstFormat==IMGFMT_BGR24)
292
        {
293
                int i;
294
                for(i=0; i<(dstW>>1); i++){
295
                        int j;
296
                        int Y1=0;
297
                        int Y2=0;
298
                        int U=0;
299
                        int V=0;
300
                        int Cb, Cr, Cg;
301
                        for(j=0; j<lumFilterSize; j++)
302
                        {
303
                                Y1 += lumSrc[j][2*i] * lumFilter[j];
304
                                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
305
                        }
306
                        for(j=0; j<chrFilterSize; j++)
307
                        {
308
                                U += chrSrc[j][i] * chrFilter[j];
309
                                V += chrSrc[j][i+2048] * chrFilter[j];
310
                        }
311
                        Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
312
                        Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
313
                        U >>= 19;
314
                        V >>= 19;
315

    
316
                        Cb= clip_yuvtab_40cf[U+ 256];
317
                        Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
318
                        Cr= clip_yuvtab_3343[V+ 256];
319

    
320
                        dest[0]=clip_table[((Y1 + Cb) >>13)];
321
                        dest[1]=clip_table[((Y1 + Cg) >>13)];
322
                        dest[2]=clip_table[((Y1 + Cr) >>13)];
323

    
324
                        dest[3]=clip_table[((Y2 + Cb) >>13)];
325
                        dest[4]=clip_table[((Y2 + Cg) >>13)];
326
                        dest[5]=clip_table[((Y2 + Cr) >>13)];
327
                        dest+=6;
328
                }
329
        }
330
        else if(dstFormat==IMGFMT_BGR16)
331
        {
332
                int i;
333
#ifdef DITHER1XBPP
334
                static int ditherb1=1<<14;
335
                static int ditherg1=1<<13;
336
                static int ditherr1=2<<14;
337
                static int ditherb2=3<<14;
338
                static int ditherg2=3<<13;
339
                static int ditherr2=0<<14;
340

    
341
                ditherb1 ^= (1^2)<<14;
342
                ditherg1 ^= (1^2)<<13;
343
                ditherr1 ^= (1^2)<<14;
344
                ditherb2 ^= (3^0)<<14;
345
                ditherg2 ^= (3^0)<<13;
346
                ditherr2 ^= (3^0)<<14;
347
#else
348
                const int ditherb1=0;
349
                const int ditherg1=0;
350
                const int ditherr1=0;
351
                const int ditherb2=0;
352
                const int ditherg2=0;
353
                const int ditherr2=0;
354
#endif
355
                for(i=0; i<(dstW>>1); i++){
356
                        int j;
357
                        int Y1=0;
358
                        int Y2=0;
359
                        int U=0;
360
                        int V=0;
361
                        int Cb, Cr, Cg;
362
                        for(j=0; j<lumFilterSize; j++)
363
                        {
364
                                Y1 += lumSrc[j][2*i] * lumFilter[j];
365
                                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
366
                        }
367
                        for(j=0; j<chrFilterSize; j++)
368
                        {
369
                                U += chrSrc[j][i] * chrFilter[j];
370
                                V += chrSrc[j][i+2048] * chrFilter[j];
371
                        }
372
                        Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
373
                        Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
374
                        U >>= 19;
375
                        V >>= 19;
376

    
377
                        Cb= clip_yuvtab_40cf[U+ 256];
378
                        Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
379
                        Cr= clip_yuvtab_3343[V+ 256];
380

    
381
                        ((uint16_t*)dest)[2*i] =
382
                                clip_table16b[(Y1 + Cb + ditherb1) >>13] |
383
                                clip_table16g[(Y1 + Cg + ditherg1) >>13] |
384
                                clip_table16r[(Y1 + Cr + ditherr1) >>13];
385

    
386
                        ((uint16_t*)dest)[2*i+1] =
387
                                clip_table16b[(Y2 + Cb + ditherb2) >>13] |
388
                                clip_table16g[(Y2 + Cg + ditherg2) >>13] |
389
                                clip_table16r[(Y2 + Cr + ditherr2) >>13];
390
                }
391
        }
392
        else if(dstFormat==IMGFMT_BGR15)
393
        {
394
                int i;
395
#ifdef DITHER1XBPP
396
                static int ditherb1=1<<14;
397
                static int ditherg1=1<<14;
398
                static int ditherr1=2<<14;
399
                static int ditherb2=3<<14;
400
                static int ditherg2=3<<14;
401
                static int ditherr2=0<<14;
402

    
403
                ditherb1 ^= (1^2)<<14;
404
                ditherg1 ^= (1^2)<<14;
405
                ditherr1 ^= (1^2)<<14;
406
                ditherb2 ^= (3^0)<<14;
407
                ditherg2 ^= (3^0)<<14;
408
                ditherr2 ^= (3^0)<<14;
409
#else
410
                const int ditherb1=0;
411
                const int ditherg1=0;
412
                const int ditherr1=0;
413
                const int ditherb2=0;
414
                const int ditherg2=0;
415
                const int ditherr2=0;
416
#endif
417
                for(i=0; i<(dstW>>1); i++){
418
                        int j;
419
                        int Y1=0;
420
                        int Y2=0;
421
                        int U=0;
422
                        int V=0;
423
                        int Cb, Cr, Cg;
424
                        for(j=0; j<lumFilterSize; j++)
425
                        {
426
                                Y1 += lumSrc[j][2*i] * lumFilter[j];
427
                                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
428
                        }
429
                        for(j=0; j<chrFilterSize; j++)
430
                        {
431
                                U += chrSrc[j][i] * chrFilter[j];
432
                                V += chrSrc[j][i+2048] * chrFilter[j];
433
                        }
434
                        Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
435
                        Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
436
                        U >>= 19;
437
                        V >>= 19;
438

    
439
                        Cb= clip_yuvtab_40cf[U+ 256];
440
                        Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
441
                        Cr= clip_yuvtab_3343[V+ 256];
442

    
443
                        ((uint16_t*)dest)[2*i] =
444
                                clip_table15b[(Y1 + Cb + ditherb1) >>13] |
445
                                clip_table15g[(Y1 + Cg + ditherg1) >>13] |
446
                                clip_table15r[(Y1 + Cr + ditherr1) >>13];
447

    
448
                        ((uint16_t*)dest)[2*i+1] =
449
                                clip_table15b[(Y2 + Cb + ditherb2) >>13] |
450
                                clip_table15g[(Y2 + Cg + ditherg2) >>13] |
451
                                clip_table15r[(Y2 + Cr + ditherr2) >>13];
452
                }
453
        }
454
}
455

    
456

    
457
//Note: we have C, X86, MMX, MMX2, 3DNOW versions; there's no 3DNOW+MMX2 one
//
// swscale_template.c is included once per variant that has to be built.
// Before every inclusion the HAVE_* macros are set to describe that variant
// and RENAME() is redefined to append a per-variant suffix.
// The order of the directives below matters - do not reorder them.

//Plain C versions
#if !defined(HAVE_MMX) || defined(RUNTIME_CPUDETECT)
#define COMPILE_C
#endif

#if defined(CAN_COMPILE_X86_ASM)

#if (defined(HAVE_MMX) && !defined(HAVE_3DNOW) && !defined(HAVE_MMX2)) || defined(RUNTIME_CPUDETECT)
#define COMPILE_MMX
#endif

#if defined(HAVE_MMX2) || defined(RUNTIME_CPUDETECT)
#define COMPILE_MMX2
#endif

#if (defined(HAVE_3DNOW) && !defined(HAVE_MMX2)) || defined(RUNTIME_CPUDETECT)
#define COMPILE_3DNOW
#endif
#endif //CAN_COMPILE_X86_ASM

// the COMPILE_* decisions are made, from here on HAVE_* only describes the
// variant that is currently being instantiated
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW

#if defined(COMPILE_C)
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define RENAME(a) a ## _C
#include "swscale_template.c"
#endif

#if defined(CAN_COMPILE_X86_ASM)

//X86 versions (currently disabled)
/*
#undef RENAME
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _X86
#include "swscale_template.c"
*/

//MMX versions
#if defined(COMPILE_MMX)
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define RENAME(a) a ## _MMX
#include "swscale_template.c"
#endif

//MMX2 versions
#if defined(COMPILE_MMX2)
#undef RENAME
#define HAVE_MMX
#define HAVE_MMX2
#undef HAVE_3DNOW
#define RENAME(a) a ## _MMX2
#include "swscale_template.c"
#endif

//3DNOW versions
#if defined(COMPILE_3DNOW)
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#define HAVE_3DNOW
#define RENAME(a) a ## _3DNow
#include "swscale_template.c"
#endif

#endif //CAN_COMPILE_X86_ASM

// minor note: the HAVE_xyz macros are messed up after this line, so don't use them
535

    
536

    
537
// old global scaler, dont use for new code
538
// will use sws_flags from the command line
539
void SwScale_YV12slice(unsigned char* src[], int srcStride[], int srcSliceY ,
540
                             int srcSliceH, uint8_t* dst[], int dstStride, int dstbpp,
541
                             int srcW, int srcH, int dstW, int dstH){
542

    
543
        static SwsContext *context=NULL;
544
        int dstFormat;
545
        int dstStride3[3]= {dstStride, dstStride>>1, dstStride>>1};
546

    
547
        switch(dstbpp)
548
        {
549
                case 8 : dstFormat= IMGFMT_Y8;                break;
550
                case 12: dstFormat= IMGFMT_YV12;        break;
551
                case 15: dstFormat= IMGFMT_BGR15;        break;
552
                case 16: dstFormat= IMGFMT_BGR16;        break;
553
                case 24: dstFormat= IMGFMT_BGR24;        break;
554
                case 32: dstFormat= IMGFMT_BGR32;        break;
555
                default: return;
556
        }
557

    
558
        if(!context) context=getSwsContextFromCmdLine(srcW, srcH, IMGFMT_YV12, dstW, dstH, dstFormat);
559

    
560
        context->swScale(context, src, srcStride, srcSliceY, srcSliceH, dst, dstStride3);
561
}
562

    
563
// will use sws_flags & src_filter (from cmd line)
564
SwsContext *getSwsContextFromCmdLine(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat)
565
{
566
        int flags=0;
567
        static int firstTime=1;
568

    
569
#ifdef ARCH_X86
570
        if(gCpuCaps.hasMMX)
571
                asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
572
#endif
573
        if(firstTime)
574
        {
575
                firstTime=0;
576
                flags= SWS_PRINT_INFO;
577
        }
578
        else if(verbose>1) flags= SWS_PRINT_INFO;
579

    
580
        if(src_filter.lumH) freeVec(src_filter.lumH);
581
        if(src_filter.lumV) freeVec(src_filter.lumV);
582
        if(src_filter.chrH) freeVec(src_filter.chrH);
583
        if(src_filter.chrV) freeVec(src_filter.chrV);
584

    
585
        if(sws_lum_gblur!=0.0){
586
                src_filter.lumH= getGaussianVec(sws_lum_gblur, 3.0);
587
                src_filter.lumV= getGaussianVec(sws_lum_gblur, 3.0);
588
        }else{
589
                src_filter.lumH= getIdentityVec();
590
                src_filter.lumV= getIdentityVec();
591
        }
592

    
593
        if(sws_chr_gblur!=0.0){
594
                src_filter.chrH= getGaussianVec(sws_chr_gblur, 3.0);
595
                src_filter.chrV= getGaussianVec(sws_chr_gblur, 3.0);
596
        }else{
597
                src_filter.chrH= getIdentityVec();
598
                src_filter.chrV= getIdentityVec();
599
        }
600

    
601
        if(sws_chr_sharpen!=0.0){
602
                SwsVector *g= getConstVec(-1.0, 3);
603
                SwsVector *id= getConstVec(10.0/sws_chr_sharpen, 1);
604
                g->coeff[1]=2.0;
605
                addVec(id, g);
606
                convVec(src_filter.chrH, id);
607
                convVec(src_filter.chrV, id);
608
                freeVec(g);
609
                freeVec(id);
610
        }
611

    
612
        if(sws_lum_sharpen!=0.0){
613
                SwsVector *g= getConstVec(-1.0, 3);
614
                SwsVector *id= getConstVec(10.0/sws_lum_sharpen, 1);
615
                g->coeff[1]=2.0;
616
                addVec(id, g);
617
                convVec(src_filter.lumH, id);
618
                convVec(src_filter.lumV, id);
619
                freeVec(g);
620
                freeVec(id);
621
        }
622

    
623
        if(sws_chr_hshift)
624
                shiftVec(src_filter.chrH, sws_chr_hshift);
625

    
626
        if(sws_chr_vshift)
627
                shiftVec(src_filter.chrV, sws_chr_vshift);
628

    
629
        normalizeVec(src_filter.chrH, 1.0);
630
        normalizeVec(src_filter.chrV, 1.0);
631
        normalizeVec(src_filter.lumH, 1.0);
632
        normalizeVec(src_filter.lumV, 1.0);
633

    
634
        if(verbose > 1) printVec(src_filter.chrH);
635
        if(verbose > 1) printVec(src_filter.lumH);
636

    
637
        switch(sws_flags)
638
        {
639
                case 0: flags|= SWS_FAST_BILINEAR; break;
640
                case 1: flags|= SWS_BILINEAR; break;
641
                case 2: flags|= SWS_BICUBIC; break;
642
                case 3: flags|= SWS_X; break;
643
                case 4: flags|= SWS_POINT; break;
644
                case 5: flags|= SWS_AREA; break;
645
                default:flags|= SWS_BILINEAR; break;
646
        }
647

    
648
        return getSwsContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat, flags, &src_filter, NULL);
649
}
650

    
651

    
652
static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
653
                              int srcW, int dstW, int filterAlign, int one, int flags,
654
                              SwsVector *srcFilter, SwsVector *dstFilter)
655
{
656
        int i;
657
        int filterSize;
658
        int filter2Size;
659
        int minFilterSize;
660
        double *filter=NULL;
661
        double *filter2=NULL;
662
#ifdef ARCH_X86
663
        if(gCpuCaps.hasMMX)
664
                asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
665
#endif
666

    
667
        *filterPos = (int16_t*)memalign(8, (dstW+1)*sizeof(int16_t));
668
        (*filterPos)[dstW]=0; // the MMX scaler will read over the end 
669

    
670
        if(ABS(xInc - 0x10000) <10) // unscaled
671
        {
672
                int i;
673
                filterSize= 1;
674
                filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
675
                for(i=0; i<dstW*filterSize; i++) filter[i]=0;
676

    
677
                for(i=0; i<dstW; i++)
678
                {
679
                        filter[i*filterSize]=1;
680
                        (*filterPos)[i]=i;
681
                }
682

    
683
        }
684
        else if(flags&SWS_POINT) // lame looking point sampling mode
685
        {
686
                int i;
687
                int xDstInSrc;
688
                filterSize= 1;
689
                filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
690
                
691
                xDstInSrc= xInc/2 - 0x8000;
692
                for(i=0; i<dstW; i++)
693
                {
694
                        int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
695

    
696
                        (*filterPos)[i]= xx;
697
                        filter[i]= 1.0;
698
                        xDstInSrc+= xInc;
699
                }
700
        }
701
        else if(xInc <= (1<<16) || (flags&SWS_FAST_BILINEAR)) // upscale
702
        {
703
                int i;
704
                int xDstInSrc;
705
                if     (flags&SWS_BICUBIC) filterSize= 4;
706
                else if(flags&SWS_X      ) filterSize= 4;
707
                else                           filterSize= 2; // SWS_BILINEAR / SWS_AREA 
708
//                printf("%d %d %d\n", filterSize, srcW, dstW);
709
                filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
710

    
711
                xDstInSrc= xInc/2 - 0x8000;
712
                for(i=0; i<dstW; i++)
713
                {
714
                        int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
715
                        int j;
716

    
717
                        (*filterPos)[i]= xx;
718
                        if((flags & SWS_BICUBIC) || (flags & SWS_X))
719
                        {
720
                                double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
721
                                double y1,y2,y3,y4;
722
                                double A= -0.6;
723
                                if(flags & SWS_BICUBIC){
724
                                                // Equation is from VirtualDub
725
                                        y1 = (        +     A*d -       2.0*A*d*d +       A*d*d*d);
726
                                        y2 = (+ 1.0             -     (A+3.0)*d*d + (A+2.0)*d*d*d);
727
                                        y3 = (        -     A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d);
728
                                        y4 = (                  +           A*d*d -       A*d*d*d);
729
                                }else{
730
                                                // cubic interpolation (derived it myself)
731
                                        y1 = (    -2.0*d + 3.0*d*d - 1.0*d*d*d)/6.0;
732
                                        y2 = (6.0 -3.0*d - 6.0*d*d + 3.0*d*d*d)/6.0;
733
                                        y3 = (    +6.0*d + 3.0*d*d - 3.0*d*d*d)/6.0;
734
                                        y4 = (    -1.0*d           + 1.0*d*d*d)/6.0;
735
                                }
736

    
737
//                                printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
738
                                filter[i*filterSize + 0]= y1;
739
                                filter[i*filterSize + 1]= y2;
740
                                filter[i*filterSize + 2]= y3;
741
                                filter[i*filterSize + 3]= y4;
742
//                                printf("%1.3f %1.3f %1.3f %1.3f %1.3f\n",d , y1, y2, y3, y4);
743
                        }
744
                        else
745
                        {
746
                                //Bilinear upscale / linear interpolate / Area averaging
747
                                for(j=0; j<filterSize; j++)
748
                                {
749
                                        double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
750
                                        double coeff= 1.0 - d;
751
                                        if(coeff<0) coeff=0;
752
        //                                printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
753
                                        filter[i*filterSize + j]= coeff;
754
                                        xx++;
755
                                }
756
                        }
757
                        xDstInSrc+= xInc;
758
                }
759
        }
760
        else // downscale
761
        {
762
                int xDstInSrc;
763
                if(flags&SWS_BICUBIC)        filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
764
                else if(flags&SWS_X)        filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
765
                else if(flags&SWS_AREA)        filterSize= (int)ceil(1 + 1.0*srcW / (double)dstW);
766
                else /* BILINEAR */        filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW);
767
//                printf("%d %d %d\n", *filterSize, srcW, dstW);
768
                filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
769

    
770
                xDstInSrc= xInc/2 - 0x8000;
771
                for(i=0; i<dstW; i++)
772
                {
773
                        int xx= (int)((double)xDstInSrc/(double)(1<<16) - (filterSize-1)*0.5 + 0.5);
774
                        int j;
775
                        (*filterPos)[i]= xx;
776
                        for(j=0; j<filterSize; j++)
777
                        {
778
                                double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
779
                                double coeff;
780
                                if((flags & SWS_BICUBIC) || (flags & SWS_X))
781
                                {
782
                                        double A= -0.75;
783
//                                        d*=2;
784
                                        // Equation is from VirtualDub
785
                                        if(d<1.0)
786
                                                coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
787
                                        else if(d<2.0)
788
                                                coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
789
                                        else
790
                                                coeff=0.0;
791
                                }
792
                                else if(flags & SWS_AREA)
793
                                {
794
                                        double srcPixelSize= (1<<16)/(double)xInc;
795
                                        if(d + srcPixelSize/2 < 0.5) coeff= 1.0;
796
                                        else if(d - srcPixelSize/2 < 0.5) coeff= (0.5-d)/srcPixelSize + 0.5;
797
                                        else coeff=0.0;
798
                                }
799
                                else
800
                                {
801
                                        coeff= 1.0 - d;
802
                                        if(coeff<0) coeff=0;
803
                                }
804
//                                printf("%1.3f %2.3f %d \n", coeff, d, xDstInSrc);
805
                                filter[i*filterSize + j]= coeff;
806
                                xx++;
807
                        }
808
                        xDstInSrc+= xInc;
809
                }
810
        }
811

    
812
        /* apply src & dst Filter to filter -> filter2
813
           free(filter);
814
        */
815
        filter2Size= filterSize;
816
        if(srcFilter) filter2Size+= srcFilter->length - 1;
817
        if(dstFilter) filter2Size+= dstFilter->length - 1;
818
        filter2= (double*)memalign(8, filter2Size*dstW*sizeof(double));
819

    
820
        for(i=0; i<dstW; i++)
821
        {
822
                int j;
823
                SwsVector scaleFilter;
824
                SwsVector *outVec;
825

    
826
                scaleFilter.coeff= filter + i*filterSize;
827
                scaleFilter.length= filterSize;
828

    
829
                if(srcFilter) outVec= getConvVec(srcFilter, &scaleFilter);
830
                else              outVec= &scaleFilter;
831

    
832
                ASSERT(outVec->length == filter2Size)
833
                //FIXME dstFilter
834

    
835
                for(j=0; j<outVec->length; j++)
836
                {
837
                        filter2[i*filter2Size + j]= outVec->coeff[j];
838
                }
839

    
840
                (*filterPos)[i]+= (filterSize-1)/2 - (filter2Size-1)/2;
841

    
842
                if(outVec != &scaleFilter) freeVec(outVec);
843
        }
844
        free(filter); filter=NULL;
845

    
846
        /* try to reduce the filter-size (step1 find size and shift left) */
847
        // Assume its near normalized (*0.5 or *2.0 is ok but * 0.001 is not)
848
        minFilterSize= 0;
849
        for(i=dstW-1; i>=0; i--)
850
        {
851
                int min= filter2Size;
852
                int j;
853
                double cutOff=0.0;
854

    
855
                /* get rid off near zero elements on the left by shifting left */
856
                for(j=0; j<filter2Size; j++)
857
                {
858
                        int k;
859
                        cutOff += ABS(filter2[i*filter2Size]);
860

    
861
                        if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
862

    
863
                        /* preserve Monotonicity because the core cant handle the filter otherwise */
864
                        if(i<dstW-1 && (*filterPos)[i] >= (*filterPos)[i+1]) break;
865

    
866
                        // Move filter coeffs left
867
                        for(k=1; k<filter2Size; k++)
868
                                filter2[i*filter2Size + k - 1]= filter2[i*filter2Size + k];
869
                        filter2[i*filter2Size + k - 1]= 0.0;
870
                        (*filterPos)[i]++;
871
                }
872

    
873
                cutOff=0.0;
874
                /* count near zeros on the right */
875
                for(j=filter2Size-1; j>0; j--)
876
                {
877
                        cutOff += ABS(filter2[i*filter2Size + j]);
878

    
879
                        if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
880
                        min--;
881
                }
882

    
883
                if(min>minFilterSize) minFilterSize= min;
884
        }
885

    
886
        filterSize= (minFilterSize +(filterAlign-1)) & (~(filterAlign-1));
887
        filter= (double*)memalign(8, filterSize*dstW*sizeof(double));
888
        *outFilterSize= filterSize;
889

    
890
        if((flags&SWS_PRINT_INFO) && verbose)
891
                printf("SwScaler: reducing / aligning filtersize %d -> %d\n", filter2Size, filterSize);
892
        /* try to reduce the filter-size (step2 reduce it) */
893
        for(i=0; i<dstW; i++)
894
        {
895
                int j;
896

    
897
                for(j=0; j<filterSize; j++)
898
                {
899
                        if(j>=filter2Size) filter[i*filterSize + j]= 0.0;
900
                        else                   filter[i*filterSize + j]= filter2[i*filter2Size + j];
901
                }
902
        }
903
        free(filter2); filter2=NULL;
904
        
905
        ASSERT(filterSize > 0)
906

    
907
        //FIXME try to align filterpos if possible
908

    
909
        //fix borders
910
        for(i=0; i<dstW; i++)
911
        {
912
                int j;
913
                if((*filterPos)[i] < 0)
914
                {
915
                        // Move filter coeffs left to compensate for filterPos
916
                        for(j=1; j<filterSize; j++)
917
                        {
918
                                int left= MAX(j + (*filterPos)[i], 0);
919
                                filter[i*filterSize + left] += filter[i*filterSize + j];
920
                                filter[i*filterSize + j]=0;
921
                        }
922
                        (*filterPos)[i]= 0;
923
                }
924

    
925
                if((*filterPos)[i] + filterSize > srcW)
926
                {
927
                        int shift= (*filterPos)[i] + filterSize - srcW;
928
                        // Move filter coeffs right to compensate for filterPos
929
                        for(j=filterSize-2; j>=0; j--)
930
                        {
931
                                int right= MIN(j + shift, filterSize-1);
932
                                filter[i*filterSize +right] += filter[i*filterSize +j];
933
                                filter[i*filterSize +j]=0;
934
                        }
935
                        (*filterPos)[i]= srcW - filterSize;
936
                }
937
        }
938

    
939
        // Note the +1 is for the MMXscaler which reads over the end
940
        *outFilter= (int16_t*)memalign(8, *outFilterSize*(dstW+1)*sizeof(int16_t));
941
        memset(*outFilter, 0, *outFilterSize*(dstW+1)*sizeof(int16_t));
942

    
943
        /* Normalize & Store in outFilter */
944
        for(i=0; i<dstW; i++)
945
        {
946
                int j;
947
                double sum=0;
948
                double scale= one;
949
                for(j=0; j<filterSize; j++)
950
                {
951
                        sum+= filter[i*filterSize + j];
952
                }
953
                scale/= sum;
954
                for(j=0; j<filterSize; j++)
955
                {
956
                        (*outFilter)[i*(*outFilterSize) + j]= (int)(filter[i*filterSize + j]*scale);
957
                }
958
        }
959

    
960
        free(filter);
961
}
962

    
963
#ifdef ARCH_X86
964
static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
965
{
966
        uint8_t *fragment;
967
        int imm8OfPShufW1;
968
        int imm8OfPShufW2;
969
        int fragmentLength;
970

    
971
        int xpos, i;
972

    
973
        // create an optimized horizontal scaling routine
974

    
975
        //code fragment
976

    
977
        asm volatile(
978
                "jmp 9f                                \n\t"
979
        // Begin
980
                "0:                                \n\t"
981
                "movq (%%esi), %%mm0                \n\t" //FIXME Alignment
982
                "movq %%mm0, %%mm1                \n\t"
983
                "psrlq $8, %%mm0                \n\t"
984
                "punpcklbw %%mm7, %%mm1        \n\t"
985
                "movq %%mm2, %%mm3                \n\t"
986
                "punpcklbw %%mm7, %%mm0        \n\t"
987
                "addw %%bx, %%cx                \n\t" //2*xalpha += (4*lumXInc)&0xFFFF
988
                "pshufw $0xFF, %%mm1, %%mm1        \n\t"
989
                "1:                                \n\t"
990
                "adcl %%edx, %%esi                \n\t" //xx+= (4*lumXInc)>>16 + carry
991
                "pshufw $0xFF, %%mm0, %%mm0        \n\t"
992
                "2:                                \n\t"
993
                "psrlw $9, %%mm3                \n\t"
994
                "psubw %%mm1, %%mm0                \n\t"
995
                "pmullw %%mm3, %%mm0                \n\t"
996
                "paddw %%mm6, %%mm2                \n\t" // 2*alpha += xpos&0xFFFF
997
                "psllw $7, %%mm1                \n\t"
998
                "paddw %%mm1, %%mm0                \n\t"
999

    
1000
                "movq %%mm0, (%%edi, %%eax)        \n\t"
1001

    
1002
                "addl $8, %%eax                        \n\t"
1003
        // End
1004
                "9:                                \n\t"
1005
//                "int $3\n\t"
1006
                "leal 0b, %0                        \n\t"
1007
                "leal 1b, %1                        \n\t"
1008
                "leal 2b, %2                        \n\t"
1009
                "decl %1                        \n\t"
1010
                "decl %2                        \n\t"
1011
                "subl %0, %1                        \n\t"
1012
                "subl %0, %2                        \n\t"
1013
                "leal 9b, %3                        \n\t"
1014
                "subl %0, %3                        \n\t"
1015
                :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
1016
                "=r" (fragmentLength)
1017
        );
1018

    
1019
        xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
1020

    
1021
        for(i=0; i<dstW/8; i++)
1022
        {
1023
                int xx=xpos>>16;
1024

    
1025
                if((i&3) == 0)
1026
                {
1027
                        int a=0;
1028
                        int b=((xpos+xInc)>>16) - xx;
1029
                        int c=((xpos+xInc*2)>>16) - xx;
1030
                        int d=((xpos+xInc*3)>>16) - xx;
1031

    
1032
                        memcpy(funnyCode + fragmentLength*i/4, fragment, fragmentLength);
1033

    
1034
                        funnyCode[fragmentLength*i/4 + imm8OfPShufW1]=
1035
                        funnyCode[fragmentLength*i/4 + imm8OfPShufW2]=
1036
                                a | (b<<2) | (c<<4) | (d<<6);
1037

    
1038
                        // if we dont need to read 8 bytes than dont :), reduces the chance of
1039
                        // crossing a cache line
1040
                        if(d<3) funnyCode[fragmentLength*i/4 + 1]= 0x6E;
1041

    
1042
                        funnyCode[fragmentLength*(i+4)/4]= RET;
1043
                }
1044
                xpos+=xInc;
1045
        }
1046
}
1047
#endif // ARCH_X86
1048

    
1049
//FIXME remove
1050
/**
 * Legacy initialisation entry point.
 *
 * Intentionally does nothing; it only exists so that callers which still
 * reference it keep linking (see the FIXME above).
 */
void SwScale_Init(){
    /* nothing to set up here */
    return;
}
1052

    
1053
static void globalInit(){
1054
    // generating tables:
1055
    int i;
1056
    for(i=0; i<768; i++){
1057
        int c= MIN(MAX(i-256, 0), 255);
1058
        clip_table[i]=c;
1059
        yuvtab_2568[c]= clip_yuvtab_2568[i]=(0x2568*(c-16))+(256<<13);
1060
        yuvtab_3343[c]= clip_yuvtab_3343[i]=0x3343*(c-128);
1061
        yuvtab_0c92[c]= clip_yuvtab_0c92[i]=-0x0c92*(c-128);
1062
        yuvtab_1a1e[c]= clip_yuvtab_1a1e[i]=-0x1a1e*(c-128);
1063
        yuvtab_40cf[c]= clip_yuvtab_40cf[i]=0x40cf*(c-128);
1064
    }
1065

    
1066
    for(i=0; i<768; i++)
1067
    {
1068
        int v= clip_table[i];
1069
        clip_table16b[i]= v>>3;
1070
        clip_table16g[i]= (v<<3)&0x07E0;
1071
        clip_table16r[i]= (v<<8)&0xF800;
1072
        clip_table15b[i]= v>>3;
1073
        clip_table15g[i]= (v<<2)&0x03E0;
1074
        clip_table15r[i]= (v<<7)&0x7C00;
1075
    }
1076

    
1077
cpuCaps= gCpuCaps;
1078

    
1079
#ifdef RUNTIME_CPUDETECT
1080
#ifdef CAN_COMPILE_X86_ASM
1081
        // ordered per speed fasterst first
1082
        if(gCpuCaps.hasMMX2)
1083
                swScale= swScale_MMX2;
1084
        else if(gCpuCaps.has3DNow)
1085
                swScale= swScale_3DNow;
1086
        else if(gCpuCaps.hasMMX)
1087
                swScale= swScale_MMX;
1088
        else
1089
                swScale= swScale_C;
1090

    
1091
#else
1092
        swScale= swScale_C;
1093
        cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1094
#endif
1095
#else //RUNTIME_CPUDETECT
1096
#ifdef HAVE_MMX2
1097
        swScale= swScale_MMX2;
1098
        cpuCaps.has3DNow = 0;
1099
#elif defined (HAVE_3DNOW)
1100
        swScale= swScale_3DNow;
1101
        cpuCaps.hasMMX2 = 0;
1102
#elif defined (HAVE_MMX)
1103
        swScale= swScale_MMX;
1104
        cpuCaps.hasMMX2 = cpuCaps.has3DNow = 0;
1105
#else
1106
        swScale= swScale_C;
1107
        cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1108
#endif
1109
#endif //!RUNTIME_CPUDETECT
1110
}
1111

    
1112
/* Wrapper functions for yuv2bgr */
1113
/**
 * Unscaled planar YUV -> BGR conversion of one slice via yuv2rgb().
 *
 * The output starts srcSliceY rows (of dstStride[0] bytes) into dstParam[0].
 * For YV12 input the second and third source planes are handed to yuv2rgb()
 * in their given order; for every other format reaching this function
 * (I420 & IYUV) they are swapped. srcStride[1] is passed as the stride for
 * both of those planes.
 */
static void planarYuvToBgr(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dstParam[], int dstStride[]){
        const int swapPlanes= (c->srcFormat!=IMGFMT_YV12); /* I420 & IYUV */
        uint8_t *firstChroma = swapPlanes ? src[2] : src[1];
        uint8_t *secondChroma= swapPlanes ? src[1] : src[2];
        uint8_t *out= dstParam[0] + dstStride[0]*srcSliceY;

        yuv2rgb( out,src[0],firstChroma,secondChroma,c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
}
1122

    
1123
/* unscaled copy like stuff (assumes nearly identical formats) */
1124
/**
 * Unscaled copy of one slice between (nearly) identical formats.
 *
 * Packed formats are copied from plane 0 only. Everything else is treated as
 * three planes, where planes 1 and 2 use half the width / height (rounded up)
 * of plane 0. I420 sources and destinations have planes 1 and 2 exchanged
 * relative to the internal order, including their strides.
 *
 * Changes compared to the previous version:
 *  - the leftover debug printf in the planar loop is gone,
 *  - an I420 destination now uses the stride that belongs to the swapped
 *    plane pointer (previously only the pointers were swapped),
 *  - planes without a source or destination buffer (e.g. the chroma planes
 *    of gray input, which are set to NULL below) are skipped instead of
 *    being passed to memcpy,
 *  - src[] / srcStride[] are initialised, so an unexpected source format
 *    results in nothing being copied rather than in reading garbage.
 *
 * @param c              context; srcFormat, dstFormat and srcW are read
 * @param srcParam       source plane pointers in the caller's plane order
 * @param srcStrideParam source strides in bytes, same order as srcParam
 * @param srcSliceY      first row of the slice within the destination
 * @param srcSliceH      number of rows in the slice
 * @param dstParam       destination plane pointers in the caller's plane order
 * @param dstStride      destination strides in bytes, same order as dstParam
 */
static void simpleCopy(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
             int srcSliceH, uint8_t* dstParam[], int dstStride[]){

        int srcStride[3]= {0, 0, 0};
        uint8_t *src[3]= {NULL, NULL, NULL};
        uint8_t *dst[3];
        const int dstIsI420= (c->dstFormat == IMGFMT_I420);

        if(c->srcFormat == IMGFMT_I420){
                src[0]= srcParam[0];
                src[1]= srcParam[2];
                src[2]= srcParam[1];
                srcStride[0]= srcStrideParam[0];
                srcStride[1]= srcStrideParam[2];
                srcStride[2]= srcStrideParam[1];
        }
        else if(c->srcFormat==IMGFMT_YV12){
                src[0]= srcParam[0];
                src[1]= srcParam[1];
                src[2]= srcParam[2];
                srcStride[0]= srcStrideParam[0];
                srcStride[1]= srcStrideParam[1];
                srcStride[2]= srcStrideParam[2];
        }
        else if(isPacked(c->srcFormat) || isGray(c->srcFormat)){
                /* single plane input: only entry 0 of the caller's arrays is read */
                src[0]= srcParam[0];
                srcStride[0]= srcStrideParam[0];
        }

        if(dstIsI420){
                dst[0]= dstParam[0];
                dst[1]= dstParam[2];
                dst[2]= dstParam[1];
        }else{
                dst[0]= dstParam[0];
                dst[1]= dstParam[1];
                dst[2]= dstParam[2];
        }

        if(isPacked(c->srcFormat))
        {
                if(dstStride[0]==srcStride[0])
                        memcpy(dst[0] + dstStride[0]*srcSliceY, src[0], srcSliceH*dstStride[0]);
                else
                {
                        int i;
                        uint8_t *srcPtr= src[0];
                        uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
                        int length;

                        /* bytes per row for the supported packed formats */
                        if(c->srcFormat==IMGFMT_YUY2)                 length= c->srcW*2;
                        else if(c->srcFormat==IMGFMT_BGR15)         length= c->srcW*2;
                        else if(c->srcFormat==IMGFMT_BGR16)         length= c->srcW*2;
                        else if(c->srcFormat==IMGFMT_BGR24)         length= c->srcW*3;
                        else if(c->srcFormat==IMGFMT_BGR32)         length= c->srcW*4;
                        else return; /* that shouldnt happen */

                        for(i=0; i<srcSliceH; i++)
                        {
                                memcpy(dstPtr, srcPtr, length);
                                srcPtr+= srcStride[0];
                                dstPtr+= dstStride[0];
                        }
                }
        }
        else
        { /* Planar YUV (or gray, which only has plane 0) */
                int plane;
                for(plane=0; plane<3; plane++)
                {
                        int length= plane==0 ? c->srcW  : ((c->srcW+1)>>1);
                        int y=      plane==0 ? srcSliceY: ((srcSliceY+1)>>1);
                        int height= plane==0 ? srcSliceH: ((srcSliceH+1)>>1);
                        int dstPlaneStride;

                        /* nothing to copy from / to for this plane */
                        if(src[plane]==NULL || dst[plane]==NULL) continue;

                        /* dst[1] / dst[2] are exchanged for I420, so their strides
                           have to be taken from the exchanged index as well; the
                           stride is only read for planes that are really copied */
                        dstPlaneStride= dstStride[(dstIsI420 && plane>0) ? 3-plane : plane];

                        if(dstPlaneStride==srcStride[plane])
                                memcpy(dst[plane] + dstPlaneStride*y, src[plane], height*dstPlaneStride);
                        else
                        {
                                int i;
                                uint8_t *srcPtr= src[plane];
                                uint8_t *dstPtr= dst[plane] + dstPlaneStride*y;
                                for(i=0; i<height; i++)
                                {
                                        memcpy(dstPtr, srcPtr, length);
                                        srcPtr+= srcStride[plane];
                                        dstPtr+= dstPlaneStride;
                                }
                        }
                }
        }
}
1219

    
1220
SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
1221
                         SwsFilter *srcFilter, SwsFilter *dstFilter){
1222

    
1223
        SwsContext *c;
1224
        int i;
1225
        int usesFilter;
1226
        SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
1227

    
1228
#ifdef ARCH_X86
1229
        if(gCpuCaps.hasMMX)
1230
                asm volatile("emms\n\t"::: "memory");
1231
#endif
1232

    
1233
        if(swScale==NULL) globalInit();
1234

    
1235
        /* avoid dupplicate Formats, so we dont need to check to much */
1236
        if(srcFormat==IMGFMT_IYUV) srcFormat=IMGFMT_I420;
1237
        if(srcFormat==IMGFMT_Y8)   srcFormat=IMGFMT_Y800;
1238
        if(dstFormat==IMGFMT_Y8)   dstFormat=IMGFMT_Y800;
1239

    
1240
        if(!isSupportedIn(srcFormat)) 
1241
        {
1242
                fprintf(stderr, "swScaler: %s is not supported as input format\n", vo_format_name(srcFormat));
1243
                return NULL;
1244
        }
1245
        if(!isSupportedOut(dstFormat))
1246
        {
1247
                fprintf(stderr, "swScaler: %s is not supported as output format\n", vo_format_name(dstFormat));
1248
                return NULL;
1249
        }
1250

    
1251
        /* sanity check */
1252
        if(srcW<4 || srcH<1 || dstW<8 || dstH<1) //FIXME check if these are enough and try to lowwer them after fixing the relevant parts of the code
1253
        {
1254
                fprintf(stderr, "swScaler: %dx%d -> %dx%d is invalid scaling dimension\n", 
1255
                        srcW, srcH, dstW, dstH);
1256
                return NULL;
1257
        }
1258

    
1259
        if(!dstFilter) dstFilter= &dummyFilter;
1260
        if(!srcFilter) srcFilter= &dummyFilter;
1261

    
1262
        c= memalign(64, sizeof(SwsContext));
1263
        memset(c, 0, sizeof(SwsContext));
1264

    
1265
        c->srcW= srcW;
1266
        c->srcH= srcH;
1267
        c->dstW= dstW;
1268
        c->dstH= dstH;
1269
        c->lumXInc= ((srcW<<16) + (dstW>>1))/dstW;
1270
        c->lumYInc= ((srcH<<16) + (dstH>>1))/dstH;
1271
        c->flags= flags;
1272
        c->dstFormat= dstFormat;
1273
        c->srcFormat= srcFormat;
1274

    
1275
        usesFilter=0;
1276
        if(dstFilter->lumV!=NULL && dstFilter->lumV->length>1) usesFilter=1;
1277
        if(dstFilter->lumH!=NULL && dstFilter->lumH->length>1) usesFilter=1;
1278
        if(dstFilter->chrV!=NULL && dstFilter->chrV->length>1) usesFilter=1;
1279
        if(dstFilter->chrH!=NULL && dstFilter->chrH->length>1) usesFilter=1;
1280
        if(srcFilter->lumV!=NULL && srcFilter->lumV->length>1) usesFilter=1;
1281
        if(srcFilter->lumH!=NULL && srcFilter->lumH->length>1) usesFilter=1;
1282
        if(srcFilter->chrV!=NULL && srcFilter->chrV->length>1) usesFilter=1;
1283
        if(srcFilter->chrH!=NULL && srcFilter->chrH->length>1) usesFilter=1;
1284
        
1285
        /* special Cases */
1286
        if(srcW==dstW && srcH==dstH && !usesFilter)
1287
        {
1288
                /* yuv2bgr */
1289
                if(isPlanarYUV(srcFormat) && isBGR(dstFormat))
1290
                {
1291
                        // FIXME multiple yuv2rgb converters wont work that way cuz that thing is full of globals&statics
1292
                        yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
1293
                        c->swScale= planarYuvToBgr;
1294

    
1295
                        if(flags&SWS_PRINT_INFO)
1296
                                printf("SwScaler: using unscaled %s -> %s special converter\n", 
1297
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1298
                        return c;
1299
                }
1300

    
1301
                /* simple copy */
1302
                if(srcFormat == dstFormat || (isPlanarYUV(srcFormat) && isPlanarYUV(dstFormat)))
1303
                {
1304
                        c->swScale= simpleCopy;
1305

    
1306
                        if(flags&SWS_PRINT_INFO)
1307
                                printf("SwScaler: using unscaled %s -> %s special converter\n", 
1308
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1309
                        return c;
1310
                }
1311
        }
1312

    
1313
        if(cpuCaps.hasMMX2)
1314
        {
1315
                c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
1316
                if(!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR))
1317
                {
1318
                        if(flags&SWS_PRINT_INFO)
1319
                                fprintf(stderr, "SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
1320
                }
1321
        }
1322
        else
1323
                c->canMMX2BeUsed=0;
1324

    
1325

    
1326
        /* dont use full vertical UV input/internaly if the source doesnt even have it */
1327
        if(isHalfChrV(srcFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_V);
1328
        /* dont use full horizontal UV input if the source doesnt even have it */
1329
        if(isHalfChrH(srcFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_H_INP);
1330
        /* dont use full horizontal UV internally if the destination doesnt even have it */
1331
        if(isHalfChrH(dstFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_H_INT);
1332

    
1333
        if(flags&SWS_FULL_CHR_H_INP)        c->chrSrcW= srcW;
1334
        else                                c->chrSrcW= (srcW+1)>>1;
1335

    
1336
        if(flags&SWS_FULL_CHR_H_INT)        c->chrDstW= dstW;
1337
        else                                c->chrDstW= (dstW+1)>>1;
1338

    
1339
        if(flags&SWS_FULL_CHR_V)        c->chrSrcH= srcH;
1340
        else                                c->chrSrcH= (srcH+1)>>1;
1341

    
1342
        if(isHalfChrV(dstFormat))        c->chrDstH= (dstH+1)>>1;
1343
        else                                c->chrDstH= dstH;
1344

    
1345
        c->chrXInc= ((c->chrSrcW<<16) + (c->chrDstW>>1))/c->chrDstW;
1346
        c->chrYInc= ((c->chrSrcH<<16) + (c->chrDstH>>1))/c->chrDstH;
1347

    
1348

    
1349
        // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
1350
        // but only for the FAST_BILINEAR mode otherwise do correct scaling
1351
        // n-2 is the last chrominance sample available
1352
        // this is not perfect, but noone shuld notice the difference, the more correct variant
1353
        // would be like the vertical one, but that would require some special code for the
1354
        // first and last pixel
1355
        if(flags&SWS_FAST_BILINEAR)
1356
        {
1357
                if(c->canMMX2BeUsed)
1358
                {
1359
                        c->lumXInc+= 20;
1360
                        c->chrXInc+= 20;
1361
                }
1362
                //we dont use the x86asm scaler if mmx is available
1363
                else if(cpuCaps.hasMMX)
1364
                {
1365
                        c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
1366
                        c->chrXInc = ((c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20;
1367
                }
1368
        }
1369

    
1370
        /* precalculate horizontal scaler filter coefficients */
1371
        {
1372
                const int filterAlign= cpuCaps.hasMMX ? 4 : 1;
1373

    
1374
                initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
1375
                                 srcW      ,       dstW, filterAlign, 1<<14, flags,
1376
                                 srcFilter->lumH, dstFilter->lumH);
1377
                initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
1378
                                (srcW+1)>>1, c->chrDstW, filterAlign, 1<<14, flags,
1379
                                 srcFilter->chrH, dstFilter->chrH);
1380

    
1381
#ifdef ARCH_X86
1382
// cant downscale !!!
1383
                if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
1384
                {
1385
                        initMMX2HScaler(      dstW, c->lumXInc, c->funnyYCode);
1386
                        initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode);
1387
                }
1388
#endif
1389
        } // Init Horizontal stuff
1390

    
1391

    
1392

    
1393
        /* precalculate vertical scaler filter coefficients */
1394
        initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
1395
                        srcH      ,        dstH, 1, (1<<12)-4, flags,
1396
                        srcFilter->lumV, dstFilter->lumV);
1397
        initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
1398
                        (srcH+1)>>1, c->chrDstH, 1, (1<<12)-4, flags,
1399
                         srcFilter->chrV, dstFilter->chrV);
1400

    
1401
        // Calculate Buffer Sizes so that they wont run out while handling these damn slices
1402
        c->vLumBufSize= c->vLumFilterSize;
1403
        c->vChrBufSize= c->vChrFilterSize;
1404
        for(i=0; i<dstH; i++)
1405
        {
1406
                int chrI= i*c->chrDstH / dstH;
1407
                int nextSlice= MAX(c->vLumFilterPos[i   ] + c->vLumFilterSize - 1,
1408
                                 ((c->vChrFilterPos[chrI] + c->vChrFilterSize - 1)<<1));
1409
                nextSlice&= ~1; // Slices start at even boundaries
1410
                if(c->vLumFilterPos[i   ] + c->vLumBufSize < nextSlice)
1411
                        c->vLumBufSize= nextSlice - c->vLumFilterPos[i   ];
1412
                if(c->vChrFilterPos[chrI] + c->vChrBufSize < (nextSlice>>1))
1413
                        c->vChrBufSize= (nextSlice>>1) - c->vChrFilterPos[chrI];
1414
        }
1415

    
1416
        // allocate pixbufs (we use dynamic allocation because otherwise we would need to
1417
        c->lumPixBuf= (int16_t**)memalign(4, c->vLumBufSize*2*sizeof(int16_t*));
1418
        c->chrPixBuf= (int16_t**)memalign(4, c->vChrBufSize*2*sizeof(int16_t*));
1419
        //Note we need at least one pixel more at the end because of the mmx code (just in case someone wanna replace the 4000/8000)
1420
        for(i=0; i<c->vLumBufSize; i++)
1421
                c->lumPixBuf[i]= c->lumPixBuf[i+c->vLumBufSize]= (uint16_t*)memalign(8, 4000);
1422
        for(i=0; i<c->vChrBufSize; i++)
1423
                c->chrPixBuf[i]= c->chrPixBuf[i+c->vChrBufSize]= (uint16_t*)memalign(8, 8000);
1424

    
1425
        //try to avoid drawing green stuff between the right end and the stride end
1426
        for(i=0; i<c->vLumBufSize; i++) memset(c->lumPixBuf[i], 0, 4000);
1427
        for(i=0; i<c->vChrBufSize; i++) memset(c->chrPixBuf[i], 64, 8000);
1428

    
1429
        ASSERT(c->chrDstH <= dstH)
1430

    
1431
        // pack filter data for mmx code
1432
        if(cpuCaps.hasMMX)
1433
        {
1434
                c->lumMmxFilter= (int16_t*)memalign(8, c->vLumFilterSize*      dstH*4*sizeof(int16_t));
1435
                c->chrMmxFilter= (int16_t*)memalign(8, c->vChrFilterSize*c->chrDstH*4*sizeof(int16_t));
1436
                for(i=0; i<c->vLumFilterSize*dstH; i++)
1437
                        c->lumMmxFilter[4*i]=c->lumMmxFilter[4*i+1]=c->lumMmxFilter[4*i+2]=c->lumMmxFilter[4*i+3]=
1438
                                c->vLumFilter[i];
1439
                for(i=0; i<c->vChrFilterSize*c->chrDstH; i++)
1440
                        c->chrMmxFilter[4*i]=c->chrMmxFilter[4*i+1]=c->chrMmxFilter[4*i+2]=c->chrMmxFilter[4*i+3]=
1441
                                c->vChrFilter[i];
1442
        }
1443

    
1444
        if(flags&SWS_PRINT_INFO)
1445
        {
1446
#ifdef DITHER1XBPP
1447
                char *dither= " dithered";
1448
#else
1449
                char *dither= "";
1450
#endif
1451
                if(flags&SWS_FAST_BILINEAR)
1452
                        fprintf(stderr, "\nSwScaler: FAST_BILINEAR scaler, ");
1453
                else if(flags&SWS_BILINEAR)
1454
                        fprintf(stderr, "\nSwScaler: BILINEAR scaler, ");
1455
                else if(flags&SWS_BICUBIC)
1456
                        fprintf(stderr, "\nSwScaler: BICUBIC scaler, ");
1457
                else if(flags&SWS_X)
1458
                        fprintf(stderr, "\nSwScaler: Experimental scaler, ");
1459
                else if(flags&SWS_POINT)
1460
                        fprintf(stderr, "\nSwScaler: Nearest Neighbor / POINT scaler, ");
1461
                else if(flags&SWS_AREA)
1462
                        fprintf(stderr, "\nSwScaler: Area Averageing scaler, ");
1463
                else
1464
                        fprintf(stderr, "\nSwScaler: ehh flags invalid?! ");
1465

    
1466
                if(dstFormat==IMGFMT_BGR15 || dstFormat==IMGFMT_BGR16)
1467
                        fprintf(stderr, "from %s to%s %s ", 
1468
                                vo_format_name(srcFormat), dither, vo_format_name(dstFormat));
1469
                else
1470
                        fprintf(stderr, "from %s to %s ", 
1471
                                vo_format_name(srcFormat), vo_format_name(dstFormat));
1472

    
1473
                if(cpuCaps.hasMMX2)
1474
                        fprintf(stderr, "using MMX2\n");
1475
                else if(cpuCaps.has3DNow)
1476
                        fprintf(stderr, "using 3DNOW\n");
1477
                else if(cpuCaps.hasMMX)
1478
                        fprintf(stderr, "using MMX\n");
1479
                else
1480
                        fprintf(stderr, "using C\n");
1481
        }
1482

    
1483
        if((flags & SWS_PRINT_INFO) && verbose)
1484
        {
1485
                if(cpuCaps.hasMMX)
1486
                {
1487
                        if(c->canMMX2BeUsed && (flags&SWS_FAST_BILINEAR))
1488
                                printf("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
1489
                        else
1490
                        {
1491
                                if(c->hLumFilterSize==4)
1492
                                        printf("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
1493
                                else if(c->hLumFilterSize==8)
1494
                                        printf("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
1495
                                else
1496
                                        printf("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
1497

    
1498
                                if(c->hChrFilterSize==4)
1499
                                        printf("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
1500
                                else if(c->hChrFilterSize==8)
1501
                                        printf("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
1502
                                else
1503
                                        printf("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
1504
                        }
1505
                }
1506
                else
1507
                {
1508
#ifdef ARCH_X86
1509
                        printf("SwScaler: using X86-Asm scaler for horizontal scaling\n");
1510
#else
1511
                        if(flags & SWS_FAST_BILINEAR)
1512
                                printf("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
1513
                        else
1514
                                printf("SwScaler: using C scaler for horizontal scaling\n");
1515
#endif
1516
                }
1517
                if(isPlanarYUV(dstFormat))
1518
                {
1519
                        if(c->vLumFilterSize==1)
1520
                                printf("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
1521
                        else
1522
                                printf("SwScaler: using n-tap %s scaler for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
1523
                }
1524
                else
1525
                {
1526
                        if(c->vLumFilterSize==1 && c->vChrFilterSize==2)
1527
                                printf("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
1528
                                       "SwScaler:       2-tap scaler for vertical chrominance scaling (BGR)\n",cpuCaps.hasMMX ? "MMX" : "C");
1529
                        else if(c->vLumFilterSize==2 && c->vChrFilterSize==2)
1530
                                printf("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
1531
                        else
1532
                                printf("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
1533
                }
1534

    
1535
                if(dstFormat==IMGFMT_BGR24)
1536
                        printf("SwScaler: using %s YV12->BGR24 Converter\n",
1537
                                cpuCaps.hasMMX2 ? "MMX2" : (cpuCaps.hasMMX ? "MMX" : "C"));
1538
                else if(dstFormat==IMGFMT_BGR32)
1539
                        printf("SwScaler: using %s YV12->BGR32 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1540
                else if(dstFormat==IMGFMT_BGR16)
1541
                        printf("SwScaler: using %s YV12->BGR16 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1542
                else if(dstFormat==IMGFMT_BGR15)
1543
                        printf("SwScaler: using %s YV12->BGR15 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1544

    
1545
                printf("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
1546
        }
1547
        if((flags & SWS_PRINT_INFO) && verbose>1)
1548
        {
1549
                printf("SwScaler:Lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
1550
                        c->srcW, c->srcH, c->dstW, c->dstH, c->lumXInc, c->lumYInc);
1551
                printf("SwScaler:Chr srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
1552
                        c->chrSrcW, c->chrSrcH, c->chrDstW, c->chrDstH, c->chrXInc, c->chrYInc);
1553
        }
1554

    
1555
        c->swScale= swScale;
1556
        return c;
1557
}
1558

    
1559
/**
1560
 * returns a normalized gaussian curve used to filter stuff
1561
 * quality=3 is high quality, lowwer is lowwer quality
1562
 */
1563

    
1564
SwsVector *getGaussianVec(double variance, double quality){
1565
        const int length= (int)(variance*quality + 0.5) | 1;
1566
        int i;
1567
        double *coeff= memalign(sizeof(double), length*sizeof(double));
1568
        double middle= (length-1)*0.5;
1569
        SwsVector *vec= malloc(sizeof(SwsVector));
1570

    
1571
        vec->coeff= coeff;
1572
        vec->length= length;
1573

    
1574
        for(i=0; i<length; i++)
1575
        {
1576
                double dist= i-middle;
1577
                coeff[i]= exp( -dist*dist/(2*variance*variance) ) / sqrt(2*variance*PI);
1578
        }
1579

    
1580
        normalizeVec(vec, 1.0);
1581

    
1582
        return vec;
1583
}
1584

    
1585
SwsVector *getConstVec(double c, int length){
1586
        int i;
1587
        double *coeff= memalign(sizeof(double), length*sizeof(double));
1588
        SwsVector *vec= malloc(sizeof(SwsVector));
1589

    
1590
        vec->coeff= coeff;
1591
        vec->length= length;
1592

    
1593
        for(i=0; i<length; i++)
1594
                coeff[i]= c;
1595

    
1596
        return vec;
1597
}
1598

    
1599

    
1600
SwsVector *getIdentityVec(void){
1601
        double *coeff= memalign(sizeof(double), sizeof(double));
1602
        SwsVector *vec= malloc(sizeof(SwsVector));
1603
        coeff[0]= 1.0;
1604

    
1605
        vec->coeff= coeff;
1606
        vec->length= 1;
1607

    
1608
        return vec;
1609
}
1610

    
1611
/**
 * Rescales the coefficients of a in place so that their sum becomes "height".
 *
 * @param a      vector to normalize
 * @param height desired sum of all coefficients after the call
 *
 * Every coefficient is multiplied by height/sum, where sum is the current sum
 * of the coefficients. No special handling is done for a sum of zero.
 */
void normalizeVec(SwsVector *a, double height){
        int i;
        double sum=0;
        double inv;

        for(i=0; i<a->length; i++)
                sum+= a->coeff[i];

        inv= height/sum;

        // multiply by the ratio, not by height itself, otherwise the vector
        // is merely scaled and the resulting sum is height*sum
        for(i=0; i<a->length; i++)
                a->coeff[i]*= inv;
}
1624

    
1625
/**
 * Multiplies every coefficient of a by scalar, in place.
 */
void scaleVec(SwsVector *a, double scalar){
        int k= 0;

        while(k<a->length)
        {
                a->coeff[k]*= scalar;
                k++;
        }
}
1631

    
1632
/**
 * Returns a newly allocated vector holding the convolution of a and b.
 * The result has a->length + b->length - 1 coefficients; the inputs are not
 * modified. Allocation results are not checked.
 */
static SwsVector *getConvVec(SwsVector *a, SwsVector *b){
        const int outLen= a->length + b->length - 1;
        double *out= memalign(sizeof(double), outLen*sizeof(double));
        SwsVector *result= malloc(sizeof(SwsVector));
        int ai, bi, k;

        result->length= outLen;
        result->coeff= out;

        for(k=0; k<outLen; k++)
                out[k]= 0.0;

        // each product a[ai]*b[bi] is accumulated into output tap ai+bi
        for(ai=0; ai<a->length; ai++)
                for(bi=0; bi<b->length; bi++)
                        out[ai+bi]+= a->coeff[ai]*b->coeff[bi];

        return result;
}
1653

    
1654
/**
 * Returns a newly allocated vector containing a + b.
 * The result is as long as the longer input; each input is placed so that its
 * middle element lines up with the middle of the result before being added.
 * Allocation results are not checked.
 */
static SwsVector *sumVec(SwsVector *a, SwsVector *b){
        const int length= MAX(a->length, b->length);
        const int center= (length-1)/2;
        const int aOffset= center - (a->length-1)/2; // where a[0] lands in the result
        const int bOffset= center - (b->length-1)/2; // where b[0] lands in the result
        double *coeff= memalign(sizeof(double), length*sizeof(double));
        SwsVector *vec= malloc(sizeof(SwsVector));
        int k;

        vec->length= length;
        vec->coeff= coeff;

        for(k=0; k<length; k++)
                coeff[k]= 0.0;

        for(k=0; k<a->length; k++)
                coeff[k + aOffset]+= a->coeff[k];

        for(k=0; k<b->length; k++)
                coeff[k + bOffset]+= b->coeff[k];

        return vec;
}
1670

    
1671
/**
 * Returns a newly allocated vector containing a - b.
 * The result is as long as the longer input; each input is placed so that its
 * middle element lines up with the middle of the result, then a is added and
 * b is subtracted. Allocation results are not checked.
 */
static SwsVector *diffVec(SwsVector *a, SwsVector *b){
        const int length= MAX(a->length, b->length);
        const int center= (length-1)/2;
        const int aOffset= center - (a->length-1)/2; // where a[0] lands in the result
        const int bOffset= center - (b->length-1)/2; // where b[0] lands in the result
        double *coeff= memalign(sizeof(double), length*sizeof(double));
        SwsVector *vec= malloc(sizeof(SwsVector));
        int k;

        vec->length= length;
        vec->coeff= coeff;

        for(k=0; k<length; k++)
                coeff[k]= 0.0;

        for(k=0; k<a->length; k++)
                coeff[k + aOffset]+= a->coeff[k];

        for(k=0; k<b->length; k++)
                coeff[k + bOffset]-= b->coeff[k];

        return vec;
}
1687

    
1688
/* shift left / or right if "shift" is negative */
1689
static SwsVector *getShiftedVec(SwsVector *a, int shift){
1690
        int length= a->length + ABS(shift)*2;
1691
        double *coeff= memalign(sizeof(double), length*sizeof(double));
1692
        int i;
1693
        SwsVector *vec= malloc(sizeof(SwsVector));
1694

    
1695
        vec->coeff= coeff;
1696
        vec->length= length;
1697

    
1698
        for(i=0; i<length; i++) coeff[i]= 0.0;
1699

    
1700
        for(i=0; i<a->length; i++)
1701
        {
1702
                coeff[i + (length-1)/2 - (a->length-1)/2 - shift]= a->coeff[i];
1703
        }
1704

    
1705
        return vec;
1706
}
1707

    
1708
/**
 * Replaces the contents of a with its shifted version (see getShiftedVec).
 * The old coefficient array is freed and a takes ownership of the new one.
 */
void shiftVec(SwsVector *a, int shift){
        SwsVector *tmp= getShiftedVec(a, shift);
        double *old= a->coeff;

        a->length= tmp->length;
        a->coeff= tmp->coeff;

        // only the temporary's struct is released; its array now belongs to a
        free(tmp);
        free(old);
}
1715

    
1716
/**
 * Adds b to a in place: a is replaced by the result of sumVec(a, b).
 * The old coefficient array of a is freed; b is left untouched.
 */
void addVec(SwsVector *a, SwsVector *b){
        SwsVector *tmp= sumVec(a, b);
        double *old= a->coeff;

        a->length= tmp->length;
        a->coeff= tmp->coeff;

        // only the temporary's struct is released; its array now belongs to a
        free(tmp);
        free(old);
}
1723

    
1724
/**
 * Subtracts b from a in place: a is replaced by the result of diffVec(a, b).
 * The old coefficient array of a is freed; b is left untouched.
 */
void subVec(SwsVector *a, SwsVector *b){
        SwsVector *tmp= diffVec(a, b);
        double *old= a->coeff;

        a->length= tmp->length;
        a->coeff= tmp->coeff;

        // only the temporary's struct is released; its array now belongs to a
        free(tmp);
        free(old);
}
1731

    
1732
/**
 * Convolves a with b in place: a is replaced by the result of getConvVec(a, b).
 * The old coefficient array of a is freed; b is left untouched.
 */
void convVec(SwsVector *a, SwsVector *b){
        SwsVector *tmp= getConvVec(a, b);
        double *old= a->coeff;

        a->length= tmp->length;
        a->coeff= tmp->coeff;

        // only the temporary's struct is released; its array now belongs to a
        free(tmp);
        free(old);
}
1739

    
1740
/**
 * Returns a newly allocated deep copy of a (same length, same coefficients).
 * Allocation results are not checked.
 */
SwsVector *cloneVec(SwsVector *a){
        double *copy= memalign(sizeof(double), a->length*sizeof(double));
        SwsVector *vec= malloc(sizeof(SwsVector));
        int k= 0;

        vec->length= a->length;
        vec->coeff= copy;

        while(k<a->length)
        {
                copy[k]= a->coeff[k];
                k++;
        }

        return vec;
}
1752

    
1753
/**
 * Prints the coefficients of a to stdout, one per line, each followed by a
 * crude bar plot: a run of spaces ending in '|', whose width (0..60) is
 * proportional to the coefficient's position between min and max.
 *
 * min and max both start at 0, so the plotted range always includes zero.
 * If the range is empty (e.g. all coefficients are zero) or a bar position is
 * not a usable number, the bar is drawn with width 0 instead of converting a
 * non-finite value to int.
 */
void printVec(SwsVector *a){
        int i;
        double max=0;
        double min=0;
        double range;

        for(i=0; i<a->length; i++)
        {
                if(a->coeff[i]>max) max= a->coeff[i];
                if(a->coeff[i]<min) min= a->coeff[i];
        }

        range= max - min;

        for(i=0; i<a->length; i++)
        {
                int x= 0;

                if(range>0)
                {
                        const double bar= (a->coeff[i]-min)*60.0/range +0.5;
                        // both comparisons are false for NaN, leaving x at 0
                        if(bar>=1.0 && bar<=61.0) x= (int)bar;
                }

                printf("%1.3f ", a->coeff[i]);
                for(;x>0; x--) printf(" ");
                printf("|\n");
        }
}
1775

    
1776
/**
 * Releases a vector and its coefficient array. Passing NULL is allowed and
 * does nothing.
 */
void freeVec(SwsVector *a){
        if(a)
        {
                free(a->coeff); // free(NULL) is a no-op, so no separate check is needed
                a->coeff= NULL;
                a->length= 0;
                free(a);
        }
}
1783

    
1784
/**
 * Releases a scaler context together with the buffers it owns: the luma and
 * chroma line buffers, the horizontal/vertical filter coefficient and position
 * tables, and the packed MMX filter tables. Passing NULL is allowed and does
 * nothing.
 */
void freeSwsContext(SwsContext *c){
        int row;

        if(!c) return;

        // Only the first vLumBufSize / vChrBufSize entries are freed: the
        // allocation code stores every line pointer twice (at i and i+size),
        // so freeing the second half as well would be a double free.
        if(c->lumPixBuf)
        {
                for(row=0; row<c->vLumBufSize; row++)
                {
                        free(c->lumPixBuf[row]);
                        c->lumPixBuf[row]= NULL;
                }
                free(c->lumPixBuf);
                c->lumPixBuf= NULL;
        }

        if(c->chrPixBuf)
        {
                for(row=0; row<c->vChrBufSize; row++)
                {
                        free(c->chrPixBuf[row]);
                        c->chrPixBuf[row]= NULL;
                }
                free(c->chrPixBuf);
                c->chrPixBuf= NULL;
        }

        // filter coefficients (free(NULL) is a no-op)
        free(c->vLumFilter);    c->vLumFilter= NULL;
        free(c->vChrFilter);    c->vChrFilter= NULL;
        free(c->hLumFilter);    c->hLumFilter= NULL;
        free(c->hChrFilter);    c->hChrFilter= NULL;

        // filter positions
        free(c->vLumFilterPos); c->vLumFilterPos= NULL;
        free(c->vChrFilterPos); c->vChrFilterPos= NULL;
        free(c->hLumFilterPos); c->hLumFilterPos= NULL;
        free(c->hChrFilterPos); c->hChrFilterPos= NULL;

        // packed vertical filters used by the MMX code
        free(c->lumMmxFilter);  c->lumMmxFilter= NULL;
        free(c->chrMmxFilter);  c->chrMmxFilter= NULL;

        free(c);
}
1836

    
1837