Statistics
| Branch: | Revision:

ffmpeg / postproc / swscale.c @ 2ba1bff0

History | View | Annotate | Download (55.3 KB)

1
/*
2
    Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
3

4
    This program is free software; you can redistribute it and/or modify
5
    it under the terms of the GNU General Public License as published by
6
    the Free Software Foundation; either version 2 of the License, or
7
    (at your option) any later version.
8

9
    This program is distributed in the hope that it will be useful,
10
    but WITHOUT ANY WARRANTY; without even the implied warranty of
11
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
    GNU General Public License for more details.
13

14
    You should have received a copy of the GNU General Public License
15
    along with this program; if not, write to the Free Software
16
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17
*/
18

    
19
/*
20
  supported Input formats: YV12, I420, IYUV, YUY2, BGR32, BGR24, BGR16, BGR15, RGB32, RGB24, Y8, Y800
21
  supported output formats: YV12, I420, IYUV, BGR15, BGR16, BGR24, BGR32 (grayscale soon too)
22
  BGR15/16 support dithering
23
  
24
  unscaled special converters
25
  YV12/I420/IYUV -> BGR15/BGR16/BGR24/BGR32
26
  YV12/I420/IYUV -> YV12/I420/IYUV
27
  YUY2/BGR15/BGR16/BGR24/BGR32/RGB24/RGB32 -> same format
28
  BGR24 -> BGR32 & RGB24 -> RGB32
29
  BGR32 -> BGR24 & RGB32 -> RGB24
30
  BGR15 -> BGR16
31
*/
32

    
33
/* 
34
tested special converters
35
 YV12/I420 -> BGR16
36
 YV12 -> YV12
37
 BGR15 -> BGR16
38
 BGR16 -> BGR16
39

40
untested special converters
41
  YV12/I420 -> BGR15/BGR24/BGR32 (it's the yuv2rgb stuff, so it should be ok)
42
  YV12/I420 -> YV12/I420
43
  YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
44
  BGR24 -> BGR32 & RGB24 -> RGB32
45
  BGR32 -> BGR24 & RGB32 -> RGB24
46
  BGR24 -> YV12
47
*/
48

    
49
#include <inttypes.h>
50
#include <string.h>
51
#include <math.h>
52
#include <stdio.h>
53
#include "../config.h"
54
#include "../mangle.h"
55
#ifdef HAVE_MALLOC_H
56
#include <malloc.h>
57
#endif
58
#include "swscale.h"
59
#include "../cpudetect.h"
60
#include "../bswap.h"
61
#include "../libvo/img_format.h"
62
#include "rgb2rgb.h"
63
#include "../libvo/fastmemcpy.h"
64
#undef MOVNTQ
65
#undef PAVGB
66

    
67
//#undef HAVE_MMX2
68
//#define HAVE_3DNOW
69
//#undef HAVE_MMX
70
//#undef ARCH_X86
71
//#define WORDS_BIGENDIAN
72
#define DITHER1XBPP
73

    
74
#define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit
75

    
76
#define RET 0xC3 //near return opcode for X86
77

    
78
#ifdef MP_DEBUG
79
#define ASSERT(x) if(!(x)) { printf("ASSERT " #x " failed\n"); *((int*)0)=0; }
80
#else
81
#define ASSERT(x) ;
82
#endif
83

    
84
#ifdef M_PI
85
#define PI M_PI
86
#else
87
#define PI 3.14159265358979323846
88
#endif
89

    
90
//FIXME replace this with something faster
91
/* True for the two planar YUV formats tested here: YV12 and I420. */
#define isPlanarYUV(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
92
/* True for YUY2 or any format accepted by isPlanarYUV(). */
#define isYUV(x)       ((x)==IMGFMT_YUY2 || isPlanarYUV(x))
93
/* True for YV12 and I420; going by the name, formats whose chroma is halved vertically. */
#define isHalfChrV(x)  ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
94
/* True for YUY2, YV12 and I420; going by the name, formats whose chroma is halved horizontally. */
#define isHalfChrH(x)  ((x)==IMGFMT_YUY2 || (x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
95
/* True for YUY2 and for any format whose masked value matches the BGR or RGB family. */
#define isPacked(x)    ((x)==IMGFMT_YUY2 || ((x)&IMGFMT_BGR_MASK)==IMGFMT_BGR || ((x)&IMGFMT_RGB_MASK)==IMGFMT_RGB)
96
/* True only for Y800. */
#define isGray(x)      ((x)==IMGFMT_Y800)
97
/* Input formats accepted: YV12, I420, YUY2, BGR32/24/16/15, RGB32/24 and Y800. */
#define isSupportedIn(x)  ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_YUY2 \
98
                        || (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15\
99
                        || (x)==IMGFMT_RGB32|| (x)==IMGFMT_RGB24\
100
                        || (x)==IMGFMT_Y800)
101
/* Output formats accepted: YV12, I420 and BGR32/24/16/15. Note that Y800 is NOT listed here. */
#define isSupportedOut(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 \
102
                        || (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15)
103
/* True for the four BGR depths handled in this file: 32, 24, 16 and 15. */
#define isBGR(x)       ((x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15)
104

    
105
#define RGB2YUV_SHIFT 16
106
#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
107
#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
108
#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
109
#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
110
#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
111
#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
112
#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
113
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
114
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
115

    
116
extern int verbose; // defined in mplayer.c
117
/*
118
NOTES
119

120
known BUGS with known cause (no bugreports please!, but patches are welcome :) )
121
horizontal fast_bilinear MMX2 scaler reads 1-7 samples too many (might cause a sig11)
122

123
Special versions: fast Y 1:1 scaling (no interpolation in y direction)
124

125
TODO
126
more intelligent misalignment avoidance for the horizontal scaler
127
write special vertical cubic upscale version
128
Optimize C code (yv12 / minmax)
129
add support for packed pixel yuv input & output
130
add support for Y8 output
131
optimize bgr24 & bgr32
132
add BGR4 output support
133
write special BGR->BGR scaler
134
deglobalize yuv2rgb*.c
135
*/
136

    
137
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
138
#define MIN(a,b) ((a) > (b) ? (b) : (a))
139
#define MAX(a,b) ((a) < (b) ? (b) : (a))
140

    
141
#ifdef ARCH_X86
142
#define CAN_COMPILE_X86_ASM
143
#endif
144

    
145
#ifdef CAN_COMPILE_X86_ASM
146
static uint64_t __attribute__((aligned(8))) yCoeff=    0x2568256825682568LL;
147
static uint64_t __attribute__((aligned(8))) vrCoeff=   0x3343334333433343LL;
148
static uint64_t __attribute__((aligned(8))) ubCoeff=   0x40cf40cf40cf40cfLL;
149
static uint64_t __attribute__((aligned(8))) vgCoeff=   0xE5E2E5E2E5E2E5E2LL;
150
static uint64_t __attribute__((aligned(8))) ugCoeff=   0xF36EF36EF36EF36ELL;
151
static uint64_t __attribute__((aligned(8))) bF8=       0xF8F8F8F8F8F8F8F8LL;
152
static uint64_t __attribute__((aligned(8))) bFC=       0xFCFCFCFCFCFCFCFCLL;
153
static uint64_t __attribute__((aligned(8))) w400=      0x0400040004000400LL;
154
static uint64_t __attribute__((aligned(8))) w80=       0x0080008000800080LL;
155
static uint64_t __attribute__((aligned(8))) w10=       0x0010001000100010LL;
156
static uint64_t __attribute__((aligned(8))) w02=       0x0002000200020002LL;
157
static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
158
static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
159
static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
160
static uint64_t __attribute__((aligned(8))) bm01010101=0x00FF00FF00FF00FFLL;
161

    
162
static volatile uint64_t __attribute__((aligned(8))) b5Dither;
163
static volatile uint64_t __attribute__((aligned(8))) g5Dither;
164
static volatile uint64_t __attribute__((aligned(8))) g6Dither;
165
static volatile uint64_t __attribute__((aligned(8))) r5Dither;
166

    
167
static uint64_t __attribute__((aligned(8))) dither4[2]={
168
        0x0103010301030103LL,
169
        0x0200020002000200LL,};
170

    
171
static uint64_t __attribute__((aligned(8))) dither8[2]={
172
        0x0602060206020602LL,
173
        0x0004000400040004LL,};
174

    
175
static uint64_t __attribute__((aligned(8))) b16Mask=   0x001F001F001F001FLL;
176
static uint64_t __attribute__((aligned(8))) g16Mask=   0x07E007E007E007E0LL;
177
static uint64_t __attribute__((aligned(8))) r16Mask=   0xF800F800F800F800LL;
178
static uint64_t __attribute__((aligned(8))) b15Mask=   0x001F001F001F001FLL;
179
static uint64_t __attribute__((aligned(8))) g15Mask=   0x03E003E003E003E0LL;
180
static uint64_t __attribute__((aligned(8))) r15Mask=   0x7C007C007C007C00LL;
181

    
182
static uint64_t __attribute__((aligned(8))) M24A=   0x00FF0000FF0000FFLL;
183
static uint64_t __attribute__((aligned(8))) M24B=   0xFF0000FF0000FF00LL;
184
static uint64_t __attribute__((aligned(8))) M24C=   0x0000FF0000FF0000LL;
185

    
186
#ifdef FAST_BGR2YV12
187
static const uint64_t bgr2YCoeff  __attribute__((aligned(8))) = 0x000000210041000DULL;
188
static const uint64_t bgr2UCoeff  __attribute__((aligned(8))) = 0x0000FFEEFFDC0038ULL;
189
static const uint64_t bgr2VCoeff  __attribute__((aligned(8))) = 0x00000038FFD2FFF8ULL;
190
#else
191
static const uint64_t bgr2YCoeff  __attribute__((aligned(8))) = 0x000020E540830C8BULL;
192
static const uint64_t bgr2UCoeff  __attribute__((aligned(8))) = 0x0000ED0FDAC23831ULL;
193
static const uint64_t bgr2VCoeff  __attribute__((aligned(8))) = 0x00003831D0E6F6EAULL;
194
#endif
195
static const uint64_t bgr2YOffset __attribute__((aligned(8))) = 0x1010101010101010ULL;
196
static const uint64_t bgr2UVOffset __attribute__((aligned(8)))= 0x8080808080808080ULL;
197
static const uint64_t w1111       __attribute__((aligned(8))) = 0x0001000100010001ULL;
198

    
199
// FIXME remove
200
static uint64_t __attribute__((aligned(8))) asm_yalpha1;
201
static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
202
#endif
203

    
204
// clipping helper table for C implementations:
205
static unsigned char clip_table[768];
206

    
207
static unsigned short clip_table16b[768];
208
static unsigned short clip_table16g[768];
209
static unsigned short clip_table16r[768];
210
static unsigned short clip_table15b[768];
211
static unsigned short clip_table15g[768];
212
static unsigned short clip_table15r[768];
213

    
214
// yuv->rgb conversion tables:
215
static    int yuvtab_2568[256];
216
static    int yuvtab_3343[256];
217
static    int yuvtab_0c92[256];
218
static    int yuvtab_1a1e[256];
219
static    int yuvtab_40cf[256];
220
// Needed for cubic scaler to catch overflows
221
static    int clip_yuvtab_2568[768];
222
static    int clip_yuvtab_3343[768];
223
static    int clip_yuvtab_0c92[768];
224
static    int clip_yuvtab_1a1e[768];
225
static    int clip_yuvtab_40cf[768];
226

    
227
//global sws_flags from the command line
228
int sws_flags=2;
229

    
230
//global srcFilter
231
SwsFilter src_filter= {NULL, NULL, NULL, NULL};
232

    
233
float sws_lum_gblur= 0.0;
234
float sws_chr_gblur= 0.0;
235
int sws_chr_vshift= 0;
236
int sws_chr_hshift= 0;
237
float sws_chr_sharpen= 0.0;
238
float sws_lum_sharpen= 0.0;
239

    
240
/* cpuCaps combined from cpudetect and what's actually compiled in
241
   (if there is no support for something compiled in it won't appear here) */
242
static CpuCaps cpuCaps;
243

    
244
void (*swScale)(SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
245
             int srcSliceH, uint8_t* dst[], int dstStride[])=NULL;
246

    
247
static SwsVector *getConvVec(SwsVector *a, SwsVector *b);
248

    
249
#ifdef CAN_COMPILE_X86_ASM
250
void in_asm_used_var_warning_killer()
251
{
252
 volatile int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
253
 bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+asm_yalpha1+ asm_uvalpha1+
254
 M24A+M24B+M24C+w02 + b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0]+bm01010101;
255
 if(i) i=0;
256
}
257
#endif
258

    
259
/**
 * Plain C vertical filter producing one line of 8 bit planar output.
 *
 * Every output sample is the sum over the filter taps of
 * source_line[tap][x] * filter[tap], shifted right by 19 bits and clamped
 * to 0..255.
 *
 * @param lumFilter     luma filter coefficients, lumFilterSize entries
 * @param lumSrc        lumFilterSize pointers to luma source lines
 * @param chrFilter     chroma filter coefficients, chrFilterSize entries
 * @param chrSrc        chrFilterSize pointers to chroma source lines; the U
 *                      sample for column x is at [x], the V sample at [x + 2048]
 * @param dest          luma output, dstW bytes are written
 * @param uDest         U output, dstW>>1 bytes are written; if NULL neither
 *                      chroma plane is touched
 * @param vDest         V output, dstW>>1 bytes are written (only used when
 *                      uDest is not NULL)
 * @param dstW          width of the luma output line in pixels
 */
static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
{
        const int chrW= dstW>>1;
        int x, tap;

        // luma plane
        for(x=0; x<dstW; x++)
        {
                int acc=0;

                for(tap=0; tap<lumFilterSize; tap++)
                        acc += lumSrc[tap][x] * lumFilter[tap];

                acc >>= 19;
                if(acc<0)   acc=0;
                if(acc>255) acc=255;
                dest[x]= acc;
        }

        // chroma is optional
        if(uDest == NULL) return;

        for(x=0; x<chrW; x++)
        {
                int accU=0;
                int accV=0;

                for(tap=0; tap<chrFilterSize; tap++)
                {
                        const int16_t *line= chrSrc[tap];
                        const int coeff= chrFilter[tap];

                        accU += line[x] * coeff;
                        accV += line[x + 2048] * coeff; // V lives 2048 samples after U
                }

                accU >>= 19;
                accV >>= 19;
                if(accU<0)   accU=0;
                if(accU>255) accU=255;
                if(accV<0)   accV=0;
                if(accV>255) accV=255;

                uDest[x]= accU;
                vDest[x]= accV;
        }
}
291

    
292
static inline void yuv2rgbXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
293
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
294
                                    uint8_t *dest, int dstW, int dstFormat)
295
{
296
        if(dstFormat==IMGFMT_BGR32)
297
        {
298
                int i;
299
#ifdef WORDS_BIGENDIAN
300
        dest++;
301
#endif
302
                for(i=0; i<(dstW>>1); i++){
303
                        int j;
304
                        int Y1=0;
305
                        int Y2=0;
306
                        int U=0;
307
                        int V=0;
308
                        int Cb, Cr, Cg;
309
                        for(j=0; j<lumFilterSize; j++)
310
                        {
311
                                Y1 += lumSrc[j][2*i] * lumFilter[j];
312
                                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
313
                        }
314
                        for(j=0; j<chrFilterSize; j++)
315
                        {
316
                                U += chrSrc[j][i] * chrFilter[j];
317
                                V += chrSrc[j][i+2048] * chrFilter[j];
318
                        }
319
                        Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
320
                        Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
321
                        U >>= 19;
322
                        V >>= 19;
323

    
324
                        Cb= clip_yuvtab_40cf[U+ 256];
325
                        Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
326
                        Cr= clip_yuvtab_3343[V+ 256];
327

    
328
                        dest[8*i+0]=clip_table[((Y1 + Cb) >>13)];
329
                        dest[8*i+1]=clip_table[((Y1 + Cg) >>13)];
330
                        dest[8*i+2]=clip_table[((Y1 + Cr) >>13)];
331

    
332
                        dest[8*i+4]=clip_table[((Y2 + Cb) >>13)];
333
                        dest[8*i+5]=clip_table[((Y2 + Cg) >>13)];
334
                        dest[8*i+6]=clip_table[((Y2 + Cr) >>13)];
335
                }
336
        }
337
        else if(dstFormat==IMGFMT_BGR24)
338
        {
339
                int i;
340
                for(i=0; i<(dstW>>1); i++){
341
                        int j;
342
                        int Y1=0;
343
                        int Y2=0;
344
                        int U=0;
345
                        int V=0;
346
                        int Cb, Cr, Cg;
347
                        for(j=0; j<lumFilterSize; j++)
348
                        {
349
                                Y1 += lumSrc[j][2*i] * lumFilter[j];
350
                                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
351
                        }
352
                        for(j=0; j<chrFilterSize; j++)
353
                        {
354
                                U += chrSrc[j][i] * chrFilter[j];
355
                                V += chrSrc[j][i+2048] * chrFilter[j];
356
                        }
357
                        Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
358
                        Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
359
                        U >>= 19;
360
                        V >>= 19;
361

    
362
                        Cb= clip_yuvtab_40cf[U+ 256];
363
                        Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
364
                        Cr= clip_yuvtab_3343[V+ 256];
365

    
366
                        dest[0]=clip_table[((Y1 + Cb) >>13)];
367
                        dest[1]=clip_table[((Y1 + Cg) >>13)];
368
                        dest[2]=clip_table[((Y1 + Cr) >>13)];
369

    
370
                        dest[3]=clip_table[((Y2 + Cb) >>13)];
371
                        dest[4]=clip_table[((Y2 + Cg) >>13)];
372
                        dest[5]=clip_table[((Y2 + Cr) >>13)];
373
                        dest+=6;
374
                }
375
        }
376
        else if(dstFormat==IMGFMT_BGR16)
377
        {
378
                int i;
379
#ifdef DITHER1XBPP
380
                static int ditherb1=1<<14;
381
                static int ditherg1=1<<13;
382
                static int ditherr1=2<<14;
383
                static int ditherb2=3<<14;
384
                static int ditherg2=3<<13;
385
                static int ditherr2=0<<14;
386

    
387
                ditherb1 ^= (1^2)<<14;
388
                ditherg1 ^= (1^2)<<13;
389
                ditherr1 ^= (1^2)<<14;
390
                ditherb2 ^= (3^0)<<14;
391
                ditherg2 ^= (3^0)<<13;
392
                ditherr2 ^= (3^0)<<14;
393
#else
394
                const int ditherb1=0;
395
                const int ditherg1=0;
396
                const int ditherr1=0;
397
                const int ditherb2=0;
398
                const int ditherg2=0;
399
                const int ditherr2=0;
400
#endif
401
                for(i=0; i<(dstW>>1); i++){
402
                        int j;
403
                        int Y1=0;
404
                        int Y2=0;
405
                        int U=0;
406
                        int V=0;
407
                        int Cb, Cr, Cg;
408
                        for(j=0; j<lumFilterSize; j++)
409
                        {
410
                                Y1 += lumSrc[j][2*i] * lumFilter[j];
411
                                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
412
                        }
413
                        for(j=0; j<chrFilterSize; j++)
414
                        {
415
                                U += chrSrc[j][i] * chrFilter[j];
416
                                V += chrSrc[j][i+2048] * chrFilter[j];
417
                        }
418
                        Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
419
                        Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
420
                        U >>= 19;
421
                        V >>= 19;
422

    
423
                        Cb= clip_yuvtab_40cf[U+ 256];
424
                        Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
425
                        Cr= clip_yuvtab_3343[V+ 256];
426

    
427
                        ((uint16_t*)dest)[2*i] =
428
                                clip_table16b[(Y1 + Cb + ditherb1) >>13] |
429
                                clip_table16g[(Y1 + Cg + ditherg1) >>13] |
430
                                clip_table16r[(Y1 + Cr + ditherr1) >>13];
431

    
432
                        ((uint16_t*)dest)[2*i+1] =
433
                                clip_table16b[(Y2 + Cb + ditherb2) >>13] |
434
                                clip_table16g[(Y2 + Cg + ditherg2) >>13] |
435
                                clip_table16r[(Y2 + Cr + ditherr2) >>13];
436
                }
437
        }
438
        else if(dstFormat==IMGFMT_BGR15)
439
        {
440
                int i;
441
#ifdef DITHER1XBPP
442
                static int ditherb1=1<<14;
443
                static int ditherg1=1<<14;
444
                static int ditherr1=2<<14;
445
                static int ditherb2=3<<14;
446
                static int ditherg2=3<<14;
447
                static int ditherr2=0<<14;
448

    
449
                ditherb1 ^= (1^2)<<14;
450
                ditherg1 ^= (1^2)<<14;
451
                ditherr1 ^= (1^2)<<14;
452
                ditherb2 ^= (3^0)<<14;
453
                ditherg2 ^= (3^0)<<14;
454
                ditherr2 ^= (3^0)<<14;
455
#else
456
                const int ditherb1=0;
457
                const int ditherg1=0;
458
                const int ditherr1=0;
459
                const int ditherb2=0;
460
                const int ditherg2=0;
461
                const int ditherr2=0;
462
#endif
463
                for(i=0; i<(dstW>>1); i++){
464
                        int j;
465
                        int Y1=0;
466
                        int Y2=0;
467
                        int U=0;
468
                        int V=0;
469
                        int Cb, Cr, Cg;
470
                        for(j=0; j<lumFilterSize; j++)
471
                        {
472
                                Y1 += lumSrc[j][2*i] * lumFilter[j];
473
                                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
474
                        }
475
                        for(j=0; j<chrFilterSize; j++)
476
                        {
477
                                U += chrSrc[j][i] * chrFilter[j];
478
                                V += chrSrc[j][i+2048] * chrFilter[j];
479
                        }
480
                        Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
481
                        Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
482
                        U >>= 19;
483
                        V >>= 19;
484

    
485
                        Cb= clip_yuvtab_40cf[U+ 256];
486
                        Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
487
                        Cr= clip_yuvtab_3343[V+ 256];
488

    
489
                        ((uint16_t*)dest)[2*i] =
490
                                clip_table15b[(Y1 + Cb + ditherb1) >>13] |
491
                                clip_table15g[(Y1 + Cg + ditherg1) >>13] |
492
                                clip_table15r[(Y1 + Cr + ditherr1) >>13];
493

    
494
                        ((uint16_t*)dest)[2*i+1] =
495
                                clip_table15b[(Y2 + Cb + ditherb2) >>13] |
496
                                clip_table15g[(Y2 + Cg + ditherg2) >>13] |
497
                                clip_table15r[(Y2 + Cr + ditherr2) >>13];
498
                }
499
        }
500
}
501

    
502

    
503
//Note: we have C, X86, MMX, MMX2 and 3DNOW versions; there's no 3DNOW+MMX2 one
504
//Plain C versions
505
/* Decide which variants of swscale_template.c get compiled.
   The plain C variant is built when MMX is not enabled at build time, or when
   the CPU is detected at runtime (RUNTIME_CPUDETECT). */
#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
506
#define COMPILE_C
507
#endif
508

    
509
/* The SIMD variants are only considered when x86 assembly can be compiled. */
#ifdef CAN_COMPILE_X86_ASM
510

    
511
/* Plain MMX: only when neither 3DNow nor MMX2 is enabled, or with runtime detection. */
#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
512
#define COMPILE_MMX
513
#endif
514

    
515
/* MMX2: whenever it is enabled, or with runtime detection. */
#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
516
#define COMPILE_MMX2
517
#endif
518

    
519
/* 3DNow: only when MMX2 is not enabled (there is no combined variant), or with runtime detection. */
#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
520
#define COMPILE_3DNOW
521
#endif
522
#endif //CAN_COMPILE_X86_ASM
523

    
524
/* From here on the HAVE_* macros no longer describe the build configuration:
   they are cleared and then redefined per variant before each inclusion of
   the template. */
#undef HAVE_MMX
525
#undef HAVE_MMX2
526
#undef HAVE_3DNOW
527

    
528
/* Plain C variant: all SIMD macros off, RENAME() appends the _C suffix.
   Presumably swscale_template.c wraps its symbol names in RENAME() so every
   inclusion yields a distinct set of functions - the template is not part of
   this file. */
#ifdef COMPILE_C
529
#undef HAVE_MMX
530
#undef HAVE_MMX2
531
#undef HAVE_3DNOW
532
#define RENAME(a) a ## _C
533
#include "swscale_template.c"
534
#endif
535

    
536
/* SIMD variants of the template. Each section redefines RENAME() with its own
   suffix and sets the HAVE_* macros for that variant before including
   swscale_template.c again. */
#ifdef CAN_COMPILE_X86_ASM
537

    
538
//X86 versions (disabled: the whole section below is commented out)
539
/*
540
#undef RENAME
541
#undef HAVE_MMX
542
#undef HAVE_MMX2
543
#undef HAVE_3DNOW
544
#define ARCH_X86
545
#define RENAME(a) a ## _X86
546
#include "swscale_template.c"
547
*/
548
//MMX versions: HAVE_MMX only, suffix _MMX
549
#ifdef COMPILE_MMX
550
#undef RENAME
551
#define HAVE_MMX
552
#undef HAVE_MMX2
553
#undef HAVE_3DNOW
554
#define RENAME(a) a ## _MMX
555
#include "swscale_template.c"
556
#endif
557

    
558
//MMX2 versions: HAVE_MMX and HAVE_MMX2, suffix _MMX2
559
#ifdef COMPILE_MMX2
560
#undef RENAME
561
#define HAVE_MMX
562
#define HAVE_MMX2
563
#undef HAVE_3DNOW
564
#define RENAME(a) a ## _MMX2
565
#include "swscale_template.c"
566
#endif
567

    
568
//3DNOW versions: HAVE_MMX and HAVE_3DNOW, suffix _3DNow (note the mixed case)
569
#ifdef COMPILE_3DNOW
570
#undef RENAME
571
#define HAVE_MMX
572
#undef HAVE_MMX2
573
#define HAVE_3DNOW
574
#define RENAME(a) a ## _3DNow
575
#include "swscale_template.c"
576
#endif
577

    
578
#endif //CAN_COMPILE_X86_ASM
579

    
580
// minor note: the HAVE_xyz macros are messed up after this line, so don't use them
581

    
582

    
583
// old global scaler, don't use for new code
584
// will use sws_flags from the command line
585
void SwScale_YV12slice(unsigned char* src[], int srcStride[], int srcSliceY ,
586
                             int srcSliceH, uint8_t* dst[], int dstStride, int dstbpp,
587
                             int srcW, int srcH, int dstW, int dstH){
588

    
589
        static SwsContext *context=NULL;
590
        int dstFormat;
591
        int dstStride3[3]= {dstStride, dstStride>>1, dstStride>>1};
592

    
593
        switch(dstbpp)
594
        {
595
                case 8 : dstFormat= IMGFMT_Y8;                break;
596
                case 12: dstFormat= IMGFMT_YV12;        break;
597
                case 15: dstFormat= IMGFMT_BGR15;        break;
598
                case 16: dstFormat= IMGFMT_BGR16;        break;
599
                case 24: dstFormat= IMGFMT_BGR24;        break;
600
                case 32: dstFormat= IMGFMT_BGR32;        break;
601
                default: return;
602
        }
603

    
604
        if(!context) context=getSwsContextFromCmdLine(srcW, srcH, IMGFMT_YV12, dstW, dstH, dstFormat);
605

    
606
        context->swScale(context, src, srcStride, srcSliceY, srcSliceH, dst, dstStride3);
607
}
608

    
609
// will use sws_flags & src_filter (from cmd line)
610
SwsContext *getSwsContextFromCmdLine(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat)
611
{
612
        int flags=0;
613
        static int firstTime=1;
614

    
615
#ifdef ARCH_X86
616
        if(gCpuCaps.hasMMX)
617
                asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
618
#endif
619
        if(firstTime)
620
        {
621
                firstTime=0;
622
                flags= SWS_PRINT_INFO;
623
        }
624
        else if(verbose>1) flags= SWS_PRINT_INFO;
625

    
626
        if(src_filter.lumH) freeVec(src_filter.lumH);
627
        if(src_filter.lumV) freeVec(src_filter.lumV);
628
        if(src_filter.chrH) freeVec(src_filter.chrH);
629
        if(src_filter.chrV) freeVec(src_filter.chrV);
630

    
631
        if(sws_lum_gblur!=0.0){
632
                src_filter.lumH= getGaussianVec(sws_lum_gblur, 3.0);
633
                src_filter.lumV= getGaussianVec(sws_lum_gblur, 3.0);
634
        }else{
635
                src_filter.lumH= getIdentityVec();
636
                src_filter.lumV= getIdentityVec();
637
        }
638

    
639
        if(sws_chr_gblur!=0.0){
640
                src_filter.chrH= getGaussianVec(sws_chr_gblur, 3.0);
641
                src_filter.chrV= getGaussianVec(sws_chr_gblur, 3.0);
642
        }else{
643
                src_filter.chrH= getIdentityVec();
644
                src_filter.chrV= getIdentityVec();
645
        }
646

    
647
        if(sws_chr_sharpen!=0.0){
648
                SwsVector *g= getConstVec(-1.0, 3);
649
                SwsVector *id= getConstVec(10.0/sws_chr_sharpen, 1);
650
                g->coeff[1]=2.0;
651
                addVec(id, g);
652
                convVec(src_filter.chrH, id);
653
                convVec(src_filter.chrV, id);
654
                freeVec(g);
655
                freeVec(id);
656
        }
657

    
658
        if(sws_lum_sharpen!=0.0){
659
                SwsVector *g= getConstVec(-1.0, 3);
660
                SwsVector *id= getConstVec(10.0/sws_lum_sharpen, 1);
661
                g->coeff[1]=2.0;
662
                addVec(id, g);
663
                convVec(src_filter.lumH, id);
664
                convVec(src_filter.lumV, id);
665
                freeVec(g);
666
                freeVec(id);
667
        }
668

    
669
        if(sws_chr_hshift)
670
                shiftVec(src_filter.chrH, sws_chr_hshift);
671

    
672
        if(sws_chr_vshift)
673
                shiftVec(src_filter.chrV, sws_chr_vshift);
674

    
675
        normalizeVec(src_filter.chrH, 1.0);
676
        normalizeVec(src_filter.chrV, 1.0);
677
        normalizeVec(src_filter.lumH, 1.0);
678
        normalizeVec(src_filter.lumV, 1.0);
679

    
680
        if(verbose > 1) printVec(src_filter.chrH);
681
        if(verbose > 1) printVec(src_filter.lumH);
682

    
683
        switch(sws_flags)
684
        {
685
                case 0: flags|= SWS_FAST_BILINEAR; break;
686
                case 1: flags|= SWS_BILINEAR; break;
687
                case 2: flags|= SWS_BICUBIC; break;
688
                case 3: flags|= SWS_X; break;
689
                case 4: flags|= SWS_POINT; break;
690
                case 5: flags|= SWS_AREA; break;
691
                default:flags|= SWS_BILINEAR; break;
692
        }
693

    
694
        return getSwsContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat, flags, &src_filter, NULL);
695
}
696

    
697

    
698
static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
699
                              int srcW, int dstW, int filterAlign, int one, int flags,
700
                              SwsVector *srcFilter, SwsVector *dstFilter)
701
{
702
        int i;
703
        int filterSize;
704
        int filter2Size;
705
        int minFilterSize;
706
        double *filter=NULL;
707
        double *filter2=NULL;
708
#ifdef ARCH_X86
709
        if(gCpuCaps.hasMMX)
710
                asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
711
#endif
712

    
713
        // Note the +1 is for the MMXscaler which reads over the end
714
        *filterPos = (int16_t*)memalign(8, (dstW+1)*sizeof(int16_t));
715

    
716
        if(ABS(xInc - 0x10000) <10) // unscaled
717
        {
718
                int i;
719
                filterSize= 1;
720
                filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
721
                for(i=0; i<dstW*filterSize; i++) filter[i]=0;
722

    
723
                for(i=0; i<dstW; i++)
724
                {
725
                        filter[i*filterSize]=1;
726
                        (*filterPos)[i]=i;
727
                }
728

    
729
        }
730
        else if(flags&SWS_POINT) // lame looking point sampling mode
731
        {
732
                int i;
733
                int xDstInSrc;
734
                filterSize= 1;
735
                filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
736
                
737
                xDstInSrc= xInc/2 - 0x8000;
738
                for(i=0; i<dstW; i++)
739
                {
740
                        int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
741

    
742
                        (*filterPos)[i]= xx;
743
                        filter[i]= 1.0;
744
                        xDstInSrc+= xInc;
745
                }
746
        }
747
        else if(xInc <= (1<<16) || (flags&SWS_FAST_BILINEAR)) // upscale
748
        {
749
                int i;
750
                int xDstInSrc;
751
                if     (flags&SWS_BICUBIC) filterSize= 4;
752
                else if(flags&SWS_X      ) filterSize= 4;
753
                else                           filterSize= 2; // SWS_BILINEAR / SWS_AREA 
754
//                printf("%d %d %d\n", filterSize, srcW, dstW);
755
                filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
756

    
757
                xDstInSrc= xInc/2 - 0x8000;
758
                for(i=0; i<dstW; i++)
759
                {
760
                        int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
761
                        int j;
762

    
763
                        (*filterPos)[i]= xx;
764
                        if((flags & SWS_BICUBIC) || (flags & SWS_X))
765
                        {
766
                                double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
767
                                double y1,y2,y3,y4;
768
                                double A= -0.6;
769
                                if(flags & SWS_BICUBIC){
770
                                                // Equation is from VirtualDub
771
                                        y1 = (        +     A*d -       2.0*A*d*d +       A*d*d*d);
772
                                        y2 = (+ 1.0             -     (A+3.0)*d*d + (A+2.0)*d*d*d);
773
                                        y3 = (        -     A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d);
774
                                        y4 = (                  +           A*d*d -       A*d*d*d);
775
                                }else{
776
                                                // cubic interpolation (derived it myself)
777
                                        y1 = (    -2.0*d + 3.0*d*d - 1.0*d*d*d)/6.0;
778
                                        y2 = (6.0 -3.0*d - 6.0*d*d + 3.0*d*d*d)/6.0;
779
                                        y3 = (    +6.0*d + 3.0*d*d - 3.0*d*d*d)/6.0;
780
                                        y4 = (    -1.0*d           + 1.0*d*d*d)/6.0;
781
                                }
782

    
783
//                                printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
784
                                filter[i*filterSize + 0]= y1;
785
                                filter[i*filterSize + 1]= y2;
786
                                filter[i*filterSize + 2]= y3;
787
                                filter[i*filterSize + 3]= y4;
788
//                                printf("%1.3f %1.3f %1.3f %1.3f %1.3f\n",d , y1, y2, y3, y4);
789
                        }
790
                        else
791
                        {
792
                                //Bilinear upscale / linear interpolate / Area averaging
793
                                for(j=0; j<filterSize; j++)
794
                                {
795
                                        double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
796
                                        double coeff= 1.0 - d;
797
                                        if(coeff<0) coeff=0;
798
        //                                printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
799
                                        filter[i*filterSize + j]= coeff;
800
                                        xx++;
801
                                }
802
                        }
803
                        xDstInSrc+= xInc;
804
                }
805
        }
806
        else // downscale
807
        {
808
                int xDstInSrc;
809
                if(flags&SWS_BICUBIC)        filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
810
                else if(flags&SWS_X)        filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
811
                else if(flags&SWS_AREA)        filterSize= (int)ceil(1 + 1.0*srcW / (double)dstW);
812
                else /* BILINEAR */        filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW);
813
//                printf("%d %d %d\n", *filterSize, srcW, dstW);
814
                filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
815

    
816
                xDstInSrc= xInc/2 - 0x8000;
817
                for(i=0; i<dstW; i++)
818
                {
819
                        int xx= (int)((double)xDstInSrc/(double)(1<<16) - (filterSize-1)*0.5 + 0.5);
820
                        int j;
821
                        (*filterPos)[i]= xx;
822
                        for(j=0; j<filterSize; j++)
823
                        {
824
                                double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
825
                                double coeff;
826
                                if((flags & SWS_BICUBIC) || (flags & SWS_X))
827
                                {
828
                                        double A= -0.75;
829
//                                        d*=2;
830
                                        // Equation is from VirtualDub
831
                                        if(d<1.0)
832
                                                coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
833
                                        else if(d<2.0)
834
                                                coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
835
                                        else
836
                                                coeff=0.0;
837
                                }
838
                                else if(flags & SWS_AREA)
839
                                {
840
                                        double srcPixelSize= (1<<16)/(double)xInc;
841
                                        if(d + srcPixelSize/2 < 0.5) coeff= 1.0;
842
                                        else if(d - srcPixelSize/2 < 0.5) coeff= (0.5-d)/srcPixelSize + 0.5;
843
                                        else coeff=0.0;
844
                                }
845
                                else
846
                                {
847
                                        coeff= 1.0 - d;
848
                                        if(coeff<0) coeff=0;
849
                                }
850
//                                printf("%1.3f %2.3f %d \n", coeff, d, xDstInSrc);
851
                                filter[i*filterSize + j]= coeff;
852
                                xx++;
853
                        }
854
                        xDstInSrc+= xInc;
855
                }
856
        }
857

    
858
        /* apply src & dst Filter to filter -> filter2
859
           free(filter);
860
        */
861
        filter2Size= filterSize;
862
        if(srcFilter) filter2Size+= srcFilter->length - 1;
863
        if(dstFilter) filter2Size+= dstFilter->length - 1;
864
        filter2= (double*)memalign(8, filter2Size*dstW*sizeof(double));
865

    
866
        for(i=0; i<dstW; i++)
867
        {
868
                int j;
869
                SwsVector scaleFilter;
870
                SwsVector *outVec;
871

    
872
                scaleFilter.coeff= filter + i*filterSize;
873
                scaleFilter.length= filterSize;
874

    
875
                if(srcFilter) outVec= getConvVec(srcFilter, &scaleFilter);
876
                else              outVec= &scaleFilter;
877

    
878
                ASSERT(outVec->length == filter2Size)
879
                //FIXME dstFilter
880

    
881
                for(j=0; j<outVec->length; j++)
882
                {
883
                        filter2[i*filter2Size + j]= outVec->coeff[j];
884
                }
885

    
886
                (*filterPos)[i]+= (filterSize-1)/2 - (filter2Size-1)/2;
887

    
888
                if(outVec != &scaleFilter) freeVec(outVec);
889
        }
890
        free(filter); filter=NULL;
891

    
892
        /* try to reduce the filter-size (step1 find size and shift left) */
893
        // Assume its near normalized (*0.5 or *2.0 is ok but * 0.001 is not)
894
        minFilterSize= 0;
895
        for(i=dstW-1; i>=0; i--)
896
        {
897
                int min= filter2Size;
898
                int j;
899
                double cutOff=0.0;
900

    
901
                /* get rid off near zero elements on the left by shifting left */
902
                for(j=0; j<filter2Size; j++)
903
                {
904
                        int k;
905
                        cutOff += ABS(filter2[i*filter2Size]);
906

    
907
                        if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
908

    
909
                        /* preserve Monotonicity because the core cant handle the filter otherwise */
910
                        if(i<dstW-1 && (*filterPos)[i] >= (*filterPos)[i+1]) break;
911

    
912
                        // Move filter coeffs left
913
                        for(k=1; k<filter2Size; k++)
914
                                filter2[i*filter2Size + k - 1]= filter2[i*filter2Size + k];
915
                        filter2[i*filter2Size + k - 1]= 0.0;
916
                        (*filterPos)[i]++;
917
                }
918

    
919
                cutOff=0.0;
920
                /* count near zeros on the right */
921
                for(j=filter2Size-1; j>0; j--)
922
                {
923
                        cutOff += ABS(filter2[i*filter2Size + j]);
924

    
925
                        if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
926
                        min--;
927
                }
928

    
929
                if(min>minFilterSize) minFilterSize= min;
930
        }
931

    
932
        filterSize= (minFilterSize +(filterAlign-1)) & (~(filterAlign-1));
933
        filter= (double*)memalign(8, filterSize*dstW*sizeof(double));
934
        *outFilterSize= filterSize;
935

    
936
        if((flags&SWS_PRINT_INFO) && verbose)
937
                printf("SwScaler: reducing / aligning filtersize %d -> %d\n", filter2Size, filterSize);
938
        /* try to reduce the filter-size (step2 reduce it) */
939
        for(i=0; i<dstW; i++)
940
        {
941
                int j;
942

    
943
                for(j=0; j<filterSize; j++)
944
                {
945
                        if(j>=filter2Size) filter[i*filterSize + j]= 0.0;
946
                        else                   filter[i*filterSize + j]= filter2[i*filter2Size + j];
947
                }
948
        }
949
        free(filter2); filter2=NULL;
950
        
951
        ASSERT(filterSize > 0)
952

    
953
        //FIXME try to align filterpos if possible
954

    
955
        //fix borders
956
        for(i=0; i<dstW; i++)
957
        {
958
                int j;
959
                if((*filterPos)[i] < 0)
960
                {
961
                        // Move filter coeffs left to compensate for filterPos
962
                        for(j=1; j<filterSize; j++)
963
                        {
964
                                int left= MAX(j + (*filterPos)[i], 0);
965
                                filter[i*filterSize + left] += filter[i*filterSize + j];
966
                                filter[i*filterSize + j]=0;
967
                        }
968
                        (*filterPos)[i]= 0;
969
                }
970

    
971
                if((*filterPos)[i] + filterSize > srcW)
972
                {
973
                        int shift= (*filterPos)[i] + filterSize - srcW;
974
                        // Move filter coeffs right to compensate for filterPos
975
                        for(j=filterSize-2; j>=0; j--)
976
                        {
977
                                int right= MIN(j + shift, filterSize-1);
978
                                filter[i*filterSize +right] += filter[i*filterSize +j];
979
                                filter[i*filterSize +j]=0;
980
                        }
981
                        (*filterPos)[i]= srcW - filterSize;
982
                }
983
        }
984

    
985
        // Note the +1 is for the MMXscaler which reads over the end
986
        *outFilter= (int16_t*)memalign(8, *outFilterSize*(dstW+1)*sizeof(int16_t));
987
        memset(*outFilter, 0, *outFilterSize*(dstW+1)*sizeof(int16_t));
988

    
989
        /* Normalize & Store in outFilter */
990
        for(i=0; i<dstW; i++)
991
        {
992
                int j;
993
                double sum=0;
994
                double scale= one;
995
                for(j=0; j<filterSize; j++)
996
                {
997
                        sum+= filter[i*filterSize + j];
998
                }
999
                scale/= sum;
1000
                for(j=0; j<filterSize; j++)
1001
                {
1002
                        (*outFilter)[i*(*outFilterSize) + j]= (int)(filter[i*filterSize + j]*scale);
1003
                }
1004
        }
1005
        
1006
        (*filterPos)[dstW]= (*filterPos)[dstW-1]; // the MMX scaler will read over the end
1007
        for(i=0; i<*outFilterSize; i++)
1008
        {
1009
                int j= dstW*(*outFilterSize);
1010
                (*outFilter)[j + i]= (*outFilter)[j + i - (*outFilterSize)];
1011
        }
1012

    
1013
        free(filter);
1014
}
1015

    
1016
#ifdef ARCH_X86
1017
static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
1018
{
1019
        uint8_t *fragment;
1020
        int imm8OfPShufW1;
1021
        int imm8OfPShufW2;
1022
        int fragmentLength;
1023

    
1024
        int xpos, i;
1025

    
1026
        // create an optimized horizontal scaling routine
1027

    
1028
        //code fragment
1029

    
1030
        asm volatile(
1031
                "jmp 9f                                \n\t"
1032
        // Begin
1033
                "0:                                \n\t"
1034
                "movq (%%esi), %%mm0                \n\t" //FIXME Alignment
1035
                "movq %%mm0, %%mm1                \n\t"
1036
                "psrlq $8, %%mm0                \n\t"
1037
                "punpcklbw %%mm7, %%mm1        \n\t"
1038
                "movq %%mm2, %%mm3                \n\t"
1039
                "punpcklbw %%mm7, %%mm0        \n\t"
1040
                "addw %%bx, %%cx                \n\t" //2*xalpha += (4*lumXInc)&0xFFFF
1041
                "pshufw $0xFF, %%mm1, %%mm1        \n\t"
1042
                "1:                                \n\t"
1043
                "adcl %%edx, %%esi                \n\t" //xx+= (4*lumXInc)>>16 + carry
1044
                "pshufw $0xFF, %%mm0, %%mm0        \n\t"
1045
                "2:                                \n\t"
1046
                "psrlw $9, %%mm3                \n\t"
1047
                "psubw %%mm1, %%mm0                \n\t"
1048
                "pmullw %%mm3, %%mm0                \n\t"
1049
                "paddw %%mm6, %%mm2                \n\t" // 2*alpha += xpos&0xFFFF
1050
                "psllw $7, %%mm1                \n\t"
1051
                "paddw %%mm1, %%mm0                \n\t"
1052

    
1053
                "movq %%mm0, (%%edi, %%eax)        \n\t"
1054

    
1055
                "addl $8, %%eax                        \n\t"
1056
        // End
1057
                "9:                                \n\t"
1058
//                "int $3\n\t"
1059
                "leal 0b, %0                        \n\t"
1060
                "leal 1b, %1                        \n\t"
1061
                "leal 2b, %2                        \n\t"
1062
                "decl %1                        \n\t"
1063
                "decl %2                        \n\t"
1064
                "subl %0, %1                        \n\t"
1065
                "subl %0, %2                        \n\t"
1066
                "leal 9b, %3                        \n\t"
1067
                "subl %0, %3                        \n\t"
1068
                :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
1069
                "=r" (fragmentLength)
1070
        );
1071

    
1072
        xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
1073

    
1074
        for(i=0; i<dstW/8; i++)
1075
        {
1076
                int xx=xpos>>16;
1077

    
1078
                if((i&3) == 0)
1079
                {
1080
                        int a=0;
1081
                        int b=((xpos+xInc)>>16) - xx;
1082
                        int c=((xpos+xInc*2)>>16) - xx;
1083
                        int d=((xpos+xInc*3)>>16) - xx;
1084

    
1085
                        memcpy(funnyCode + fragmentLength*i/4, fragment, fragmentLength);
1086

    
1087
                        funnyCode[fragmentLength*i/4 + imm8OfPShufW1]=
1088
                        funnyCode[fragmentLength*i/4 + imm8OfPShufW2]=
1089
                                a | (b<<2) | (c<<4) | (d<<6);
1090

    
1091
                        // if we dont need to read 8 bytes than dont :), reduces the chance of
1092
                        // crossing a cache line
1093
                        if(d<3) funnyCode[fragmentLength*i/4 + 1]= 0x6E;
1094

    
1095
                        funnyCode[fragmentLength*(i+4)/4]= RET;
1096
                }
1097
                xpos+=xInc;
1098
        }
1099
}
1100
#endif // ARCH_X86
1101

    
1102
//FIXME remove
1103
/**
 * Legacy initialisation entry point; intentionally does nothing.
 * It is only kept so that existing callers still link (see the FIXME above).
 * Declared with (void) so that it is a real prototype and calls with
 * arguments are diagnosed by the compiler.
 */
void SwScale_Init(void){
}
1105

    
1106
static void globalInit(){
1107
    // generating tables:
1108
    int i;
1109
    for(i=0; i<768; i++){
1110
        int c= MIN(MAX(i-256, 0), 255);
1111
        clip_table[i]=c;
1112
        yuvtab_2568[c]= clip_yuvtab_2568[i]=(0x2568*(c-16))+(256<<13);
1113
        yuvtab_3343[c]= clip_yuvtab_3343[i]=0x3343*(c-128);
1114
        yuvtab_0c92[c]= clip_yuvtab_0c92[i]=-0x0c92*(c-128);
1115
        yuvtab_1a1e[c]= clip_yuvtab_1a1e[i]=-0x1a1e*(c-128);
1116
        yuvtab_40cf[c]= clip_yuvtab_40cf[i]=0x40cf*(c-128);
1117
    }
1118

    
1119
    for(i=0; i<768; i++)
1120
    {
1121
        int v= clip_table[i];
1122
        clip_table16b[i]= le2me_16( v>>3);
1123
        clip_table16g[i]= le2me_16((v<<3)&0x07E0);
1124
        clip_table16r[i]= le2me_16((v<<8)&0xF800);
1125
        clip_table15b[i]= le2me_16( v>>3);
1126
        clip_table15g[i]= le2me_16((v<<2)&0x03E0);
1127
        clip_table15r[i]= le2me_16((v<<7)&0x7C00);
1128
    }
1129

    
1130
cpuCaps= gCpuCaps;
1131

    
1132
#ifdef RUNTIME_CPUDETECT
1133
#ifdef CAN_COMPILE_X86_ASM
1134
        // ordered per speed fasterst first
1135
        if(gCpuCaps.hasMMX2)
1136
                swScale= swScale_MMX2;
1137
        else if(gCpuCaps.has3DNow)
1138
                swScale= swScale_3DNow;
1139
        else if(gCpuCaps.hasMMX)
1140
                swScale= swScale_MMX;
1141
        else
1142
                swScale= swScale_C;
1143

    
1144
#else
1145
        swScale= swScale_C;
1146
        cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1147
#endif
1148
#else //RUNTIME_CPUDETECT
1149
#ifdef HAVE_MMX2
1150
        swScale= swScale_MMX2;
1151
        cpuCaps.has3DNow = 0;
1152
#elif defined (HAVE_3DNOW)
1153
        swScale= swScale_3DNow;
1154
        cpuCaps.hasMMX2 = 0;
1155
#elif defined (HAVE_MMX)
1156
        swScale= swScale_MMX;
1157
        cpuCaps.hasMMX2 = cpuCaps.has3DNow = 0;
1158
#else
1159
        swScale= swScale_C;
1160
        cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1161
#endif
1162
#endif //!RUNTIME_CPUDETECT
1163
}
1164

    
1165
/* Wrapper functions for yuv2bgr */
1166
/**
 * Unscaled planar YUV -> BGR slice converter built on yuv2rgb().
 *
 * For YV12 the chroma planes are handed over as src[1], src[2]; for every
 * other source format (I420 & IYUV) they are swapped. srcStride[1] is used as
 * the stride of both chroma planes.
 *
 * @param srcSliceY first row of the slice, selects the output row in dstParam[0]
 * @param srcSliceH number of rows in the slice
 */
static void planarYuvToBgr(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dstParam[], int dstStride[]){
        const int isYV12= (c->srcFormat==IMGFMT_YV12);
        uint8_t *chroma1= isYV12 ? src[1] : src[2];
        uint8_t *chroma2= isYV12 ? src[2] : src[1];
        uint8_t *out= dstParam[0] + dstStride[0]*srcSliceY;

        yuv2rgb( out,src[0],chroma1,chroma2,c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
}
1175

    
1176
/**
 * Unscaled 24 bit -> 32 bit packed RGB slice converter (getSwsContext() uses
 * it for BGR24->BGR32 and RGB24->RGB32).
 *
 * @param src,srcStride  source slice, only plane 0 is used
 * @param srcSliceY      first row of the slice, selects the output row in dst[0]
 * @param srcSliceH      number of rows in the slice
 * @param dst,dstStride  destination picture, only plane 0 is used
 */
static void bgr24to32Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
        uint8_t *srcPtr= src[0];
        uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;

        /* The whole slice can be converted with one call if the strides have the
           3:4 ratio of the pixel sizes. This needs a positive stride: with a
           negative one (bottom-up picture) srcSliceH*srcStride[0] would be a
           negative byte count, so such pictures are converted row by row. */
        if(srcStride[0]>0 && dstStride[0]*3==srcStride[0]*4)
                rgb24to32(srcPtr, dstPtr, srcSliceH*srcStride[0]);
        else
        {
                int i;

                for(i=0; i<srcSliceH; i++)
                {
                        rgb24to32(srcPtr, dstPtr, c->srcW*3); // 3 source bytes per pixel
                        srcPtr+= srcStride[0];
                        dstPtr+= dstStride[0];
                }
        }
}
1195

    
1196
/**
 * Unscaled 32 bit -> 24 bit packed RGB slice converter (getSwsContext() uses
 * it for BGR32->BGR24 and RGB32->RGB24).
 *
 * @param src,srcStride  source slice, only plane 0 is used
 * @param srcSliceY      first row of the slice, selects the output row in dst[0]
 * @param srcSliceH      number of rows in the slice
 * @param dst,dstStride  destination picture, only plane 0 is used
 */
static void bgr32to24Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
        uint8_t *srcPtr= src[0];
        uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;

        /* The whole slice can be converted with one call if the strides have the
           4:3 ratio of the pixel sizes. This needs a positive stride: with a
           negative one (bottom-up picture) srcSliceH*srcStride[0] would be a
           negative byte count, so such pictures are converted row by row. */
        if(srcStride[0]>0 && dstStride[0]*4==srcStride[0]*3)
                rgb32to24(srcPtr, dstPtr, srcSliceH*srcStride[0]);
        else
        {
                int i;

                for(i=0; i<srcSliceH; i++)
                {
                        rgb32to24(srcPtr, dstPtr, c->srcW<<2); // 4 source bytes per pixel
                        srcPtr+= srcStride[0];
                        dstPtr+= dstStride[0];
                }
        }
}
1215

    
1216
/**
 * Unscaled BGR15 -> BGR16 slice converter.
 *
 * @param src,srcStride  source slice, only plane 0 is used
 * @param srcSliceY      first row of the slice, selects the output row in dst[0]
 * @param srcSliceH      number of rows in the slice
 * @param dst,dstStride  destination picture, only plane 0 is used
 */
static void bgr15to16Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
        uint8_t *srcPtr= src[0];
        uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;

        /* Both formats use the same pixel size, so equal strides allow converting
           the whole slice with one call. This needs a positive stride: with a
           negative one (bottom-up picture) srcSliceH*srcStride[0] would be a
           negative byte count, so such pictures are converted row by row. */
        if(srcStride[0]>0 && dstStride[0]==srcStride[0])
                rgb15to16(srcPtr, dstPtr, srcSliceH*srcStride[0]);
        else
        {
                int i;

                for(i=0; i<srcSliceH; i++)
                {
                        rgb15to16(srcPtr, dstPtr, c->srcW<<1); // 2 source bytes per pixel
                        srcPtr+= srcStride[0];
                        dstPtr+= dstStride[0];
                }
        }
}
1235

    
1236
/**
 * Unscaled BGR24 -> YV12 slice converter built on rgb24toyv12().
 *
 * The slice is written to row srcSliceY of plane 0 and to row srcSliceY>>1 of
 * planes 1 and 2. dstStride[1] is the only chroma stride handed to
 * rgb24toyv12().
 */
static void bgr24toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
        const int chromaRow= srcSliceY>>1;
        uint8_t *plane0= dst[0] + srcSliceY*dstStride[0];
        uint8_t *plane1= dst[1] + chromaRow*dstStride[1];
        uint8_t *plane2= dst[2] + chromaRow*dstStride[2];

        rgb24toyv12(src[0], plane0, plane1, plane2,
                c->srcW, srcSliceH,
                dstStride[0], dstStride[1], srcStride[0]);
}
1247

    
1248

    
1249
/* unscaled copy like stuff (assumes nearly identical formats) */
1250
/**
 * Unscaled copy of one slice between (nearly) identical formats.
 *
 * Packed formats are copied from plane 0 only. Everything else is treated as
 * planar: plane 0 at full size, planes 1 and 2 at half width and half height
 * (rounded up). For I420 the second and third plane are swapped on the source
 * and/or destination side, so that planes of the same kind are matched.
 *
 * @param srcParam,srcStrideParam  source slice planes and their strides
 * @param srcSliceY                first row of the slice in the destination
 * @param srcSliceH                number of rows in the slice
 * @param dstParam,dstStride       destination picture planes and their strides
 */
static void simpleCopy(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
             int srcSliceH, uint8_t* dstParam[], int dstStride[]){

        /* start from "no plane" so that a format matched by none of the branches
           below can never be read uninitialised */
        int srcStride[3]= {0, 0, 0};
        uint8_t *src[3]= {NULL, NULL, NULL};
        uint8_t *dst[3];

        if(c->srcFormat == IMGFMT_I420){
                src[0]= srcParam[0];
                src[1]= srcParam[2];
                src[2]= srcParam[1];
                srcStride[0]= srcStrideParam[0];
                srcStride[1]= srcStrideParam[2];
                srcStride[2]= srcStrideParam[1];
        }
        else if(c->srcFormat==IMGFMT_YV12){
                src[0]= srcParam[0];
                src[1]= srcParam[1];
                src[2]= srcParam[2];
                srcStride[0]= srcStrideParam[0];
                srcStride[1]= srcStrideParam[1];
                srcStride[2]= srcStrideParam[2];
        }
        else if(isPacked(c->srcFormat) || isGray(c->srcFormat)){
                /* single plane formats: planes 1 and 2 stay NULL */
                src[0]= srcParam[0];
                srcStride[0]= srcStrideParam[0];
        }

        if(c->dstFormat == IMGFMT_I420){
                dst[0]= dstParam[0];
                dst[1]= dstParam[2];
                dst[2]= dstParam[1];
        }else{
                dst[0]= dstParam[0];
                dst[1]= dstParam[1];
                dst[2]= dstParam[2];
        }

        if(isPacked(c->srcFormat))
        {
                /* One memcpy for the whole slice is only possible for equal and
                   positive strides; with a negative stride the byte count
                   srcSliceH*dstStride[0] would be negative. */
                if(dstStride[0]==srcStride[0] && srcStride[0]>0)
                        memcpy(dst[0] + dstStride[0]*srcSliceY, src[0], srcSliceH*dstStride[0]);
                else
                {
                        int i;
                        uint8_t *srcPtr= src[0];
                        uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
                        int length=0;

                        /* universal length finder: the largest multiple of srcW
                           that fits into both strides is used as the row length
                           in bytes, the pixel size does not need to be known */
                        while(length+c->srcW <= ABS(dstStride[0]) 
                           && length+c->srcW <= ABS(srcStride[0])) length+= c->srcW;
                        ASSERT(length!=0);

                        for(i=0; i<srcSliceH; i++)
                        {
                                memcpy(dstPtr, srcPtr, length);
                                srcPtr+= srcStride[0];
                                dstPtr+= dstStride[0];
                        }
                }
        }
        else 
        { /* Planar YUV or gray */
                int plane;
                for(plane=0; plane<3; plane++)
                {
                        int length= plane==0 ? c->srcW  : ((c->srcW+1)>>1);
                        int y=      plane==0 ? srcSliceY: ((srcSliceY+1)>>1);
                        int height= plane==0 ? srcSliceH: ((srcSliceH+1)>>1);

                        /* gray sources have no second / third plane; there is
                           nothing to copy and memcpy must not get a NULL source */
                        if(src[plane]==NULL) continue;

                        /* same restriction on the one-shot copy as in the packed case */
                        if(dstStride[plane]==srcStride[plane] && srcStride[plane]>0)
                                memcpy(dst[plane] + dstStride[plane]*y, src[plane], height*dstStride[plane]);
                        else
                        {
                                int i;
                                uint8_t *srcPtr= src[plane];
                                uint8_t *dstPtr= dst[plane] + dstStride[plane]*y;
                                for(i=0; i<height; i++)
                                {
                                        memcpy(dstPtr, srcPtr, length);
                                        srcPtr+= srcStride[plane];
                                        dstPtr+= dstStride[plane];
                                }
                        }
                }
        }
}
1343

    
1344
SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
1345
                         SwsFilter *srcFilter, SwsFilter *dstFilter){
1346

    
1347
        SwsContext *c;
1348
        int i;
1349
        int usesFilter;
1350
        SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
1351

    
1352
#ifdef ARCH_X86
1353
        if(gCpuCaps.hasMMX)
1354
                asm volatile("emms\n\t"::: "memory");
1355
#endif
1356

    
1357
        if(swScale==NULL) globalInit();
1358

    
1359
        /* avoid dupplicate Formats, so we dont need to check to much */
1360
        if(srcFormat==IMGFMT_IYUV) srcFormat=IMGFMT_I420;
1361
        if(srcFormat==IMGFMT_Y8)   srcFormat=IMGFMT_Y800;
1362
        if(dstFormat==IMGFMT_Y8)   dstFormat=IMGFMT_Y800;
1363

    
1364
        if(!isSupportedIn(srcFormat)) 
1365
        {
1366
                fprintf(stderr, "swScaler: %s is not supported as input format\n", vo_format_name(srcFormat));
1367
                return NULL;
1368
        }
1369
        if(!isSupportedOut(dstFormat))
1370
        {
1371
                fprintf(stderr, "swScaler: %s is not supported as output format\n", vo_format_name(dstFormat));
1372
                return NULL;
1373
        }
1374

    
1375
        /* sanity check */
1376
        if(srcW<4 || srcH<1 || dstW<8 || dstH<1) //FIXME check if these are enough and try to lowwer them after fixing the relevant parts of the code
1377
        {
1378
                fprintf(stderr, "swScaler: %dx%d -> %dx%d is invalid scaling dimension\n", 
1379
                        srcW, srcH, dstW, dstH);
1380
                return NULL;
1381
        }
1382

    
1383
        if(!dstFilter) dstFilter= &dummyFilter;
1384
        if(!srcFilter) srcFilter= &dummyFilter;
1385

    
1386
        c= memalign(64, sizeof(SwsContext));
1387
        memset(c, 0, sizeof(SwsContext));
1388

    
1389
        c->srcW= srcW;
1390
        c->srcH= srcH;
1391
        c->dstW= dstW;
1392
        c->dstH= dstH;
1393
        c->lumXInc= ((srcW<<16) + (dstW>>1))/dstW;
1394
        c->lumYInc= ((srcH<<16) + (dstH>>1))/dstH;
1395
        c->flags= flags;
1396
        c->dstFormat= dstFormat;
1397
        c->srcFormat= srcFormat;
1398

    
1399
        usesFilter=0;
1400
        if(dstFilter->lumV!=NULL && dstFilter->lumV->length>1) usesFilter=1;
1401
        if(dstFilter->lumH!=NULL && dstFilter->lumH->length>1) usesFilter=1;
1402
        if(dstFilter->chrV!=NULL && dstFilter->chrV->length>1) usesFilter=1;
1403
        if(dstFilter->chrH!=NULL && dstFilter->chrH->length>1) usesFilter=1;
1404
        if(srcFilter->lumV!=NULL && srcFilter->lumV->length>1) usesFilter=1;
1405
        if(srcFilter->lumH!=NULL && srcFilter->lumH->length>1) usesFilter=1;
1406
        if(srcFilter->chrV!=NULL && srcFilter->chrV->length>1) usesFilter=1;
1407
        if(srcFilter->chrH!=NULL && srcFilter->chrH->length>1) usesFilter=1;
1408
        
1409
        /* unscaled special Cases */
1410
        if(srcW==dstW && srcH==dstH && !usesFilter)
1411
        {
1412
                /* yuv2bgr */
1413
                if(isPlanarYUV(srcFormat) && isBGR(dstFormat))
1414
                {
1415
                        // FIXME multiple yuv2rgb converters wont work that way cuz that thing is full of globals&statics
1416
#ifdef WORDS_BIGENDIAN
1417
                        yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_BGR);
1418
#else
1419
                        yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
1420
#endif
1421
                        c->swScale= planarYuvToBgr;
1422

    
1423
                        if(flags&SWS_PRINT_INFO)
1424
                                printf("SwScaler: using unscaled %s -> %s special converter\n", 
1425
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1426
                        return c;
1427
                }
1428

    
1429
                /* simple copy */
1430
                if(srcFormat == dstFormat || (isPlanarYUV(srcFormat) && isPlanarYUV(dstFormat)))
1431
                {
1432
                        c->swScale= simpleCopy;
1433

    
1434
                        if(flags&SWS_PRINT_INFO)
1435
                                printf("SwScaler: using unscaled %s -> %s special converter\n", 
1436
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1437
                        return c;
1438
                }
1439
                
1440
                /* bgr32to24 & rgb32to24*/
1441
                if((srcFormat==IMGFMT_BGR32 && dstFormat==IMGFMT_BGR24)
1442
                 ||(srcFormat==IMGFMT_RGB32 && dstFormat==IMGFMT_RGB24))
1443
                {
1444
                        c->swScale= bgr32to24Wrapper;
1445

    
1446
                        if(flags&SWS_PRINT_INFO)
1447
                                printf("SwScaler: using unscaled %s -> %s special converter\n", 
1448
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1449
                        return c;
1450
                }
1451
                
1452
                /* bgr24to32 & rgb24to32*/
1453
                if((srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_BGR32)
1454
                 ||(srcFormat==IMGFMT_RGB24 && dstFormat==IMGFMT_RGB32))
1455
                {
1456
                        c->swScale= bgr24to32Wrapper;
1457

    
1458
                        if(flags&SWS_PRINT_INFO)
1459
                                printf("SwScaler: using unscaled %s -> %s special converter\n", 
1460
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1461
                        return c;
1462
                }
1463

    
1464
                /* bgr15to16 */
1465
                if(srcFormat==IMGFMT_BGR15 && dstFormat==IMGFMT_BGR16)
1466
                {
1467
                        c->swScale= bgr15to16Wrapper;
1468

    
1469
                        if(flags&SWS_PRINT_INFO)
1470
                                printf("SwScaler: using unscaled %s -> %s special converter\n", 
1471
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1472
                        return c;
1473
                }
1474

    
1475
                /* bgr24toYV12 */
1476
                if(srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_YV12)
1477
                {
1478
                        c->swScale= bgr24toyv12Wrapper;
1479

    
1480
                        if(flags&SWS_PRINT_INFO)
1481
                                printf("SwScaler: using unscaled %s -> %s special converter\n", 
1482
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1483
                        return c;
1484
                }
1485
        }
1486

    
1487
        if(cpuCaps.hasMMX2)
1488
        {
1489
                c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
1490
                if(!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR))
1491
                {
1492
                        if(flags&SWS_PRINT_INFO)
1493
                                fprintf(stderr, "SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
1494
                }
1495
        }
1496
        else
1497
                c->canMMX2BeUsed=0;
1498

    
1499

    
1500
        /* dont use full vertical UV input/internaly if the source doesnt even have it */
1501
        if(isHalfChrV(srcFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_V);
1502
        /* dont use full horizontal UV input if the source doesnt even have it */
1503
        if(isHalfChrH(srcFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_H_INP);
1504
        /* dont use full horizontal UV internally if the destination doesnt even have it */
1505
        if(isHalfChrH(dstFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_H_INT);
1506

    
1507
        if(flags&SWS_FULL_CHR_H_INP)        c->chrSrcW= srcW;
1508
        else                                c->chrSrcW= (srcW+1)>>1;
1509

    
1510
        if(flags&SWS_FULL_CHR_H_INT)        c->chrDstW= dstW;
1511
        else                                c->chrDstW= (dstW+1)>>1;
1512

    
1513
        if(flags&SWS_FULL_CHR_V)        c->chrSrcH= srcH;
1514
        else                                c->chrSrcH= (srcH+1)>>1;
1515

    
1516
        if(isHalfChrV(dstFormat))        c->chrDstH= (dstH+1)>>1;
1517
        else                                c->chrDstH= dstH;
1518

    
1519
        c->chrXInc= ((c->chrSrcW<<16) + (c->chrDstW>>1))/c->chrDstW;
1520
        c->chrYInc= ((c->chrSrcH<<16) + (c->chrDstH>>1))/c->chrDstH;
1521

    
1522

    
1523
        // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
1524
        // but only for the FAST_BILINEAR mode otherwise do correct scaling
1525
        // n-2 is the last chrominance sample available
1526
        // this is not perfect, but noone shuld notice the difference, the more correct variant
1527
        // would be like the vertical one, but that would require some special code for the
1528
        // first and last pixel
1529
        if(flags&SWS_FAST_BILINEAR)
1530
        {
1531
                if(c->canMMX2BeUsed)
1532
                {
1533
                        c->lumXInc+= 20;
1534
                        c->chrXInc+= 20;
1535
                }
1536
                //we dont use the x86asm scaler if mmx is available
1537
                else if(cpuCaps.hasMMX)
1538
                {
1539
                        c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
1540
                        c->chrXInc = ((c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20;
1541
                }
1542
        }
1543

    
1544
        /* precalculate horizontal scaler filter coefficients */
1545
        {
1546
                const int filterAlign= cpuCaps.hasMMX ? 4 : 1;
1547

    
1548
                initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
1549
                                 srcW      ,       dstW, filterAlign, 1<<14, flags,
1550
                                 srcFilter->lumH, dstFilter->lumH);
1551
                initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
1552
                                (srcW+1)>>1, c->chrDstW, filterAlign, 1<<14, flags,
1553
                                 srcFilter->chrH, dstFilter->chrH);
1554

    
1555
#ifdef ARCH_X86
1556
// cant downscale !!!
1557
                if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
1558
                {
1559
                        initMMX2HScaler(      dstW, c->lumXInc, c->funnyYCode);
1560
                        initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode);
1561
                }
1562
#endif
1563
        } // Init Horizontal stuff
1564

    
1565

    
1566

    
1567
        /* precalculate vertical scaler filter coefficients */
1568
        initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
1569
                        srcH      ,        dstH, 1, (1<<12)-4, flags,
1570
                        srcFilter->lumV, dstFilter->lumV);
1571
        initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
1572
                        (srcH+1)>>1, c->chrDstH, 1, (1<<12)-4, flags,
1573
                         srcFilter->chrV, dstFilter->chrV);
1574

    
1575
        // Calculate buffer sizes so that they won't run out while handling these damn slices
1576
        c->vLumBufSize= c->vLumFilterSize;
1577
        c->vChrBufSize= c->vChrFilterSize;
1578
        for(i=0; i<dstH; i++)
1579
        {
1580
                int chrI= i*c->chrDstH / dstH;
1581
                int nextSlice= MAX(c->vLumFilterPos[i   ] + c->vLumFilterSize - 1,
1582
                                 ((c->vChrFilterPos[chrI] + c->vChrFilterSize - 1)<<1));
1583
                nextSlice&= ~1; // Slices start at even boundaries
1584
                if(c->vLumFilterPos[i   ] + c->vLumBufSize < nextSlice)
1585
                        c->vLumBufSize= nextSlice - c->vLumFilterPos[i   ];
1586
                if(c->vChrFilterPos[chrI] + c->vChrBufSize < (nextSlice>>1))
1587
                        c->vChrBufSize= (nextSlice>>1) - c->vChrFilterPos[chrI];
1588
        }
1589

    
1590
        // allocate pixbufs (we use dynamic allocation because otherwise we would need to
1591
        c->lumPixBuf= (int16_t**)memalign(4, c->vLumBufSize*2*sizeof(int16_t*));
1592
        c->chrPixBuf= (int16_t**)memalign(4, c->vChrBufSize*2*sizeof(int16_t*));
1593
        //Note we need at least one pixel more at the end because of the mmx code (just in case someone wanna replace the 4000/8000)
1594
        for(i=0; i<c->vLumBufSize; i++)
1595
                c->lumPixBuf[i]= c->lumPixBuf[i+c->vLumBufSize]= (uint16_t*)memalign(8, 4000);
1596
        for(i=0; i<c->vChrBufSize; i++)
1597
                c->chrPixBuf[i]= c->chrPixBuf[i+c->vChrBufSize]= (uint16_t*)memalign(8, 8000);
1598

    
1599
        //try to avoid drawing green stuff between the right end and the stride end
1600
        for(i=0; i<c->vLumBufSize; i++) memset(c->lumPixBuf[i], 0, 4000);
1601
        for(i=0; i<c->vChrBufSize; i++) memset(c->chrPixBuf[i], 64, 8000);
1602

    
1603
        ASSERT(c->chrDstH <= dstH)
1604

    
1605
        // pack filter data for mmx code
1606
        if(cpuCaps.hasMMX)
1607
        {
1608
                c->lumMmxFilter= (int16_t*)memalign(8, c->vLumFilterSize*      dstH*4*sizeof(int16_t));
1609
                c->chrMmxFilter= (int16_t*)memalign(8, c->vChrFilterSize*c->chrDstH*4*sizeof(int16_t));
1610
                for(i=0; i<c->vLumFilterSize*dstH; i++)
1611
                        c->lumMmxFilter[4*i]=c->lumMmxFilter[4*i+1]=c->lumMmxFilter[4*i+2]=c->lumMmxFilter[4*i+3]=
1612
                                c->vLumFilter[i];
1613
                for(i=0; i<c->vChrFilterSize*c->chrDstH; i++)
1614
                        c->chrMmxFilter[4*i]=c->chrMmxFilter[4*i+1]=c->chrMmxFilter[4*i+2]=c->chrMmxFilter[4*i+3]=
1615
                                c->vChrFilter[i];
1616
        }
1617

    
1618
        if(flags&SWS_PRINT_INFO)
1619
        {
1620
#ifdef DITHER1XBPP
1621
                char *dither= " dithered";
1622
#else
1623
                char *dither= "";
1624
#endif
1625
                if(flags&SWS_FAST_BILINEAR)
1626
                        fprintf(stderr, "\nSwScaler: FAST_BILINEAR scaler, ");
1627
                else if(flags&SWS_BILINEAR)
1628
                        fprintf(stderr, "\nSwScaler: BILINEAR scaler, ");
1629
                else if(flags&SWS_BICUBIC)
1630
                        fprintf(stderr, "\nSwScaler: BICUBIC scaler, ");
1631
                else if(flags&SWS_X)
1632
                        fprintf(stderr, "\nSwScaler: Experimental scaler, ");
1633
                else if(flags&SWS_POINT)
1634
                        fprintf(stderr, "\nSwScaler: Nearest Neighbor / POINT scaler, ");
1635
                else if(flags&SWS_AREA)
1636
                        fprintf(stderr, "\nSwScaler: Area Averageing scaler, ");
1637
                else
1638
                        fprintf(stderr, "\nSwScaler: ehh flags invalid?! ");
1639

    
1640
                if(dstFormat==IMGFMT_BGR15 || dstFormat==IMGFMT_BGR16)
1641
                        fprintf(stderr, "from %s to%s %s ", 
1642
                                vo_format_name(srcFormat), dither, vo_format_name(dstFormat));
1643
                else
1644
                        fprintf(stderr, "from %s to %s ", 
1645
                                vo_format_name(srcFormat), vo_format_name(dstFormat));
1646

    
1647
                if(cpuCaps.hasMMX2)
1648
                        fprintf(stderr, "using MMX2\n");
1649
                else if(cpuCaps.has3DNow)
1650
                        fprintf(stderr, "using 3DNOW\n");
1651
                else if(cpuCaps.hasMMX)
1652
                        fprintf(stderr, "using MMX\n");
1653
                else
1654
                        fprintf(stderr, "using C\n");
1655
        }
1656

    
1657
        if((flags & SWS_PRINT_INFO) && verbose)
1658
        {
1659
                if(cpuCaps.hasMMX)
1660
                {
1661
                        if(c->canMMX2BeUsed && (flags&SWS_FAST_BILINEAR))
1662
                                printf("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
1663
                        else
1664
                        {
1665
                                if(c->hLumFilterSize==4)
1666
                                        printf("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
1667
                                else if(c->hLumFilterSize==8)
1668
                                        printf("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
1669
                                else
1670
                                        printf("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
1671

    
1672
                                if(c->hChrFilterSize==4)
1673
                                        printf("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
1674
                                else if(c->hChrFilterSize==8)
1675
                                        printf("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
1676
                                else
1677
                                        printf("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
1678
                        }
1679
                }
1680
                else
1681
                {
1682
#ifdef ARCH_X86
1683
                        printf("SwScaler: using X86-Asm scaler for horizontal scaling\n");
1684
#else
1685
                        if(flags & SWS_FAST_BILINEAR)
1686
                                printf("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
1687
                        else
1688
                                printf("SwScaler: using C scaler for horizontal scaling\n");
1689
#endif
1690
                }
1691
                if(isPlanarYUV(dstFormat))
1692
                {
1693
                        if(c->vLumFilterSize==1)
1694
                                printf("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
1695
                        else
1696
                                printf("SwScaler: using n-tap %s scaler for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
1697
                }
1698
                else
1699
                {
1700
                        if(c->vLumFilterSize==1 && c->vChrFilterSize==2)
1701
                                printf("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
1702
                                       "SwScaler:       2-tap scaler for vertical chrominance scaling (BGR)\n",cpuCaps.hasMMX ? "MMX" : "C");
1703
                        else if(c->vLumFilterSize==2 && c->vChrFilterSize==2)
1704
                                printf("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
1705
                        else
1706
                                printf("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
1707
                }
1708

    
1709
                if(dstFormat==IMGFMT_BGR24)
1710
                        printf("SwScaler: using %s YV12->BGR24 Converter\n",
1711
                                cpuCaps.hasMMX2 ? "MMX2" : (cpuCaps.hasMMX ? "MMX" : "C"));
1712
                else if(dstFormat==IMGFMT_BGR32)
1713
                        printf("SwScaler: using %s YV12->BGR32 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1714
                else if(dstFormat==IMGFMT_BGR16)
1715
                        printf("SwScaler: using %s YV12->BGR16 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1716
                else if(dstFormat==IMGFMT_BGR15)
1717
                        printf("SwScaler: using %s YV12->BGR15 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1718

    
1719
                printf("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
1720
        }
1721
        if((flags & SWS_PRINT_INFO) && verbose>1)
1722
        {
1723
                printf("SwScaler:Lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
1724
                        c->srcW, c->srcH, c->dstW, c->dstH, c->lumXInc, c->lumYInc);
1725
                printf("SwScaler:Chr srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
1726
                        c->chrSrcW, c->chrSrcH, c->chrDstW, c->chrDstH, c->chrXInc, c->chrYInc);
1727
        }
1728

    
1729
        c->swScale= swScale;
1730
        return c;
1731
}
1732

    
1733
/**
1734
 * returns a normalized gaussian curve used to filter stuff
1735
 * quality=3 is high quality, lowwer is lowwer quality
1736
 */
1737

    
1738
SwsVector *getGaussianVec(double variance, double quality){
1739
        const int length= (int)(variance*quality + 0.5) | 1;
1740
        int i;
1741
        double *coeff= memalign(sizeof(double), length*sizeof(double));
1742
        double middle= (length-1)*0.5;
1743
        SwsVector *vec= malloc(sizeof(SwsVector));
1744

    
1745
        vec->coeff= coeff;
1746
        vec->length= length;
1747

    
1748
        for(i=0; i<length; i++)
1749
        {
1750
                double dist= i-middle;
1751
                coeff[i]= exp( -dist*dist/(2*variance*variance) ) / sqrt(2*variance*PI);
1752
        }
1753

    
1754
        normalizeVec(vec, 1.0);
1755

    
1756
        return vec;
1757
}
1758

    
1759
SwsVector *getConstVec(double c, int length){
1760
        int i;
1761
        double *coeff= memalign(sizeof(double), length*sizeof(double));
1762
        SwsVector *vec= malloc(sizeof(SwsVector));
1763

    
1764
        vec->coeff= coeff;
1765
        vec->length= length;
1766

    
1767
        for(i=0; i<length; i++)
1768
                coeff[i]= c;
1769

    
1770
        return vec;
1771
}
1772

    
1773

    
1774
SwsVector *getIdentityVec(void){
1775
        double *coeff= memalign(sizeof(double), sizeof(double));
1776
        SwsVector *vec= malloc(sizeof(SwsVector));
1777
        coeff[0]= 1.0;
1778

    
1779
        vec->coeff= coeff;
1780
        vec->length= 1;
1781

    
1782
        return vec;
1783
}
1784

    
1785
/**
 * Rescales the coefficients of `a` in place so that they sum to `height`.
 *
 * @param a      vector to normalize (modified in place)
 * @param height desired sum of all coefficients after the call
 *
 * If the coefficients currently sum to 0 the scale factor is not finite and
 * the resulting coefficients are not meaningful.
 */
void normalizeVec(SwsVector *a, double height){
        int i;
        double sum=0;
        double inv;

        for(i=0; i<a->length; i++)
                sum+= a->coeff[i];

        /* factor that maps the current sum onto the requested height */
        inv= height/sum;

        /* scale by inv, not by height: multiplying by height would leave the
           sum at sum*height instead of height */
        for(i=0; i<a->length; i++)
                a->coeff[i]*= inv;
}
1798

    
1799
/**
 * Multiplies every coefficient of `a` by `scalar`, in place.
 */
void scaleVec(SwsVector *a, double scalar)
{
        int k = 0;

        while (k < a->length) {
                a->coeff[k] *= scalar;
                k++;
        }
}
1805

    
1806
/**
 * Returns a newly allocated vector holding the discrete convolution of `a`
 * and `b`; its length is a->length + b->length - 1. Neither input is
 * modified. The caller owns the result.
 */
static SwsVector *getConvVec(SwsVector *a, SwsVector *b)
{
        const int outLen = a->length + b->length - 1;
        double *out = memalign(sizeof(double), outLen*sizeof(double));
        SwsVector *result = malloc(sizeof(SwsVector));
        int ai, bi, k;

        result->length = outLen;
        result->coeff = out;

        /* start from an all-zero accumulator */
        for (k = 0; k < outLen; k++)
                out[k] = 0.0;

        /* every pair (ai, bi) contributes its product to output tap ai+bi */
        for (ai = 0; ai < a->length; ai++) {
                for (bi = 0; bi < b->length; bi++)
                        out[ai+bi] += a->coeff[ai]*b->coeff[bi];
        }

        return result;
}
1827

    
1828
/**
 * Returns a newly allocated vector holding a + b.
 *
 * The result is as long as the longer input; each input is placed so that
 * its middle tap lines up with the middle tap of the result before being
 * added. Neither input is modified. The caller owns the result.
 */
static SwsVector *sumVec(SwsVector *a, SwsVector *b)
{
        const int outLen = MAX(a->length, b->length);
        double *out = memalign(sizeof(double), outLen*sizeof(double));
        SwsVector *result = malloc(sizeof(SwsVector));
        const int mid = (outLen-1)/2;
        const int offA = mid - (a->length-1)/2; /* where a[0] lands in out */
        const int offB = mid - (b->length-1)/2; /* where b[0] lands in out */
        int k;

        result->length = outLen;
        result->coeff = out;

        for (k = 0; k < outLen; k++)
                out[k] = 0.0;

        for (k = 0; k < a->length; k++)
                out[offA + k] += a->coeff[k];
        for (k = 0; k < b->length; k++)
                out[offB + k] += b->coeff[k];

        return result;
}
1844

    
1845
/**
 * Returns a newly allocated vector holding a - b.
 *
 * The result is as long as the longer input; each input is placed so that
 * its middle tap lines up with the middle tap of the result, then `a` is
 * added and `b` is subtracted. Neither input is modified. The caller owns
 * the result.
 */
static SwsVector *diffVec(SwsVector *a, SwsVector *b)
{
        const int outLen = MAX(a->length, b->length);
        double *out = memalign(sizeof(double), outLen*sizeof(double));
        SwsVector *result = malloc(sizeof(SwsVector));
        const int mid = (outLen-1)/2;
        const int offA = mid - (a->length-1)/2; /* where a[0] lands in out */
        const int offB = mid - (b->length-1)/2; /* where b[0] lands in out */
        int k;

        result->length = outLen;
        result->coeff = out;

        for (k = 0; k < outLen; k++)
                out[k] = 0.0;

        for (k = 0; k < a->length; k++)
                out[offA + k] += a->coeff[k];
        for (k = 0; k < b->length; k++)
                out[offB + k] -= b->coeff[k];

        return result;
}
1861

    
1862
/* shift left / or right if "shift" is negative */
1863
static SwsVector *getShiftedVec(SwsVector *a, int shift){
1864
        int length= a->length + ABS(shift)*2;
1865
        double *coeff= memalign(sizeof(double), length*sizeof(double));
1866
        int i;
1867
        SwsVector *vec= malloc(sizeof(SwsVector));
1868

    
1869
        vec->coeff= coeff;
1870
        vec->length= length;
1871

    
1872
        for(i=0; i<length; i++) coeff[i]= 0.0;
1873

    
1874
        for(i=0; i<a->length; i++)
1875
        {
1876
                coeff[i + (length-1)/2 - (a->length-1)/2 - shift]= a->coeff[i];
1877
        }
1878

    
1879
        return vec;
1880
}
1881

    
1882
/**
 * Replaces the contents of `a` with the result of getShiftedVec(a, shift).
 * The old coefficient array is freed; `a` takes ownership of the new one.
 */
void shiftVec(SwsVector *a, int shift)
{
        SwsVector *tmp = getShiftedVec(a, shift);
        double *stale = a->coeff;

        /* steal the coefficient array, then drop the temporary shell */
        a->length = tmp->length;
        a->coeff = tmp->coeff;

        free(stale);
        free(tmp);
}
1889

    
1890
/**
 * In-place addition: replaces the contents of `a` with sumVec(a, b).
 * The old coefficient array of `a` is freed; `b` is left untouched.
 */
void addVec(SwsVector *a, SwsVector *b)
{
        SwsVector *tmp = sumVec(a, b);
        double *stale = a->coeff;

        /* steal the coefficient array, then drop the temporary shell */
        a->length = tmp->length;
        a->coeff = tmp->coeff;

        free(stale);
        free(tmp);
}
1897

    
1898
/**
 * In-place subtraction: replaces the contents of `a` with diffVec(a, b).
 * The old coefficient array of `a` is freed; `b` is left untouched.
 */
void subVec(SwsVector *a, SwsVector *b)
{
        SwsVector *tmp = diffVec(a, b);
        double *stale = a->coeff;

        /* steal the coefficient array, then drop the temporary shell */
        a->length = tmp->length;
        a->coeff = tmp->coeff;

        free(stale);
        free(tmp);
}
1905

    
1906
/**
 * In-place convolution: replaces the contents of `a` with getConvVec(a, b).
 * The old coefficient array of `a` is freed; `b` is left untouched.
 */
void convVec(SwsVector *a, SwsVector *b)
{
        SwsVector *tmp = getConvVec(a, b);
        double *stale = a->coeff;

        /* steal the coefficient array, then drop the temporary shell */
        a->length = tmp->length;
        a->coeff = tmp->coeff;

        free(stale);
        free(tmp);
}
1913

    
1914
/**
 * Returns a newly allocated deep copy of `a` (same length, same
 * coefficients). The caller owns the copy and releases it with freeVec().
 */
SwsVector *cloneVec(SwsVector *a)
{
        double *copy = memalign(sizeof(double), a->length*sizeof(double));
        SwsVector *result = malloc(sizeof(SwsVector));
        int k = 0;

        result->length = a->length;
        result->coeff = copy;

        while (k < a->length) {
                copy[k] = a->coeff[k];
                k++;
        }

        return result;
}
1926

    
1927
/**
 * Prints a crude text plot of the vector to stdout, one line per coefficient.
 *
 * Each line shows the coefficient with three decimals, followed by a '|'
 * indented by 0..60 spaces according to where the value lies between the
 * smallest and largest coefficient. Both bounds start at 0, so 0 is always
 * inside the plotted range.
 */
void printVec(SwsVector *a){
        int i;
        double max=0;
        double min=0;
        double range;

        for(i=0; i<a->length; i++)
        {
                if(a->coeff[i]>max) max= a->coeff[i];
                if(a->coeff[i]<min) min= a->coeff[i];
        }

        range= max - min;

        for(i=0; i<a->length; i++)
        {
                /* An all-zero vector has range 0; dividing by it would yield
                   NaN, and converting NaN to int is undefined behaviour, so
                   plot such vectors at column 0 instead. */
                int x= 0;
                if(range != 0.0)
                        x= (int)((a->coeff[i]-min)*60.0/range +0.5);
                printf("%1.3f ", a->coeff[i]);
                for(;x>0; x--) printf(" ");
                printf("|\n");
        }
}
1949

    
1950
/**
 * Releases a vector and its coefficient array. Passing NULL is allowed and
 * does nothing.
 */
void freeVec(SwsVector *a)
{
        if (a == NULL)
                return;

        /* free(NULL) is a no-op, so no separate check on coeff is needed */
        free(a->coeff);
        a->length = 0;
        a->coeff = NULL;

        free(a);
}
1957

    
1958
/**
 * Releases a scaler context together with every buffer it references:
 * the luma/chroma line buffers, the horizontal and vertical filter
 * coefficient and position tables, and the packed MMX filter tables.
 * Passing NULL is allowed and does nothing.
 *
 * Only the first vLumBufSize / vChrBufSize entries of the line-pointer
 * arrays are freed: the context setup code in this file stores each line
 * pointer twice (at index i and at i + bufSize), so freeing the upper half
 * as well would free every line a second time.
 */
void freeSwsContext(SwsContext *c)
{
        int line;

        if (c == NULL)
                return;

        if (c->lumPixBuf != NULL) {
                for (line = 0; line < c->vLumBufSize; line++) {
                        free(c->lumPixBuf[line]);
                        c->lumPixBuf[line] = NULL;
                }
                free(c->lumPixBuf);
                c->lumPixBuf = NULL;
        }

        if (c->chrPixBuf != NULL) {
                for (line = 0; line < c->vChrBufSize; line++) {
                        free(c->chrPixBuf[line]);
                        c->chrPixBuf[line] = NULL;
                }
                free(c->chrPixBuf);
                c->chrPixBuf = NULL;
        }

        /* filter coefficient tables (free(NULL) is a no-op) */
        free(c->vLumFilter);
        c->vLumFilter = NULL;
        free(c->vChrFilter);
        c->vChrFilter = NULL;
        free(c->hLumFilter);
        c->hLumFilter = NULL;
        free(c->hChrFilter);
        c->hChrFilter = NULL;

        /* filter position tables */
        free(c->vLumFilterPos);
        c->vLumFilterPos = NULL;
        free(c->vChrFilterPos);
        c->vChrFilterPos = NULL;
        free(c->hLumFilterPos);
        c->hLumFilterPos = NULL;
        free(c->hChrFilterPos);
        c->hChrFilterPos = NULL;

        /* packed copies of the vertical filters used by the MMX code */
        free(c->lumMmxFilter);
        c->lumMmxFilter = NULL;
        free(c->chrMmxFilter);
        c->chrMmxFilter = NULL;

        free(c);
}
2010

    
2011