Statistics
| Branch: | Revision:

ffmpeg / postproc / swscale.c @ df3c183a

History | View | Annotate | Download (55.3 KB)

1
/*
2
    Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
3

4
    This program is free software; you can redistribute it and/or modify
5
    it under the terms of the GNU General Public License as published by
6
    the Free Software Foundation; either version 2 of the License, or
7
    (at your option) any later version.
8

9
    This program is distributed in the hope that it will be useful,
10
    but WITHOUT ANY WARRANTY; without even the implied warranty of
11
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
    GNU General Public License for more details.
13

14
    You should have received a copy of the GNU General Public License
15
    along with this program; if not, write to the Free Software
16
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17
*/
18

    
19
/*
20
  supported Input formats: YV12, I420, IYUV, YUY2, BGR32, BGR24, BGR16, BGR15, RGB32, RGB24, Y8, Y800
21
  supported output formats: YV12, I420, IYUV, BGR15, BGR16, BGR24, BGR32 (grayscale soon too)
22
  BGR15/16 support dithering
23
  
24
  unscaled special converters
25
  YV12/I420/IYUV -> BGR15/BGR16/BGR24/BGR32
26
  YV12/I420/IYUV -> YV12/I420/IYUV
27
  YUY2/BGR15/BGR16/BGR24/BGR32/RGB24/RGB32 -> same format
28
  BGR24 -> BGR32 & RGB24 -> RGB32
29
  BGR32 -> BGR24 & RGB32 -> RGB24
30
  BGR15 -> BGR16
31
*/
32

    
33
/* 
34
tested special converters
35
 YV12/I420 -> BGR16
36
 YV12 -> YV12
37
 BGR15 -> BGR16
38
 BGR16 -> BGR16
39

40
untested special converters
41
  YV12/I420 -> BGR15/BGR24/BGR32 (its the yuv2rgb stuff, so it should be ok)
42
  YV12/I420 -> YV12/I420
43
  YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
44
  BGR24 -> BGR32 & RGB24 -> RGB32
45
  BGR32 -> BGR24 & RGB32 -> RGB24
46
  BGR24 -> YV12
47
*/
48

    
49
#include <inttypes.h>
50
#include <string.h>
51
#include <math.h>
52
#include <stdio.h>
53
#include "../config.h"
54
#include "../mangle.h"
55
#ifdef HAVE_MALLOC_H
56
#include <malloc.h>
57
#endif
58
#include "swscale.h"
59
#include "../cpudetect.h"
60
#include "../bswap.h"
61
#include "../libvo/img_format.h"
62
#include "rgb2rgb.h"
63
#include "../libvo/fastmemcpy.h"
64
#undef MOVNTQ
65
#undef PAVGB
66

    
67
//#undef HAVE_MMX2
68
//#define HAVE_3DNOW
69
//#undef HAVE_MMX
70
//#undef ARCH_X86
71
#define DITHER1XBPP
72

    
73
#define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit
74

    
75
#define RET 0xC3 //near return opcode for X86
76

    
77
#ifdef MP_DEBUG
78
#define ASSERT(x) if(!(x)) { printf("ASSERT " #x " failed\n"); *((int*)0)=0; }
79
#else
80
#define ASSERT(x) ;
81
#endif
82

    
83
#ifdef M_PI
84
#define PI M_PI
85
#else
86
#define PI 3.14159265358979323846
87
#endif
88

    
89
//FIXME replace this with something faster
90
/*
 * Pixel-format classification predicates.
 * Each macro expands to a boolean expression that compares its argument
 * against IMGFMT_* constants (declared outside this file; img_format.h is
 * included above). The argument is expanded several times, so it must not
 * have side effects.
 *
 *   isPlanarYUV    - YV12 or I420
 *   isYUV          - YUY2 or any isPlanarYUV format
 *   isHalfChrV     - YV12 or I420 (presumably: chroma halved vertically)
 *   isHalfChrH     - YUY2, YV12 or I420 (presumably: chroma halved horizontally)
 *   isPacked       - YUY2, or any format whose masked value equals IMGFMT_BGR / IMGFMT_RGB
 *   isGray         - Y800
 *   isSupportedIn  - formats accepted as scaler input
 *   isSupportedOut - formats accepted as scaler output (note: no Y800/Y8 here)
 *   isBGR          - BGR32, BGR24, BGR16 or BGR15
 */
#define isPlanarYUV(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
91
#define isYUV(x)       ((x)==IMGFMT_YUY2 || isPlanarYUV(x))
92
#define isHalfChrV(x)  ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
93
#define isHalfChrH(x)  ((x)==IMGFMT_YUY2 || (x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
94
#define isPacked(x)    ((x)==IMGFMT_YUY2 || ((x)&IMGFMT_BGR_MASK)==IMGFMT_BGR || ((x)&IMGFMT_RGB_MASK)==IMGFMT_RGB)
95
#define isGray(x)      ((x)==IMGFMT_Y800)
96
#define isSupportedIn(x)  ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_YUY2 \
97
                        || (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15\
98
                        || (x)==IMGFMT_RGB32|| (x)==IMGFMT_RGB24\
99
                        || (x)==IMGFMT_Y800)
100
#define isSupportedOut(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 \
101
                        || (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15)
102
#define isBGR(x)       ((x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15)
103

    
104
/*
 * Fixed-point RGB -> YUV coefficients.
 * Each constant is a fractional weight multiplied by 2^RGB2YUV_SHIFT and
 * converted to int. The first letter names the source channel (B/G/R) and
 * the second the destination component (Y/U/V) - presumably; the users of
 * these macros are not visible in this part of the file.
 *
 * NOTE(review): "+0.5" followed by the (int) cast rounds to nearest only for
 * positive values; the cast truncates toward zero, so the negative
 * coefficients (BV, GV, GU, RU) end up one step closer to zero than a true
 * round-to-nearest would give. Left unchanged because it affects output values.
 */
#define RGB2YUV_SHIFT 16
105
#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
106
#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
107
#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
108
#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
109
#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
110
#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
111
#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
112
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
113
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
114

    
115
extern int verbose; // defined in mplayer.c
116
/*
117
NOTES
118

119
known BUGS with known cause (no bugreports please!, but patches are welcome :) )
120
horizontal fast_bilinear MMX2 scaler reads 1-7 samples too much (might cause a sig11)
121

122
Special versions: fast Y 1:1 scaling (no interpolation in y direction)
123

124
TODO
125
more intelligent misalignment avoidance for the horizontal scaler
126
write special vertical cubic upscale version
127
Optimize C code (yv12 / minmax)
128
add support for packed pixel yuv input & output
129
add support for Y8 output
130
optimize bgr24 & bgr32
131
add BGR4 output support
132
write special BGR->BGR scaler
133
deglobalize yuv2rgb*.c
134
*/
135

    
136
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
137
#define MIN(a,b) ((a) > (b) ? (b) : (a))
138
#define MAX(a,b) ((a) < (b) ? (b) : (a))
139

    
140
#ifdef ARCH_X86
141
#define CAN_COMPILE_X86_ASM
142
#endif
143

    
144
#ifdef CAN_COMPILE_X86_ASM
145
/*
 * 64-bit, 8-byte-aligned constants, compiled only when CAN_COMPILE_X86_ASM
 * is defined. None of the C code in this part of the file uses them except
 * in_asm_used_var_warning_killer(), which exists to mark them as referenced;
 * presumably they are operands of the inline assembly in swscale_template.c.
 *
 *   yCoeff..ugCoeff : one 16-bit value repeated in all four 16-bit lanes.
 *                     The hex digits of yCoeff/vrCoeff/ubCoeff (2568, 3343,
 *                     40cf) also appear in the names of the C lookup tables
 *                     yuvtab_2568 / yuvtab_3343 / yuvtab_40cf.
 *   bF8, bFC        : one byte value repeated in all eight byte lanes.
 *   w400..w02       : one 16-bit value repeated in all four lanes.
 *   bm########      : byte masks; in the name, each digit stands for one
 *                     byte (most significant first), 1 = 0xFF, 0 = 0x00.
 */
static uint64_t __attribute__((aligned(8))) yCoeff=    0x2568256825682568LL;
146
static uint64_t __attribute__((aligned(8))) vrCoeff=   0x3343334333433343LL;
147
static uint64_t __attribute__((aligned(8))) ubCoeff=   0x40cf40cf40cf40cfLL;
148
static uint64_t __attribute__((aligned(8))) vgCoeff=   0xE5E2E5E2E5E2E5E2LL;
149
static uint64_t __attribute__((aligned(8))) ugCoeff=   0xF36EF36EF36EF36ELL;
150
static uint64_t __attribute__((aligned(8))) bF8=       0xF8F8F8F8F8F8F8F8LL;
151
static uint64_t __attribute__((aligned(8))) bFC=       0xFCFCFCFCFCFCFCFCLL;
152
static uint64_t __attribute__((aligned(8))) w400=      0x0400040004000400LL;
153
static uint64_t __attribute__((aligned(8))) w80=       0x0080008000800080LL;
154
static uint64_t __attribute__((aligned(8))) w10=       0x0010001000100010LL;
155
static uint64_t __attribute__((aligned(8))) w02=       0x0002000200020002LL;
156
static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
157
static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
158
static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
159
static uint64_t __attribute__((aligned(8))) bm01010101=0x00FF00FF00FF00FFLL;
160

    
161
static volatile uint64_t __attribute__((aligned(8))) b5Dither;
162
static volatile uint64_t __attribute__((aligned(8))) g5Dither;
163
static volatile uint64_t __attribute__((aligned(8))) g6Dither;
164
static volatile uint64_t __attribute__((aligned(8))) r5Dither;
165

    
166
static uint64_t __attribute__((aligned(8))) dither4[2]={
167
        0x0103010301030103LL,
168
        0x0200020002000200LL,};
169

    
170
static uint64_t __attribute__((aligned(8))) dither8[2]={
171
        0x0602060206020602LL,
172
        0x0004000400040004LL,};
173

    
174
static uint64_t __attribute__((aligned(8))) b16Mask=   0x001F001F001F001FLL;
175
static uint64_t __attribute__((aligned(8))) g16Mask=   0x07E007E007E007E0LL;
176
static uint64_t __attribute__((aligned(8))) r16Mask=   0xF800F800F800F800LL;
177
static uint64_t __attribute__((aligned(8))) b15Mask=   0x001F001F001F001FLL;
178
static uint64_t __attribute__((aligned(8))) g15Mask=   0x03E003E003E003E0LL;
179
static uint64_t __attribute__((aligned(8))) r15Mask=   0x7C007C007C007C00LL;
180

    
181
static uint64_t __attribute__((aligned(8))) M24A=   0x00FF0000FF0000FFLL;
182
static uint64_t __attribute__((aligned(8))) M24B=   0xFF0000FF0000FF00LL;
183
static uint64_t __attribute__((aligned(8))) M24C=   0x0000FF0000FF0000LL;
184

    
185
#ifdef FAST_BGR2YV12
186
static const uint64_t bgr2YCoeff  __attribute__((aligned(8))) = 0x000000210041000DULL;
187
static const uint64_t bgr2UCoeff  __attribute__((aligned(8))) = 0x0000FFEEFFDC0038ULL;
188
static const uint64_t bgr2VCoeff  __attribute__((aligned(8))) = 0x00000038FFD2FFF8ULL;
189
#else
190
static const uint64_t bgr2YCoeff  __attribute__((aligned(8))) = 0x000020E540830C8BULL;
191
static const uint64_t bgr2UCoeff  __attribute__((aligned(8))) = 0x0000ED0FDAC23831ULL;
192
static const uint64_t bgr2VCoeff  __attribute__((aligned(8))) = 0x00003831D0E6F6EAULL;
193
#endif
194
static const uint64_t bgr2YOffset __attribute__((aligned(8))) = 0x1010101010101010ULL;
195
static const uint64_t bgr2UVOffset __attribute__((aligned(8)))= 0x8080808080808080ULL;
196
static const uint64_t w1111       __attribute__((aligned(8))) = 0x0001000100010001ULL;
197

    
198
// FIXME remove
199
static uint64_t __attribute__((aligned(8))) asm_yalpha1;
200
static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
201
#endif
202

    
203
// clipping helper table for C implementations:
204
static unsigned char clip_table[768];
205

    
206
static unsigned short clip_table16b[768];
207
static unsigned short clip_table16g[768];
208
static unsigned short clip_table16r[768];
209
static unsigned short clip_table15b[768];
210
static unsigned short clip_table15g[768];
211
static unsigned short clip_table15r[768];
212

    
213
// yuv->rgb conversion tables:
// One int per 8-bit sample value (256 entries each). They are filled in
// elsewhere in the file (not visible here); the hex suffix presumably names
// the fixed-point coefficient each table was built from.
214
static    int yuvtab_2568[256];
215
static    int yuvtab_3343[256];
216
static    int yuvtab_0c92[256];
217
static    int yuvtab_1a1e[256];
218
static    int yuvtab_40cf[256];
219
// Needed for cubic scaler to catch overflows
// 768-entry counterparts of the tables above. yuv2rgbXinC() indexes them as
// table[value + 256], so values from -256 to 511 stay inside the array.
220
static    int clip_yuvtab_2568[768];
221
static    int clip_yuvtab_3343[768];
222
static    int clip_yuvtab_0c92[768];
223
static    int clip_yuvtab_1a1e[768];
224
static    int clip_yuvtab_40cf[768];
225

    
226
//global sws_flags from the command line
227
int sws_flags=2;
228

    
229
//global srcFilter
230
SwsFilter src_filter= {NULL, NULL, NULL, NULL};
231

    
232
float sws_lum_gblur= 0.0;
233
float sws_chr_gblur= 0.0;
234
int sws_chr_vshift= 0;
235
int sws_chr_hshift= 0;
236
float sws_chr_sharpen= 0.0;
237
float sws_lum_sharpen= 0.0;
238

    
239
/* cpuCaps combined from cpudetect and whats actually compiled in
240
   (if there is no support for something compiled in it wont appear here) */
241
static CpuCaps cpuCaps;
242

    
243
void (*swScale)(SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
244
             int srcSliceH, uint8_t* dst[], int dstStride[])=NULL;
245

    
246
static SwsVector *getConvVec(SwsVector *a, SwsVector *b);
247

    
248
#ifdef CAN_COMPILE_X86_ASM
249
void in_asm_used_var_warning_killer()
250
{
251
 volatile int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
252
 bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+asm_yalpha1+ asm_uvalpha1+
253
 M24A+M24B+M24C+w02 + b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0]+bm01010101;
254
 if(i) i=0;
255
}
256
#endif
257

    
258
/*
 * Plain C vertical scaler for planar YUV output.
 *
 * For every output column the samples of lumFilterSize (resp. chrFilterSize)
 * source lines are weighted with the matching filter tap, the sum is shifted
 * right by 19 bits and clamped to 0..255.
 *
 * lumFilter / chrFilter : one 16-bit tap per source line
 * lumSrc / chrSrc       : arrays of source line pointers; in a chroma line
 *                         the V samples start 2048 entries after the U samples
 * dest                  : receives dstW luma bytes
 * uDest / vDest         : receive dstW/2 chroma bytes each; chroma is skipped
 *                         entirely when uDest is NULL
 */
static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                               int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                               uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
{
        const int chrW = dstW>>1;
        int x;
        int tap;

        for(x=0; x<dstW; x++)
        {
                int acc = 0;

                for(tap=0; tap<lumFilterSize; tap++)
                        acc += lumSrc[tap][x] * lumFilter[tap];

                acc >>= 19;
                if(acc < 0)        acc = 0;
                else if(acc > 255) acc = 255;
                dest[x] = acc;
        }

        if(uDest == NULL)
                return;

        for(x=0; x<chrW; x++)
        {
                int accU = 0;
                int accV = 0;

                for(tap=0; tap<chrFilterSize; tap++)
                {
                        const int16_t *line = chrSrc[tap];

                        accU += line[x]        * chrFilter[tap];
                        accV += line[x + 2048] * chrFilter[tap];
                }

                accU >>= 19;
                if(accU < 0)        accU = 0;
                else if(accU > 255) accU = 255;

                accV >>= 19;
                if(accV < 0)        accV = 0;
                else if(accV > 255) accV = 255;

                uDest[x] = accU;
                vDest[x] = accV;
        }
}
290

    
291
static inline void yuv2rgbXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
292
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
293
                                    uint8_t *dest, int dstW, int dstFormat)
294
{
295
        if(dstFormat==IMGFMT_BGR32)
296
        {
297
#ifdef WORDS_BIGENDIAN
298
        dest++;
299
#endif
300
                int i;
301
                for(i=0; i<(dstW>>1); i++){
302
                        int j;
303
                        int Y1=0;
304
                        int Y2=0;
305
                        int U=0;
306
                        int V=0;
307
                        int Cb, Cr, Cg;
308
                        for(j=0; j<lumFilterSize; j++)
309
                        {
310
                                Y1 += lumSrc[j][2*i] * lumFilter[j];
311
                                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
312
                        }
313
                        for(j=0; j<chrFilterSize; j++)
314
                        {
315
                                U += chrSrc[j][i] * chrFilter[j];
316
                                V += chrSrc[j][i+2048] * chrFilter[j];
317
                        }
318
                        Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
319
                        Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
320
                        U >>= 19;
321
                        V >>= 19;
322

    
323
                        Cb= clip_yuvtab_40cf[U+ 256];
324
                        Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
325
                        Cr= clip_yuvtab_3343[V+ 256];
326

    
327
                        dest[8*i+0]=clip_table[((Y1 + Cb) >>13)];
328
                        dest[8*i+1]=clip_table[((Y1 + Cg) >>13)];
329
                        dest[8*i+2]=clip_table[((Y1 + Cr) >>13)];
330

    
331
                        dest[8*i+4]=clip_table[((Y2 + Cb) >>13)];
332
                        dest[8*i+5]=clip_table[((Y2 + Cg) >>13)];
333
                        dest[8*i+6]=clip_table[((Y2 + Cr) >>13)];
334
                }
335
        }
336
        else if(dstFormat==IMGFMT_BGR24)
337
        {
338
                int i;
339
                for(i=0; i<(dstW>>1); i++){
340
                        int j;
341
                        int Y1=0;
342
                        int Y2=0;
343
                        int U=0;
344
                        int V=0;
345
                        int Cb, Cr, Cg;
346
                        for(j=0; j<lumFilterSize; j++)
347
                        {
348
                                Y1 += lumSrc[j][2*i] * lumFilter[j];
349
                                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
350
                        }
351
                        for(j=0; j<chrFilterSize; j++)
352
                        {
353
                                U += chrSrc[j][i] * chrFilter[j];
354
                                V += chrSrc[j][i+2048] * chrFilter[j];
355
                        }
356
                        Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
357
                        Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
358
                        U >>= 19;
359
                        V >>= 19;
360

    
361
                        Cb= clip_yuvtab_40cf[U+ 256];
362
                        Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
363
                        Cr= clip_yuvtab_3343[V+ 256];
364

    
365
                        dest[0]=clip_table[((Y1 + Cb) >>13)];
366
                        dest[1]=clip_table[((Y1 + Cg) >>13)];
367
                        dest[2]=clip_table[((Y1 + Cr) >>13)];
368

    
369
                        dest[3]=clip_table[((Y2 + Cb) >>13)];
370
                        dest[4]=clip_table[((Y2 + Cg) >>13)];
371
                        dest[5]=clip_table[((Y2 + Cr) >>13)];
372
                        dest+=6;
373
                }
374
        }
375
        else if(dstFormat==IMGFMT_BGR16)
376
        {
377
                int i;
378
#ifdef DITHER1XBPP
379
                static int ditherb1=1<<14;
380
                static int ditherg1=1<<13;
381
                static int ditherr1=2<<14;
382
                static int ditherb2=3<<14;
383
                static int ditherg2=3<<13;
384
                static int ditherr2=0<<14;
385

    
386
                ditherb1 ^= (1^2)<<14;
387
                ditherg1 ^= (1^2)<<13;
388
                ditherr1 ^= (1^2)<<14;
389
                ditherb2 ^= (3^0)<<14;
390
                ditherg2 ^= (3^0)<<13;
391
                ditherr2 ^= (3^0)<<14;
392
#else
393
                const int ditherb1=0;
394
                const int ditherg1=0;
395
                const int ditherr1=0;
396
                const int ditherb2=0;
397
                const int ditherg2=0;
398
                const int ditherr2=0;
399
#endif
400
                for(i=0; i<(dstW>>1); i++){
401
                        int j;
402
                        int Y1=0;
403
                        int Y2=0;
404
                        int U=0;
405
                        int V=0;
406
                        int Cb, Cr, Cg;
407
                        for(j=0; j<lumFilterSize; j++)
408
                        {
409
                                Y1 += lumSrc[j][2*i] * lumFilter[j];
410
                                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
411
                        }
412
                        for(j=0; j<chrFilterSize; j++)
413
                        {
414
                                U += chrSrc[j][i] * chrFilter[j];
415
                                V += chrSrc[j][i+2048] * chrFilter[j];
416
                        }
417
                        Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
418
                        Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
419
                        U >>= 19;
420
                        V >>= 19;
421

    
422
                        Cb= clip_yuvtab_40cf[U+ 256];
423
                        Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
424
                        Cr= clip_yuvtab_3343[V+ 256];
425

    
426
                        ((uint16_t*)dest)[2*i] =
427
                                clip_table16b[(Y1 + Cb + ditherb1) >>13] |
428
                                clip_table16g[(Y1 + Cg + ditherg1) >>13] |
429
                                clip_table16r[(Y1 + Cr + ditherr1) >>13];
430

    
431
                        ((uint16_t*)dest)[2*i+1] =
432
                                clip_table16b[(Y2 + Cb + ditherb2) >>13] |
433
                                clip_table16g[(Y2 + Cg + ditherg2) >>13] |
434
                                clip_table16r[(Y2 + Cr + ditherr2) >>13];
435
                }
436
        }
437
        else if(dstFormat==IMGFMT_BGR15)
438
        {
439
                int i;
440
#ifdef DITHER1XBPP
441
                static int ditherb1=1<<14;
442
                static int ditherg1=1<<14;
443
                static int ditherr1=2<<14;
444
                static int ditherb2=3<<14;
445
                static int ditherg2=3<<14;
446
                static int ditherr2=0<<14;
447

    
448
                ditherb1 ^= (1^2)<<14;
449
                ditherg1 ^= (1^2)<<14;
450
                ditherr1 ^= (1^2)<<14;
451
                ditherb2 ^= (3^0)<<14;
452
                ditherg2 ^= (3^0)<<14;
453
                ditherr2 ^= (3^0)<<14;
454
#else
455
                const int ditherb1=0;
456
                const int ditherg1=0;
457
                const int ditherr1=0;
458
                const int ditherb2=0;
459
                const int ditherg2=0;
460
                const int ditherr2=0;
461
#endif
462
                for(i=0; i<(dstW>>1); i++){
463
                        int j;
464
                        int Y1=0;
465
                        int Y2=0;
466
                        int U=0;
467
                        int V=0;
468
                        int Cb, Cr, Cg;
469
                        for(j=0; j<lumFilterSize; j++)
470
                        {
471
                                Y1 += lumSrc[j][2*i] * lumFilter[j];
472
                                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
473
                        }
474
                        for(j=0; j<chrFilterSize; j++)
475
                        {
476
                                U += chrSrc[j][i] * chrFilter[j];
477
                                V += chrSrc[j][i+2048] * chrFilter[j];
478
                        }
479
                        Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
480
                        Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
481
                        U >>= 19;
482
                        V >>= 19;
483

    
484
                        Cb= clip_yuvtab_40cf[U+ 256];
485
                        Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
486
                        Cr= clip_yuvtab_3343[V+ 256];
487

    
488
                        ((uint16_t*)dest)[2*i] =
489
                                clip_table15b[(Y1 + Cb + ditherb1) >>13] |
490
                                clip_table15g[(Y1 + Cg + ditherg1) >>13] |
491
                                clip_table15r[(Y1 + Cr + ditherr1) >>13];
492

    
493
                        ((uint16_t*)dest)[2*i+1] =
494
                                clip_table15b[(Y2 + Cb + ditherb2) >>13] |
495
                                clip_table15g[(Y2 + Cg + ditherg2) >>13] |
496
                                clip_table15r[(Y2 + Cr + ditherr2) >>13];
497
                }
498
        }
499
}
500

    
501

    
502
//Note: we have C, X86, MMX, MMX2 and 3DNOW versions; there is no 3DNOW+MMX2 one
503
//Plain C versions
504
#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
505
#define COMPILE_C
506
#endif
507

    
508
#ifdef CAN_COMPILE_X86_ASM
509

    
510
#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
511
#define COMPILE_MMX
512
#endif
513

    
514
#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
515
#define COMPILE_MMX2
516
#endif
517

    
518
#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
519
#define COMPILE_3DNOW
520
#endif
521
#endif //CAN_COMPILE_X86_ASM
522

    
523
#undef HAVE_MMX
524
#undef HAVE_MMX2
525
#undef HAVE_3DNOW
526

    
527
#ifdef COMPILE_C
528
#undef HAVE_MMX
529
#undef HAVE_MMX2
530
#undef HAVE_3DNOW
531
#define RENAME(a) a ## _C
532
#include "swscale_template.c"
533
#endif
534

    
535
#ifdef CAN_COMPILE_X86_ASM
536

    
537
//X86 versions
538
/*
539
#undef RENAME
540
#undef HAVE_MMX
541
#undef HAVE_MMX2
542
#undef HAVE_3DNOW
543
#define ARCH_X86
544
#define RENAME(a) a ## _X86
545
#include "swscale_template.c"
546
*/
547
//MMX versions
548
#ifdef COMPILE_MMX
549
#undef RENAME
550
#define HAVE_MMX
551
#undef HAVE_MMX2
552
#undef HAVE_3DNOW
553
#define RENAME(a) a ## _MMX
554
#include "swscale_template.c"
555
#endif
556

    
557
//MMX2 versions
558
#ifdef COMPILE_MMX2
559
#undef RENAME
560
#define HAVE_MMX
561
#define HAVE_MMX2
562
#undef HAVE_3DNOW
563
#define RENAME(a) a ## _MMX2
564
#include "swscale_template.c"
565
#endif
566

    
567
//3DNOW versions
568
#ifdef COMPILE_3DNOW
569
#undef RENAME
570
#define HAVE_MMX
571
#undef HAVE_MMX2
572
#define HAVE_3DNOW
573
#define RENAME(a) a ## _3DNow
574
#include "swscale_template.c"
575
#endif
576

    
577
#endif //CAN_COMPILE_X86_ASM
578

    
579
// minor note: the HAVE_xyz macros are messed up after this point, so don't use them
580

    
581

    
582
// old global scaler, dont use for new code
583
// will use sws_flags from the command line
584
void SwScale_YV12slice(unsigned char* src[], int srcStride[], int srcSliceY ,
585
                             int srcSliceH, uint8_t* dst[], int dstStride, int dstbpp,
586
                             int srcW, int srcH, int dstW, int dstH){
587

    
588
        static SwsContext *context=NULL;
589
        int dstFormat;
590
        int dstStride3[3]= {dstStride, dstStride>>1, dstStride>>1};
591

    
592
        switch(dstbpp)
593
        {
594
                case 8 : dstFormat= IMGFMT_Y8;                break;
595
                case 12: dstFormat= IMGFMT_YV12;        break;
596
                case 15: dstFormat= IMGFMT_BGR15;        break;
597
                case 16: dstFormat= IMGFMT_BGR16;        break;
598
                case 24: dstFormat= IMGFMT_BGR24;        break;
599
                case 32: dstFormat= IMGFMT_BGR32;        break;
600
                default: return;
601
        }
602

    
603
        if(!context) context=getSwsContextFromCmdLine(srcW, srcH, IMGFMT_YV12, dstW, dstH, dstFormat);
604

    
605
        context->swScale(context, src, srcStride, srcSliceY, srcSliceH, dst, dstStride3);
606
}
607

    
608
// will use sws_flags & src_filter (from cmd line)
609
SwsContext *getSwsContextFromCmdLine(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat)
610
{
611
        int flags=0;
612
        static int firstTime=1;
613

    
614
#ifdef ARCH_X86
615
        if(gCpuCaps.hasMMX)
616
                asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
617
#endif
618
        if(firstTime)
619
        {
620
                firstTime=0;
621
                flags= SWS_PRINT_INFO;
622
        }
623
        else if(verbose>1) flags= SWS_PRINT_INFO;
624

    
625
        if(src_filter.lumH) freeVec(src_filter.lumH);
626
        if(src_filter.lumV) freeVec(src_filter.lumV);
627
        if(src_filter.chrH) freeVec(src_filter.chrH);
628
        if(src_filter.chrV) freeVec(src_filter.chrV);
629

    
630
        if(sws_lum_gblur!=0.0){
631
                src_filter.lumH= getGaussianVec(sws_lum_gblur, 3.0);
632
                src_filter.lumV= getGaussianVec(sws_lum_gblur, 3.0);
633
        }else{
634
                src_filter.lumH= getIdentityVec();
635
                src_filter.lumV= getIdentityVec();
636
        }
637

    
638
        if(sws_chr_gblur!=0.0){
639
                src_filter.chrH= getGaussianVec(sws_chr_gblur, 3.0);
640
                src_filter.chrV= getGaussianVec(sws_chr_gblur, 3.0);
641
        }else{
642
                src_filter.chrH= getIdentityVec();
643
                src_filter.chrV= getIdentityVec();
644
        }
645

    
646
        if(sws_chr_sharpen!=0.0){
647
                SwsVector *g= getConstVec(-1.0, 3);
648
                SwsVector *id= getConstVec(10.0/sws_chr_sharpen, 1);
649
                g->coeff[1]=2.0;
650
                addVec(id, g);
651
                convVec(src_filter.chrH, id);
652
                convVec(src_filter.chrV, id);
653
                freeVec(g);
654
                freeVec(id);
655
        }
656

    
657
        if(sws_lum_sharpen!=0.0){
658
                SwsVector *g= getConstVec(-1.0, 3);
659
                SwsVector *id= getConstVec(10.0/sws_lum_sharpen, 1);
660
                g->coeff[1]=2.0;
661
                addVec(id, g);
662
                convVec(src_filter.lumH, id);
663
                convVec(src_filter.lumV, id);
664
                freeVec(g);
665
                freeVec(id);
666
        }
667

    
668
        if(sws_chr_hshift)
669
                shiftVec(src_filter.chrH, sws_chr_hshift);
670

    
671
        if(sws_chr_vshift)
672
                shiftVec(src_filter.chrV, sws_chr_vshift);
673

    
674
        normalizeVec(src_filter.chrH, 1.0);
675
        normalizeVec(src_filter.chrV, 1.0);
676
        normalizeVec(src_filter.lumH, 1.0);
677
        normalizeVec(src_filter.lumV, 1.0);
678

    
679
        if(verbose > 1) printVec(src_filter.chrH);
680
        if(verbose > 1) printVec(src_filter.lumH);
681

    
682
        switch(sws_flags)
683
        {
684
                case 0: flags|= SWS_FAST_BILINEAR; break;
685
                case 1: flags|= SWS_BILINEAR; break;
686
                case 2: flags|= SWS_BICUBIC; break;
687
                case 3: flags|= SWS_X; break;
688
                case 4: flags|= SWS_POINT; break;
689
                case 5: flags|= SWS_AREA; break;
690
                default:flags|= SWS_BILINEAR; break;
691
        }
692

    
693
        return getSwsContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat, flags, &src_filter, NULL);
694
}
695

    
696

    
697
static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
698
                              int srcW, int dstW, int filterAlign, int one, int flags,
699
                              SwsVector *srcFilter, SwsVector *dstFilter)
700
{
701
        int i;
702
        int filterSize;
703
        int filter2Size;
704
        int minFilterSize;
705
        double *filter=NULL;
706
        double *filter2=NULL;
707
#ifdef ARCH_X86
708
        if(gCpuCaps.hasMMX)
709
                asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
710
#endif
711

    
712
        // Note the +1 is for the MMXscaler which reads over the end
713
        *filterPos = (int16_t*)memalign(8, (dstW+1)*sizeof(int16_t));
714

    
715
        if(ABS(xInc - 0x10000) <10) // unscaled
716
        {
717
                int i;
718
                filterSize= 1;
719
                filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
720
                for(i=0; i<dstW*filterSize; i++) filter[i]=0;
721

    
722
                for(i=0; i<dstW; i++)
723
                {
724
                        filter[i*filterSize]=1;
725
                        (*filterPos)[i]=i;
726
                }
727

    
728
        }
729
        else if(flags&SWS_POINT) // lame looking point sampling mode
730
        {
731
                int i;
732
                int xDstInSrc;
733
                filterSize= 1;
734
                filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
735
                
736
                xDstInSrc= xInc/2 - 0x8000;
737
                for(i=0; i<dstW; i++)
738
                {
739
                        int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
740

    
741
                        (*filterPos)[i]= xx;
742
                        filter[i]= 1.0;
743
                        xDstInSrc+= xInc;
744
                }
745
        }
746
        else if(xInc <= (1<<16) || (flags&SWS_FAST_BILINEAR)) // upscale
747
        {
748
                int i;
749
                int xDstInSrc;
750
                if     (flags&SWS_BICUBIC) filterSize= 4;
751
                else if(flags&SWS_X      ) filterSize= 4;
752
                else                           filterSize= 2; // SWS_BILINEAR / SWS_AREA 
753
//                printf("%d %d %d\n", filterSize, srcW, dstW);
754
                filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
755

    
756
                xDstInSrc= xInc/2 - 0x8000;
757
                for(i=0; i<dstW; i++)
758
                {
759
                        int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
760
                        int j;
761

    
762
                        (*filterPos)[i]= xx;
763
                        if((flags & SWS_BICUBIC) || (flags & SWS_X))
764
                        {
765
                                double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
766
                                double y1,y2,y3,y4;
767
                                double A= -0.6;
768
                                if(flags & SWS_BICUBIC){
769
                                                // Equation is from VirtualDub
770
                                        y1 = (        +     A*d -       2.0*A*d*d +       A*d*d*d);
771
                                        y2 = (+ 1.0             -     (A+3.0)*d*d + (A+2.0)*d*d*d);
772
                                        y3 = (        -     A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d);
773
                                        y4 = (                  +           A*d*d -       A*d*d*d);
774
                                }else{
775
                                                // cubic interpolation (derived it myself)
776
                                        y1 = (    -2.0*d + 3.0*d*d - 1.0*d*d*d)/6.0;
777
                                        y2 = (6.0 -3.0*d - 6.0*d*d + 3.0*d*d*d)/6.0;
778
                                        y3 = (    +6.0*d + 3.0*d*d - 3.0*d*d*d)/6.0;
779
                                        y4 = (    -1.0*d           + 1.0*d*d*d)/6.0;
780
                                }
781

    
782
//                                printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
783
                                filter[i*filterSize + 0]= y1;
784
                                filter[i*filterSize + 1]= y2;
785
                                filter[i*filterSize + 2]= y3;
786
                                filter[i*filterSize + 3]= y4;
787
//                                printf("%1.3f %1.3f %1.3f %1.3f %1.3f\n",d , y1, y2, y3, y4);
788
                        }
789
                        else
790
                        {
791
                                //Bilinear upscale / linear interpolate / Area averaging
792
                                for(j=0; j<filterSize; j++)
793
                                {
794
                                        double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
795
                                        double coeff= 1.0 - d;
796
                                        if(coeff<0) coeff=0;
797
        //                                printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
798
                                        filter[i*filterSize + j]= coeff;
799
                                        xx++;
800
                                }
801
                        }
802
                        xDstInSrc+= xInc;
803
                }
804
        }
805
        else // downscale
806
        {
807
                int xDstInSrc;
808
                if(flags&SWS_BICUBIC)        filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
809
                else if(flags&SWS_X)        filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
810
                else if(flags&SWS_AREA)        filterSize= (int)ceil(1 + 1.0*srcW / (double)dstW);
811
                else /* BILINEAR */        filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW);
812
//                printf("%d %d %d\n", *filterSize, srcW, dstW);
813
                filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
814

    
815
                xDstInSrc= xInc/2 - 0x8000;
816
                for(i=0; i<dstW; i++)
817
                {
818
                        int xx= (int)((double)xDstInSrc/(double)(1<<16) - (filterSize-1)*0.5 + 0.5);
819
                        int j;
820
                        (*filterPos)[i]= xx;
821
                        for(j=0; j<filterSize; j++)
822
                        {
823
                                double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
824
                                double coeff;
825
                                if((flags & SWS_BICUBIC) || (flags & SWS_X))
826
                                {
827
                                        double A= -0.75;
828
//                                        d*=2;
829
                                        // Equation is from VirtualDub
830
                                        if(d<1.0)
831
                                                coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
832
                                        else if(d<2.0)
833
                                                coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
834
                                        else
835
                                                coeff=0.0;
836
                                }
837
                                else if(flags & SWS_AREA)
838
                                {
839
                                        double srcPixelSize= (1<<16)/(double)xInc;
840
                                        if(d + srcPixelSize/2 < 0.5) coeff= 1.0;
841
                                        else if(d - srcPixelSize/2 < 0.5) coeff= (0.5-d)/srcPixelSize + 0.5;
842
                                        else coeff=0.0;
843
                                }
844
                                else
845
                                {
846
                                        coeff= 1.0 - d;
847
                                        if(coeff<0) coeff=0;
848
                                }
849
//                                printf("%1.3f %2.3f %d \n", coeff, d, xDstInSrc);
850
                                filter[i*filterSize + j]= coeff;
851
                                xx++;
852
                        }
853
                        xDstInSrc+= xInc;
854
                }
855
        }
856

    
857
        /* apply src & dst Filter to filter -> filter2
858
           free(filter);
859
        */
860
        filter2Size= filterSize;
861
        if(srcFilter) filter2Size+= srcFilter->length - 1;
862
        if(dstFilter) filter2Size+= dstFilter->length - 1;
863
        filter2= (double*)memalign(8, filter2Size*dstW*sizeof(double));
864

    
865
        for(i=0; i<dstW; i++)
866
        {
867
                int j;
868
                SwsVector scaleFilter;
869
                SwsVector *outVec;
870

    
871
                scaleFilter.coeff= filter + i*filterSize;
872
                scaleFilter.length= filterSize;
873

    
874
                if(srcFilter) outVec= getConvVec(srcFilter, &scaleFilter);
875
                else              outVec= &scaleFilter;
876

    
877
                ASSERT(outVec->length == filter2Size)
878
                //FIXME dstFilter
879

    
880
                for(j=0; j<outVec->length; j++)
881
                {
882
                        filter2[i*filter2Size + j]= outVec->coeff[j];
883
                }
884

    
885
                (*filterPos)[i]+= (filterSize-1)/2 - (filter2Size-1)/2;
886

    
887
                if(outVec != &scaleFilter) freeVec(outVec);
888
        }
889
        free(filter); filter=NULL;
890

    
891
        /* try to reduce the filter-size (step1 find size and shift left) */
892
        // Assume its near normalized (*0.5 or *2.0 is ok but * 0.001 is not)
893
        minFilterSize= 0;
894
        for(i=dstW-1; i>=0; i--)
895
        {
896
                int min= filter2Size;
897
                int j;
898
                double cutOff=0.0;
899

    
900
                /* get rid off near zero elements on the left by shifting left */
901
                for(j=0; j<filter2Size; j++)
902
                {
903
                        int k;
904
                        cutOff += ABS(filter2[i*filter2Size]);
905

    
906
                        if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
907

    
908
                        /* preserve Monotonicity because the core cant handle the filter otherwise */
909
                        if(i<dstW-1 && (*filterPos)[i] >= (*filterPos)[i+1]) break;
910

    
911
                        // Move filter coeffs left
912
                        for(k=1; k<filter2Size; k++)
913
                                filter2[i*filter2Size + k - 1]= filter2[i*filter2Size + k];
914
                        filter2[i*filter2Size + k - 1]= 0.0;
915
                        (*filterPos)[i]++;
916
                }
917

    
918
                cutOff=0.0;
919
                /* count near zeros on the right */
920
                for(j=filter2Size-1; j>0; j--)
921
                {
922
                        cutOff += ABS(filter2[i*filter2Size + j]);
923

    
924
                        if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
925
                        min--;
926
                }
927

    
928
                if(min>minFilterSize) minFilterSize= min;
929
        }
930

    
931
        filterSize= (minFilterSize +(filterAlign-1)) & (~(filterAlign-1));
932
        filter= (double*)memalign(8, filterSize*dstW*sizeof(double));
933
        *outFilterSize= filterSize;
934

    
935
        if((flags&SWS_PRINT_INFO) && verbose)
936
                printf("SwScaler: reducing / aligning filtersize %d -> %d\n", filter2Size, filterSize);
937
        /* try to reduce the filter-size (step2 reduce it) */
938
        for(i=0; i<dstW; i++)
939
        {
940
                int j;
941

    
942
                for(j=0; j<filterSize; j++)
943
                {
944
                        if(j>=filter2Size) filter[i*filterSize + j]= 0.0;
945
                        else                   filter[i*filterSize + j]= filter2[i*filter2Size + j];
946
                }
947
        }
948
        free(filter2); filter2=NULL;
949
        
950
        ASSERT(filterSize > 0)
951

    
952
        //FIXME try to align filterpos if possible
953

    
954
        //fix borders
955
        for(i=0; i<dstW; i++)
956
        {
957
                int j;
958
                if((*filterPos)[i] < 0)
959
                {
960
                        // Move filter coeffs left to compensate for filterPos
961
                        for(j=1; j<filterSize; j++)
962
                        {
963
                                int left= MAX(j + (*filterPos)[i], 0);
964
                                filter[i*filterSize + left] += filter[i*filterSize + j];
965
                                filter[i*filterSize + j]=0;
966
                        }
967
                        (*filterPos)[i]= 0;
968
                }
969

    
970
                if((*filterPos)[i] + filterSize > srcW)
971
                {
972
                        int shift= (*filterPos)[i] + filterSize - srcW;
973
                        // Move filter coeffs right to compensate for filterPos
974
                        for(j=filterSize-2; j>=0; j--)
975
                        {
976
                                int right= MIN(j + shift, filterSize-1);
977
                                filter[i*filterSize +right] += filter[i*filterSize +j];
978
                                filter[i*filterSize +j]=0;
979
                        }
980
                        (*filterPos)[i]= srcW - filterSize;
981
                }
982
        }
983

    
984
        // Note the +1 is for the MMXscaler which reads over the end
985
        *outFilter= (int16_t*)memalign(8, *outFilterSize*(dstW+1)*sizeof(int16_t));
986
        memset(*outFilter, 0, *outFilterSize*(dstW+1)*sizeof(int16_t));
987

    
988
        /* Normalize & Store in outFilter */
989
        for(i=0; i<dstW; i++)
990
        {
991
                int j;
992
                double sum=0;
993
                double scale= one;
994
                for(j=0; j<filterSize; j++)
995
                {
996
                        sum+= filter[i*filterSize + j];
997
                }
998
                scale/= sum;
999
                for(j=0; j<filterSize; j++)
1000
                {
1001
                        (*outFilter)[i*(*outFilterSize) + j]= (int)(filter[i*filterSize + j]*scale);
1002
                }
1003
        }
1004
        
1005
        (*filterPos)[dstW]= (*filterPos)[dstW-1]; // the MMX scaler will read over the end
1006
        for(i=0; i<*outFilterSize; i++)
1007
        {
1008
                int j= dstW*(*outFilterSize);
1009
                (*outFilter)[j + i]= (*outFilter)[j + i - (*outFilterSize)];
1010
        }
1011

    
1012
        free(filter);
1013
}
1014

    
1015
#ifdef ARCH_X86
1016
/*
 * Generates the run-time ("funny code") MMX2 horizontal scaler into funnyCode.
 *
 * The asm statement below contains a template code fragment (labels 0..9) that
 * is NOT executed here: control jumps straight to label 9, where only the
 * fragment's address, its length and the offsets of two patchable bytes are
 * computed. The C loop then copies the fragment into funnyCode once for every
 * fourth value of i and patches the copy for the current source position.
 *
 * dstW      - destination width; the loop runs for i < dstW/8
 * xInc      - horizontal step in 16.16 fixed point (integer part is xpos>>16)
 * funnyCode - output buffer receiving the generated code
 *             NOTE(review): the required buffer size is not visible here; the
 *             caller must guarantee it - confirm against the SwsContext definition.
 */
static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
1017
{
1018
        uint8_t *fragment;
1019
        int imm8OfPShufW1;
1020
        int imm8OfPShufW2;
1021
        int fragmentLength;
1022

    
1023
        int xpos, i;
1024

    
1025
        // create an optimized horizontal scaling routine
1026

    
1027
        //code fragment
1028

    
1029
        asm volatile(
1030
                // skip the template; only the bookkeeping after label 9 runs here
                "jmp 9f                                \n\t"
1031
        // Begin
1032
                "0:                                \n\t"
1033
                "movq (%%esi), %%mm0                \n\t" //FIXME Alignment
1034
                "movq %%mm0, %%mm1                \n\t"
1035
                "psrlq $8, %%mm0                \n\t"
1036
                "punpcklbw %%mm7, %%mm1        \n\t"
1037
                "movq %%mm2, %%mm3                \n\t"
1038
                "punpcklbw %%mm7, %%mm0        \n\t"
1039
                "addw %%bx, %%cx                \n\t" //2*xalpha += (4*lumXInc)&0xFFFF
1040
                "pshufw $0xFF, %%mm1, %%mm1        \n\t"
1041
                // label 1 sits right after the first pshufw, so (1b - 1) is its last byte
                "1:                                \n\t"
1042
                "adcl %%edx, %%esi                \n\t" //xx+= (4*lumXInc)>>16 + carry
1043
                "pshufw $0xFF, %%mm0, %%mm0        \n\t"
1044
                // label 2 sits right after the second pshufw, so (2b - 1) is its last byte
                "2:                                \n\t"
1045
                "psrlw $9, %%mm3                \n\t"
1046
                "psubw %%mm1, %%mm0                \n\t"
1047
                "pmullw %%mm3, %%mm0                \n\t"
1048
                "paddw %%mm6, %%mm2                \n\t" // 2*alpha += xpos&0xFFFF
1049
                "psllw $7, %%mm1                \n\t"
1050
                "paddw %%mm1, %%mm0                \n\t"
1051

    
1052
                "movq %%mm0, (%%edi, %%eax)        \n\t"
1053

    
1054
                "addl $8, %%eax                        \n\t"
1055
        // End
1056
                "9:                                \n\t"
1057
//                "int $3\n\t"
1058
                // %0 = address of the template, %1/%2 = offsets (relative to %0) of the
                // byte preceding labels 1 and 2, %3 = template length in bytes
                "leal 0b, %0                        \n\t"
1059
                "leal 1b, %1                        \n\t"
1060
                "leal 2b, %2                        \n\t"
1061
                "decl %1                        \n\t"
1062
                "decl %2                        \n\t"
1063
                "subl %0, %1                        \n\t"
1064
                "subl %0, %2                        \n\t"
1065
                "leal 9b, %3                        \n\t"
1066
                "subl %0, %3                        \n\t"
1067
                :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
1068
                "=r" (fragmentLength)
1069
        );
1070

    
1071
        xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
1072

    
1073
        // xpos advances by xInc for every i; a fragment is emitted for every 4th i
        for(i=0; i<dstW/8; i++)
1074
        {
1075
                int xx=xpos>>16;
1076

    
1077
                if((i&3) == 0)
1078
                {
1079
                        // integer source offsets (relative to xx) of this and the next three steps
                        int a=0;
1080
                        int b=((xpos+xInc)>>16) - xx;
1081
                        int c=((xpos+xInc*2)>>16) - xx;
1082
                        int d=((xpos+xInc*3)>>16) - xx;
1083

    
1084
                        memcpy(funnyCode + fragmentLength*i/4, fragment, fragmentLength);
1085

    
1086
                        // patch both recorded bytes of the copy with the four offsets,
                        // packed as 2-bit fields (a lowest, d highest)
                        funnyCode[fragmentLength*i/4 + imm8OfPShufW1]=
1087
                        funnyCode[fragmentLength*i/4 + imm8OfPShufW2]=
1088
                                a | (b<<2) | (c<<4) | (d<<6);
1089

    
1090
                        // if we dont need to read 8 bytes than dont :), reduces the chance of
1091
                        // crossing a cache line
1092
                        // NOTE(review): byte 1 of the fragment is overwritten with 0x6E; presumably
                        // this turns the leading movq load into a 4-byte movd - confirm the encoding
                        if(d<3) funnyCode[fragmentLength*i/4 + 1]= 0x6E;
1093

    
1094
                        // terminate after this fragment; the next copy (if any) overwrites this byte
                        funnyCode[fragmentLength*(i+4)/4]= RET;
1095
                }
1096
                xpos+=xInc;
1097
        }
1098
}
1099
#endif // ARCH_X86
1100

    
1101
//FIXME remove
1102
/* Empty function: it performs no work when called. */
void SwScale_Init(){
1103
}
1104

    
1105
static void globalInit(){
1106
    // generating tables:
1107
    int i;
1108
    for(i=0; i<768; i++){
1109
        int c= MIN(MAX(i-256, 0), 255);
1110
        clip_table[i]=c;
1111
        yuvtab_2568[c]= clip_yuvtab_2568[i]=(0x2568*(c-16))+(256<<13);
1112
        yuvtab_3343[c]= clip_yuvtab_3343[i]=0x3343*(c-128);
1113
        yuvtab_0c92[c]= clip_yuvtab_0c92[i]=-0x0c92*(c-128);
1114
        yuvtab_1a1e[c]= clip_yuvtab_1a1e[i]=-0x1a1e*(c-128);
1115
        yuvtab_40cf[c]= clip_yuvtab_40cf[i]=0x40cf*(c-128);
1116
    }
1117

    
1118
    for(i=0; i<768; i++)
1119
    {
1120
        int v= clip_table[i];
1121
        clip_table16b[i]= le2me_16( v>>3);
1122
        clip_table16g[i]= le2me_16((v<<3)&0x07E0);
1123
        clip_table16r[i]= le2me_16((v<<8)&0xF800);
1124
        clip_table15b[i]= le2me_16( v>>3);
1125
        clip_table15g[i]= le2me_16((v<<2)&0x03E0);
1126
        clip_table15r[i]= le2me_16((v<<7)&0x7C00);
1127
    }
1128

    
1129
cpuCaps= gCpuCaps;
1130

    
1131
#ifdef RUNTIME_CPUDETECT
1132
#ifdef CAN_COMPILE_X86_ASM
1133
        // ordered per speed fasterst first
1134
        if(gCpuCaps.hasMMX2)
1135
                swScale= swScale_MMX2;
1136
        else if(gCpuCaps.has3DNow)
1137
                swScale= swScale_3DNow;
1138
        else if(gCpuCaps.hasMMX)
1139
                swScale= swScale_MMX;
1140
        else
1141
                swScale= swScale_C;
1142

    
1143
#else
1144
        swScale= swScale_C;
1145
        cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1146
#endif
1147
#else //RUNTIME_CPUDETECT
1148
#ifdef HAVE_MMX2
1149
        swScale= swScale_MMX2;
1150
        cpuCaps.has3DNow = 0;
1151
#elif defined (HAVE_3DNOW)
1152
        swScale= swScale_3DNow;
1153
        cpuCaps.hasMMX2 = 0;
1154
#elif defined (HAVE_MMX)
1155
        swScale= swScale_MMX;
1156
        cpuCaps.hasMMX2 = cpuCaps.has3DNow = 0;
1157
#else
1158
        swScale= swScale_C;
1159
        cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1160
#endif
1161
#endif //!RUNTIME_CPUDETECT
1162
}
1163

    
1164
/* Wrapper functions for yuv2bgr */
1165
/*
 * Unscaled planar YUV -> BGR conversion of one slice; thin wrapper around yuv2rgb().
 * Output goes to dstParam[0], starting at row srcSliceY. For IMGFMT_YV12 the
 * source planes 1 and 2 are handed to yuv2rgb() in that order; for any other
 * source format (I420 & IYUV) they are swapped.
 */
static void planarYuvToBgr(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dstParam[], int dstStride[])
{
        const int isYV12= (c->srcFormat==IMGFMT_YV12);
        uint8_t *firstChroma = isYV12 ? src[1] : src[2];
        uint8_t *secondChroma= isYV12 ? src[2] : src[1];
        uint8_t *out= dstParam[0] + dstStride[0]*srcSliceY;

        yuv2rgb( out,src[0],firstChroma,secondChroma,c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
}
1174

    
1175
/*
 * Unscaled 24 bpp -> 32 bpp conversion of one slice via rgb24to32().
 * If the destination and source strides are in exact 4:3 ratio the whole slice
 * is converted with a single call; otherwise each line is converted separately,
 * passing srcW*3 as the per-line source size.
 */
static void bgr24to32Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[])
{
        uint8_t *in = src[0];
        uint8_t *out= dst[0] + dstStride[0]*srcSliceY;
        int row;

        if(dstStride[0]*3 == srcStride[0]*4)
        {
                rgb24to32(in, out, srcSliceH*srcStride[0]);
                return;
        }

        for(row=0; row<srcSliceH; row++)
        {
                rgb24to32(in, out, c->srcW*3);
                in += srcStride[0];
                out+= dstStride[0];
        }
}
1194

    
1195
/*
 * Unscaled 32 bpp -> 24 bpp conversion of one slice via rgb32to24().
 * If the destination and source strides are in exact 3:4 ratio the whole slice
 * is converted with a single call; otherwise each line is converted separately,
 * passing srcW<<2 as the per-line source size.
 */
static void bgr32to24Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[])
{
        uint8_t *in = src[0];
        uint8_t *out= dst[0] + dstStride[0]*srcSliceY;
        int row;

        if(dstStride[0]*4 == srcStride[0]*3)
        {
                rgb32to24(in, out, srcSliceH*srcStride[0]);
                return;
        }

        for(row=0; row<srcSliceH; row++)
        {
                rgb32to24(in, out, c->srcW<<2);
                in += srcStride[0];
                out+= dstStride[0];
        }
}
1214

    
1215
/*
 * Unscaled 15 bpp -> 16 bpp conversion of one slice via rgb15to16().
 * With equal source and destination strides the whole slice is converted with
 * a single call; otherwise each line is converted separately, passing srcW<<1
 * as the per-line source size.
 */
static void bgr15to16Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[])
{
        uint8_t *in = src[0];
        uint8_t *out= dst[0] + dstStride[0]*srcSliceY;
        int row;

        if(dstStride[0] == srcStride[0])
        {
                rgb15to16(in, out, srcSliceH*srcStride[0]);
                return;
        }

        for(row=0; row<srcSliceH; row++)
        {
                rgb15to16(in, out, c->srcW<<1);
                in += srcStride[0];
                out+= dstStride[0];
        }
}
1234

    
1235
/*
 * Unscaled BGR24 -> YV12 conversion of one slice; thin wrapper around rgb24toyv12().
 * Plane 0 of the destination is addressed at row srcSliceY, planes 1 and 2 at
 * row srcSliceY>>1. dstStride[0] and dstStride[1] are forwarded as the two
 * destination strides, srcStride[0] as the source stride.
 */
static void bgr24toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[])
{
        const int halfRow= srcSliceY>>1;
        uint8_t *plane0Out= dst[0] + srcSliceY*dstStride[0];
        uint8_t *plane1Out= dst[1] + halfRow  *dstStride[1];
        uint8_t *plane2Out= dst[2] + halfRow  *dstStride[2];

        rgb24toyv12(src[0], plane0Out, plane1Out, plane2Out,
                c->srcW, srcSliceH,
                dstStride[0], dstStride[1], srcStride[0]);
}
1246

    
1247

    
1248
/* unscaled copy like stuff (assumes nearly identical formats) */
1249
/*
 * Unscaled copy of one slice between nearly identical formats.
 *
 * Packed sources are copied from plane 0 only. Every other source goes through
 * the per-plane path, where planes 1 and 2 use half the width and half the
 * slice position/height (each rounded up). When either side is IMGFMT_I420,
 * planes 1 and 2 of that side are swapped relative to the caller's order.
 *
 * c              - context; srcFormat, dstFormat and srcW are read
 * srcParam       - source plane pointers in the caller's plane order
 * srcStrideParam - source strides (bytes per row), same order as srcParam
 * srcSliceY      - first row of the slice
 * srcSliceH      - number of rows in the slice
 * dstParam       - destination plane pointers in the caller's plane order
 * dstStride      - destination strides (bytes per row), same order as dstParam
 *
 * Fixes relative to the previous version:
 *  - planes whose source pointer is NULL (gray input sets src[1]=src[2]=NULL)
 *    are skipped instead of being handed to memcpy();
 *  - for an I420 destination the strides of planes 1 and 2 are swapped together
 *    with the plane pointers, mirroring what is done on the source side;
 *  - src[]/srcStride[] are initialised so an unmatched format cannot read
 *    indeterminate values.
 */
static void simpleCopy(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
             int srcSliceH, uint8_t* dstParam[], int dstStride[]){

        int srcStride[3]= {0, 0, 0};
        uint8_t *src[3]= {NULL, NULL, NULL};
        uint8_t *dst[3];

        if(c->srcFormat == IMGFMT_I420){
                src[0]= srcParam[0];
                src[1]= srcParam[2];
                src[2]= srcParam[1];
                srcStride[0]= srcStrideParam[0];
                srcStride[1]= srcStrideParam[2];
                srcStride[2]= srcStrideParam[1];
        }
        else if(c->srcFormat==IMGFMT_YV12){
                src[0]= srcParam[0];
                src[1]= srcParam[1];
                src[2]= srcParam[2];
                srcStride[0]= srcStrideParam[0];
                srcStride[1]= srcStrideParam[1];
                srcStride[2]= srcStrideParam[2];
        }
        else if(isPacked(c->srcFormat) || isGray(c->srcFormat)){
                src[0]= srcParam[0];
                src[1]=
                src[2]= NULL;
                srcStride[0]= srcStrideParam[0];
                srcStride[1]=
                srcStride[2]= 0;
        }

        if(c->dstFormat == IMGFMT_I420){
                dst[0]= dstParam[0];
                dst[1]= dstParam[2];
                dst[2]= dstParam[1];

        }else{
                dst[0]= dstParam[0];
                dst[1]= dstParam[1];
                dst[2]= dstParam[2];
        }

        if(isPacked(c->srcFormat))
        {
                if(dstStride[0]==srcStride[0])
                        memcpy(dst[0] + dstStride[0]*srcSliceY, src[0], srcSliceH*dstStride[0]);
                else
                {
                        int i;
                        uint8_t *srcPtr= src[0];
                        uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
                        int length=0;

                        /* universal length finder: largest multiple of srcW that fits
                           into both strides, used as the number of bytes per row */
                        while(length+c->srcW <= ABS(dstStride[0])
                           && length+c->srcW <= ABS(srcStride[0])) length+= c->srcW;
                        ASSERT(length!=0);

                        for(i=0; i<srcSliceH; i++)
                        {
                                memcpy(dstPtr, srcPtr, length);
                                srcPtr+= srcStride[0];
                                dstPtr+= dstStride[0];
                        }
                }
        }
        else
        { /* Planar YUV (or gray, which only has plane 0) */
                int plane;
                for(plane=0; plane<3; plane++)
                {
                        int length= plane==0 ? c->srcW  : ((c->srcW+1)>>1);
                        int y=      plane==0 ? srcSliceY: ((srcSliceY+1)>>1);
                        int height= plane==0 ? srcSliceH: ((srcSliceH+1)>>1);
                        int planeDstStride;

                        /* no source data for this plane (e.g. chroma of a gray image) */
                        if(src[plane]==NULL) continue;

                        /* dst[1]/dst[2] were swapped for I420, so take the matching stride */
                        if(c->dstFormat == IMGFMT_I420 && plane>0)
                                planeDstStride= dstStride[3-plane];
                        else
                                planeDstStride= dstStride[plane];

                        if(planeDstStride==srcStride[plane])
                                memcpy(dst[plane] + planeDstStride*y, src[plane], height*planeDstStride);
                        else
                        {
                                int i;
                                uint8_t *srcPtr= src[plane];
                                uint8_t *dstPtr= dst[plane] + planeDstStride*y;
                                for(i=0; i<height; i++)
                                {
                                        memcpy(dstPtr, srcPtr, length);
                                        srcPtr+= srcStride[plane];
                                        dstPtr+= planeDstStride;
                                }
                        }
                }
        }
}
1342

    
1343
SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
1344
                         SwsFilter *srcFilter, SwsFilter *dstFilter){
1345

    
1346
        SwsContext *c;
1347
        int i;
1348
        int usesFilter;
1349
        SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
1350

    
1351
#ifdef ARCH_X86
1352
        if(gCpuCaps.hasMMX)
1353
                asm volatile("emms\n\t"::: "memory");
1354
#endif
1355

    
1356
        if(swScale==NULL) globalInit();
1357

    
1358
        /* avoid duplicate formats, so we don't need to check too much */
1359
        if(srcFormat==IMGFMT_IYUV) srcFormat=IMGFMT_I420;
1360
        if(srcFormat==IMGFMT_Y8)   srcFormat=IMGFMT_Y800;
1361
        if(dstFormat==IMGFMT_Y8)   dstFormat=IMGFMT_Y800;
1362

    
1363
        if(!isSupportedIn(srcFormat)) 
1364
        {
1365
                fprintf(stderr, "swScaler: %s is not supported as input format\n", vo_format_name(srcFormat));
1366
                return NULL;
1367
        }
1368
        if(!isSupportedOut(dstFormat))
1369
        {
1370
                fprintf(stderr, "swScaler: %s is not supported as output format\n", vo_format_name(dstFormat));
1371
                return NULL;
1372
        }
1373

    
1374
        /* sanity check */
1375
        if(srcW<4 || srcH<1 || dstW<8 || dstH<1) //FIXME check if these are enough and try to lowwer them after fixing the relevant parts of the code
1376
        {
1377
                fprintf(stderr, "swScaler: %dx%d -> %dx%d is invalid scaling dimension\n", 
1378
                        srcW, srcH, dstW, dstH);
1379
                return NULL;
1380
        }
1381

    
1382
        if(!dstFilter) dstFilter= &dummyFilter;
1383
        if(!srcFilter) srcFilter= &dummyFilter;
1384

    
1385
        c= memalign(64, sizeof(SwsContext));
1386
        memset(c, 0, sizeof(SwsContext));
1387

    
1388
        c->srcW= srcW;
1389
        c->srcH= srcH;
1390
        c->dstW= dstW;
1391
        c->dstH= dstH;
1392
        c->lumXInc= ((srcW<<16) + (dstW>>1))/dstW;
1393
        c->lumYInc= ((srcH<<16) + (dstH>>1))/dstH;
1394
        c->flags= flags;
1395
        c->dstFormat= dstFormat;
1396
        c->srcFormat= srcFormat;
1397

    
1398
        usesFilter=0;
1399
        if(dstFilter->lumV!=NULL && dstFilter->lumV->length>1) usesFilter=1;
1400
        if(dstFilter->lumH!=NULL && dstFilter->lumH->length>1) usesFilter=1;
1401
        if(dstFilter->chrV!=NULL && dstFilter->chrV->length>1) usesFilter=1;
1402
        if(dstFilter->chrH!=NULL && dstFilter->chrH->length>1) usesFilter=1;
1403
        if(srcFilter->lumV!=NULL && srcFilter->lumV->length>1) usesFilter=1;
1404
        if(srcFilter->lumH!=NULL && srcFilter->lumH->length>1) usesFilter=1;
1405
        if(srcFilter->chrV!=NULL && srcFilter->chrV->length>1) usesFilter=1;
1406
        if(srcFilter->chrH!=NULL && srcFilter->chrH->length>1) usesFilter=1;
1407
        
1408
        /* unscaled special Cases */
1409
        if(srcW==dstW && srcH==dstH && !usesFilter)
1410
        {
1411
                /* yuv2bgr */
1412
                if(isPlanarYUV(srcFormat) && isBGR(dstFormat))
1413
                {
1414
                        // FIXME multiple yuv2rgb converters wont work that way cuz that thing is full of globals&statics
1415
#ifdef WORDS_BIGENDIAN
1416
                        yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_BGR);
1417
#else
1418
                        yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
1419
#endif
1420
                        c->swScale= planarYuvToBgr;
1421

    
1422
                        if(flags&SWS_PRINT_INFO)
1423
                                printf("SwScaler: using unscaled %s -> %s special converter\n", 
1424
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1425
                        return c;
1426
                }
1427

    
1428
                /* simple copy */
1429
                if(srcFormat == dstFormat || (isPlanarYUV(srcFormat) && isPlanarYUV(dstFormat)))
1430
                {
1431
                        c->swScale= simpleCopy;
1432

    
1433
                        if(flags&SWS_PRINT_INFO)
1434
                                printf("SwScaler: using unscaled %s -> %s special converter\n", 
1435
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1436
                        return c;
1437
                }
1438
                
1439
                /* bgr32to24 & rgb32to24*/
1440
                if((srcFormat==IMGFMT_BGR32 && dstFormat==IMGFMT_BGR24)
1441
                 ||(srcFormat==IMGFMT_RGB32 && dstFormat==IMGFMT_RGB24))
1442
                {
1443
                        c->swScale= bgr32to24Wrapper;
1444

    
1445
                        if(flags&SWS_PRINT_INFO)
1446
                                printf("SwScaler: using unscaled %s -> %s special converter\n", 
1447
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1448
                        return c;
1449
                }
1450
                
1451
                /* bgr24to32 & rgb24to32*/
1452
                if((srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_BGR32)
1453
                 ||(srcFormat==IMGFMT_RGB24 && dstFormat==IMGFMT_RGB32))
1454
                {
1455
                        c->swScale= bgr24to32Wrapper;
1456

    
1457
                        if(flags&SWS_PRINT_INFO)
1458
                                printf("SwScaler: using unscaled %s -> %s special converter\n", 
1459
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1460
                        return c;
1461
                }
1462

    
1463
                /* bgr15to16 */
1464
                if(srcFormat==IMGFMT_BGR15 && dstFormat==IMGFMT_BGR16)
1465
                {
1466
                        c->swScale= bgr15to16Wrapper;
1467

    
1468
                        if(flags&SWS_PRINT_INFO)
1469
                                printf("SwScaler: using unscaled %s -> %s special converter\n", 
1470
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1471
                        return c;
1472
                }
1473

    
1474
                /* bgr24toYV12 */
1475
                if(srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_YV12)
1476
                {
1477
                        c->swScale= bgr24toyv12Wrapper;
1478

    
1479
                        if(flags&SWS_PRINT_INFO)
1480
                                printf("SwScaler: using unscaled %s -> %s special converter\n", 
1481
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1482
                        return c;
1483
                }
1484
        }
1485

    
1486
        if(cpuCaps.hasMMX2)
1487
        {
1488
                c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
1489
                if(!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR))
1490
                {
1491
                        if(flags&SWS_PRINT_INFO)
1492
                                fprintf(stderr, "SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
1493
                }
1494
        }
1495
        else
1496
                c->canMMX2BeUsed=0;
1497

    
1498

    
1499
        /* dont use full vertical UV input/internaly if the source doesnt even have it */
1500
        if(isHalfChrV(srcFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_V);
1501
        /* dont use full horizontal UV input if the source doesnt even have it */
1502
        if(isHalfChrH(srcFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_H_INP);
1503
        /* dont use full horizontal UV internally if the destination doesnt even have it */
1504
        if(isHalfChrH(dstFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_H_INT);
1505

    
1506
        if(flags&SWS_FULL_CHR_H_INP)        c->chrSrcW= srcW;
1507
        else                                c->chrSrcW= (srcW+1)>>1;
1508

    
1509
        if(flags&SWS_FULL_CHR_H_INT)        c->chrDstW= dstW;
1510
        else                                c->chrDstW= (dstW+1)>>1;
1511

    
1512
        if(flags&SWS_FULL_CHR_V)        c->chrSrcH= srcH;
1513
        else                                c->chrSrcH= (srcH+1)>>1;
1514

    
1515
        if(isHalfChrV(dstFormat))        c->chrDstH= (dstH+1)>>1;
1516
        else                                c->chrDstH= dstH;
1517

    
1518
        c->chrXInc= ((c->chrSrcW<<16) + (c->chrDstW>>1))/c->chrDstW;
1519
        c->chrYInc= ((c->chrSrcH<<16) + (c->chrDstH>>1))/c->chrDstH;
1520

    
1521

    
1522
        // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
1523
        // but only for the FAST_BILINEAR mode otherwise do correct scaling
1524
        // n-2 is the last chrominance sample available
1525
        // this is not perfect, but noone shuld notice the difference, the more correct variant
1526
        // would be like the vertical one, but that would require some special code for the
1527
        // first and last pixel
1528
        if(flags&SWS_FAST_BILINEAR)
1529
        {
1530
                if(c->canMMX2BeUsed)
1531
                {
1532
                        c->lumXInc+= 20;
1533
                        c->chrXInc+= 20;
1534
                }
1535
                //we dont use the x86asm scaler if mmx is available
1536
                else if(cpuCaps.hasMMX)
1537
                {
1538
                        c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
1539
                        c->chrXInc = ((c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20;
1540
                }
1541
        }
1542

    
1543
        /* precalculate horizontal scaler filter coefficients */
1544
        {
1545
                const int filterAlign= cpuCaps.hasMMX ? 4 : 1;
1546

    
1547
                initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
1548
                                 srcW      ,       dstW, filterAlign, 1<<14, flags,
1549
                                 srcFilter->lumH, dstFilter->lumH);
1550
                initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
1551
                                (srcW+1)>>1, c->chrDstW, filterAlign, 1<<14, flags,
1552
                                 srcFilter->chrH, dstFilter->chrH);
1553

    
1554
#ifdef ARCH_X86
1555
// cant downscale !!!
1556
                if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
1557
                {
1558
                        initMMX2HScaler(      dstW, c->lumXInc, c->funnyYCode);
1559
                        initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode);
1560
                }
1561
#endif
1562
        } // Init Horizontal stuff
1563

    
1564

    
1565

    
1566
        /* precalculate vertical scaler filter coefficients */
1567
        initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
1568
                        srcH      ,        dstH, 1, (1<<12)-4, flags,
1569
                        srcFilter->lumV, dstFilter->lumV);
1570
        initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
1571
                        (srcH+1)>>1, c->chrDstH, 1, (1<<12)-4, flags,
1572
                         srcFilter->chrV, dstFilter->chrV);
1573

    
1574
        // Calculate Buffer Sizes so that they wont run out while handling these damn slices
1575
        c->vLumBufSize= c->vLumFilterSize;
1576
        c->vChrBufSize= c->vChrFilterSize;
1577
        for(i=0; i<dstH; i++)
1578
        {
1579
                int chrI= i*c->chrDstH / dstH;
1580
                int nextSlice= MAX(c->vLumFilterPos[i   ] + c->vLumFilterSize - 1,
1581
                                 ((c->vChrFilterPos[chrI] + c->vChrFilterSize - 1)<<1));
1582
                nextSlice&= ~1; // Slices start at even boundaries
1583
                if(c->vLumFilterPos[i   ] + c->vLumBufSize < nextSlice)
1584
                        c->vLumBufSize= nextSlice - c->vLumFilterPos[i   ];
1585
                if(c->vChrFilterPos[chrI] + c->vChrBufSize < (nextSlice>>1))
1586
                        c->vChrBufSize= (nextSlice>>1) - c->vChrFilterPos[chrI];
1587
        }
1588

    
1589
        // allocate pixbufs (we use dynamic allocation because otherwise we would need to
1590
        c->lumPixBuf= (int16_t**)memalign(4, c->vLumBufSize*2*sizeof(int16_t*));
1591
        c->chrPixBuf= (int16_t**)memalign(4, c->vChrBufSize*2*sizeof(int16_t*));
1592
        //Note we need at least one pixel more at the end because of the mmx code (just in case someone wanna replace the 4000/8000)
1593
        for(i=0; i<c->vLumBufSize; i++)
1594
                c->lumPixBuf[i]= c->lumPixBuf[i+c->vLumBufSize]= (uint16_t*)memalign(8, 4000);
1595
        for(i=0; i<c->vChrBufSize; i++)
1596
                c->chrPixBuf[i]= c->chrPixBuf[i+c->vChrBufSize]= (uint16_t*)memalign(8, 8000);
1597

    
1598
        //try to avoid drawing green stuff between the right end and the stride end
1599
        for(i=0; i<c->vLumBufSize; i++) memset(c->lumPixBuf[i], 0, 4000);
1600
        for(i=0; i<c->vChrBufSize; i++) memset(c->chrPixBuf[i], 64, 8000);
1601

    
1602
        ASSERT(c->chrDstH <= dstH)
1603

    
1604
        // pack filter data for mmx code
1605
        if(cpuCaps.hasMMX)
1606
        {
1607
                c->lumMmxFilter= (int16_t*)memalign(8, c->vLumFilterSize*      dstH*4*sizeof(int16_t));
1608
                c->chrMmxFilter= (int16_t*)memalign(8, c->vChrFilterSize*c->chrDstH*4*sizeof(int16_t));
1609
                for(i=0; i<c->vLumFilterSize*dstH; i++)
1610
                        c->lumMmxFilter[4*i]=c->lumMmxFilter[4*i+1]=c->lumMmxFilter[4*i+2]=c->lumMmxFilter[4*i+3]=
1611
                                c->vLumFilter[i];
1612
                for(i=0; i<c->vChrFilterSize*c->chrDstH; i++)
1613
                        c->chrMmxFilter[4*i]=c->chrMmxFilter[4*i+1]=c->chrMmxFilter[4*i+2]=c->chrMmxFilter[4*i+3]=
1614
                                c->vChrFilter[i];
1615
        }
1616

    
1617
        if(flags&SWS_PRINT_INFO)
1618
        {
1619
#ifdef DITHER1XBPP
1620
                char *dither= " dithered";
1621
#else
1622
                char *dither= "";
1623
#endif
1624
                if(flags&SWS_FAST_BILINEAR)
1625
                        fprintf(stderr, "\nSwScaler: FAST_BILINEAR scaler, ");
1626
                else if(flags&SWS_BILINEAR)
1627
                        fprintf(stderr, "\nSwScaler: BILINEAR scaler, ");
1628
                else if(flags&SWS_BICUBIC)
1629
                        fprintf(stderr, "\nSwScaler: BICUBIC scaler, ");
1630
                else if(flags&SWS_X)
1631
                        fprintf(stderr, "\nSwScaler: Experimental scaler, ");
1632
                else if(flags&SWS_POINT)
1633
                        fprintf(stderr, "\nSwScaler: Nearest Neighbor / POINT scaler, ");
1634
                else if(flags&SWS_AREA)
1635
                        fprintf(stderr, "\nSwScaler: Area Averageing scaler, ");
1636
                else
1637
                        fprintf(stderr, "\nSwScaler: ehh flags invalid?! ");
1638

    
1639
                if(dstFormat==IMGFMT_BGR15 || dstFormat==IMGFMT_BGR16)
1640
                        fprintf(stderr, "from %s to%s %s ", 
1641
                                vo_format_name(srcFormat), dither, vo_format_name(dstFormat));
1642
                else
1643
                        fprintf(stderr, "from %s to %s ", 
1644
                                vo_format_name(srcFormat), vo_format_name(dstFormat));
1645

    
1646
                if(cpuCaps.hasMMX2)
1647
                        fprintf(stderr, "using MMX2\n");
1648
                else if(cpuCaps.has3DNow)
1649
                        fprintf(stderr, "using 3DNOW\n");
1650
                else if(cpuCaps.hasMMX)
1651
                        fprintf(stderr, "using MMX\n");
1652
                else
1653
                        fprintf(stderr, "using C\n");
1654
        }
1655

    
1656
        if((flags & SWS_PRINT_INFO) && verbose)
1657
        {
1658
                if(cpuCaps.hasMMX)
1659
                {
1660
                        if(c->canMMX2BeUsed && (flags&SWS_FAST_BILINEAR))
1661
                                printf("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
1662
                        else
1663
                        {
1664
                                if(c->hLumFilterSize==4)
1665
                                        printf("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
1666
                                else if(c->hLumFilterSize==8)
1667
                                        printf("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
1668
                                else
1669
                                        printf("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
1670

    
1671
                                if(c->hChrFilterSize==4)
1672
                                        printf("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
1673
                                else if(c->hChrFilterSize==8)
1674
                                        printf("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
1675
                                else
1676
                                        printf("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
1677
                        }
1678
                }
1679
                else
1680
                {
1681
#ifdef ARCH_X86
1682
                        printf("SwScaler: using X86-Asm scaler for horizontal scaling\n");
1683
#else
1684
                        if(flags & SWS_FAST_BILINEAR)
1685
                                printf("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
1686
                        else
1687
                                printf("SwScaler: using C scaler for horizontal scaling\n");
1688
#endif
1689
                }
1690
                if(isPlanarYUV(dstFormat))
1691
                {
1692
                        if(c->vLumFilterSize==1)
1693
                                printf("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
1694
                        else
1695
                                printf("SwScaler: using n-tap %s scaler for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
1696
                }
1697
                else
1698
                {
1699
                        if(c->vLumFilterSize==1 && c->vChrFilterSize==2)
1700
                                printf("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
1701
                                       "SwScaler:       2-tap scaler for vertical chrominance scaling (BGR)\n",cpuCaps.hasMMX ? "MMX" : "C");
1702
                        else if(c->vLumFilterSize==2 && c->vChrFilterSize==2)
1703
                                printf("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
1704
                        else
1705
                                printf("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
1706
                }
1707

    
1708
                if(dstFormat==IMGFMT_BGR24)
1709
                        printf("SwScaler: using %s YV12->BGR24 Converter\n",
1710
                                cpuCaps.hasMMX2 ? "MMX2" : (cpuCaps.hasMMX ? "MMX" : "C"));
1711
                else if(dstFormat==IMGFMT_BGR32)
1712
                        printf("SwScaler: using %s YV12->BGR32 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1713
                else if(dstFormat==IMGFMT_BGR16)
1714
                        printf("SwScaler: using %s YV12->BGR16 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1715
                else if(dstFormat==IMGFMT_BGR15)
1716
                        printf("SwScaler: using %s YV12->BGR15 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1717

    
1718
                printf("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
1719
        }
1720
        if((flags & SWS_PRINT_INFO) && verbose>1)
1721
        {
1722
                printf("SwScaler:Lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
1723
                        c->srcW, c->srcH, c->dstW, c->dstH, c->lumXInc, c->lumYInc);
1724
                printf("SwScaler:Chr srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
1725
                        c->chrSrcW, c->chrSrcH, c->chrDstW, c->chrDstH, c->chrXInc, c->chrYInc);
1726
        }
1727

    
1728
        c->swScale= swScale;
1729
        return c;
1730
}
1731

    
1732
/**
1733
 * returns a normalized gaussian curve used to filter stuff
1734
 * quality=3 is high quality, lowwer is lowwer quality
1735
 */
1736

    
1737
SwsVector *getGaussianVec(double variance, double quality){
1738
        const int length= (int)(variance*quality + 0.5) | 1;
1739
        int i;
1740
        double *coeff= memalign(sizeof(double), length*sizeof(double));
1741
        double middle= (length-1)*0.5;
1742
        SwsVector *vec= malloc(sizeof(SwsVector));
1743

    
1744
        vec->coeff= coeff;
1745
        vec->length= length;
1746

    
1747
        for(i=0; i<length; i++)
1748
        {
1749
                double dist= i-middle;
1750
                coeff[i]= exp( -dist*dist/(2*variance*variance) ) / sqrt(2*variance*PI);
1751
        }
1752

    
1753
        normalizeVec(vec, 1.0);
1754

    
1755
        return vec;
1756
}
1757

    
1758
SwsVector *getConstVec(double c, int length){
1759
        int i;
1760
        double *coeff= memalign(sizeof(double), length*sizeof(double));
1761
        SwsVector *vec= malloc(sizeof(SwsVector));
1762

    
1763
        vec->coeff= coeff;
1764
        vec->length= length;
1765

    
1766
        for(i=0; i<length; i++)
1767
                coeff[i]= c;
1768

    
1769
        return vec;
1770
}
1771

    
1772

    
1773
SwsVector *getIdentityVec(void){
1774
        double *coeff= memalign(sizeof(double), sizeof(double));
1775
        SwsVector *vec= malloc(sizeof(SwsVector));
1776
        coeff[0]= 1.0;
1777

    
1778
        vec->coeff= coeff;
1779
        vec->length= 1;
1780

    
1781
        return vec;
1782
}
1783

    
1784
/**
 * Scales all coefficients of a in place so that their sum equals height.
 *
 * @param a      vector to normalize (modified in place)
 * @param height sum the coefficients should have afterwards
 *
 * No check is made for a coefficient sum of zero; such a vector cannot be
 * normalized and the division below then yields non-finite coefficients.
 */
void normalizeVec(SwsVector *a, double height){
        int i;
        double sum=0;
        double inv;

        for(i=0; i<a->length; i++)
                sum+= a->coeff[i];

        // factor which maps the current sum onto the requested one
        inv= height/sum;

        // multiply by inv, not by height: scaling by height alone would leave
        // the vector un-normalized (and be a no-op for height == 1.0)
        for(i=0; i<a->length; i++)
                a->coeff[i]*= inv;
}
1797

    
1798
/**
 * Multiplies every coefficient of a by scalar, in place.
 */
void scaleVec(SwsVector *a, double scalar){
        int remaining= a->length;
        double *cur= a->coeff;

        while(remaining-- > 0)
        {
                *cur*= scalar;
                cur++;
        }
}
1804

    
1805
/**
 * Returns a newly allocated vector holding the discrete convolution of a and b.
 * The result has a->length + b->length - 1 coefficients; a and b are not
 * modified.
 */
static SwsVector *getConvVec(SwsVector *a, SwsVector *b){
        const int length= a->length + b->length - 1;
        double *out= memalign(sizeof(double), length*sizeof(double));
        SwsVector *result= malloc(sizeof(SwsVector));
        int ai, bi, k;

        result->length= length;
        result->coeff= out;

        // start from an all zero accumulator
        k= 0;
        while(k < length) out[k++]= 0.0;

        // every tap of a spreads a scaled copy of b into the output, starting
        // at the tap's own position
        for(ai=0; ai<a->length; ai++)
        {
                double *dst= out + ai;

                for(bi=0; bi<b->length; bi++)
                        dst[bi]+= a->coeff[ai]*b->coeff[bi];
        }

        return result;
}
1826

    
1827
/**
 * Returns a newly allocated vector holding a + b.
 * The result is as long as the longer input; each input is placed at the
 * offset (length-1)/2 - (inputLength-1)/2 so that the two are aligned around
 * their middles before being added. a and b are not modified.
 */
static SwsVector *sumVec(SwsVector *a, SwsVector *b){
        const int length= MAX(a->length, b->length);
        double *out= memalign(sizeof(double), length*sizeof(double));
        SwsVector *result= malloc(sizeof(SwsVector));
        // position in the output where the first coefficient of a / b lands
        const int aStart= (length-1)/2 - (a->length-1)/2;
        const int bStart= (length-1)/2 - (b->length-1)/2;
        int k;

        result->length= length;
        result->coeff= out;

        k= 0;
        while(k < length) out[k++]= 0.0;

        for(k=0; k<a->length; k++) out[aStart + k]+= a->coeff[k];
        for(k=0; k<b->length; k++) out[bStart + k]+= b->coeff[k];

        return result;
}
1843

    
1844
/**
 * Returns a newly allocated vector holding a - b.
 * The result is as long as the longer input; each input is placed at the
 * offset (length-1)/2 - (inputLength-1)/2 so that the two are aligned around
 * their middles before b is subtracted from a. a and b are not modified.
 */
static SwsVector *diffVec(SwsVector *a, SwsVector *b){
        const int length= MAX(a->length, b->length);
        double *out= memalign(sizeof(double), length*sizeof(double));
        SwsVector *result= malloc(sizeof(SwsVector));
        // position in the output where the first coefficient of a / b lands
        const int aStart= (length-1)/2 - (a->length-1)/2;
        const int bStart= (length-1)/2 - (b->length-1)/2;
        int k;

        result->length= length;
        result->coeff= out;

        k= 0;
        while(k < length) out[k++]= 0.0;

        for(k=0; k<a->length; k++) out[aStart + k]+= a->coeff[k];
        for(k=0; k<b->length; k++) out[bStart + k]-= b->coeff[k];

        return result;
}
1860

    
1861
/* shift left / or right if "shift" is negative */
1862
static SwsVector *getShiftedVec(SwsVector *a, int shift){
1863
        int length= a->length + ABS(shift)*2;
1864
        double *coeff= memalign(sizeof(double), length*sizeof(double));
1865
        int i;
1866
        SwsVector *vec= malloc(sizeof(SwsVector));
1867

    
1868
        vec->coeff= coeff;
1869
        vec->length= length;
1870

    
1871
        for(i=0; i<length; i++) coeff[i]= 0.0;
1872

    
1873
        for(i=0; i<a->length; i++)
1874
        {
1875
                coeff[i + (length-1)/2 - (a->length-1)/2 - shift]= a->coeff[i];
1876
        }
1877

    
1878
        return vec;
1879
}
1880

    
1881
/**
 * Replaces the contents of a with the result of getShiftedVec(a, shift).
 * The previous coefficient array of a is freed.
 */
void shiftVec(SwsVector *a, int shift){
        SwsVector *tmp= getShiftedVec(a, shift);
        double *old= a->coeff;

        // take over the new array, then drop the temporary wrapper and the old data
        a->length= tmp->length;
        a->coeff= tmp->coeff;

        free(tmp);
        free(old);
}
1888

    
1889
/**
 * Replaces the contents of a with the result of sumVec(a, b); b is not modified.
 * The previous coefficient array of a is freed.
 */
void addVec(SwsVector *a, SwsVector *b){
        SwsVector *tmp= sumVec(a, b);
        double *old= a->coeff;

        // take over the new array, then drop the temporary wrapper and the old data
        a->length= tmp->length;
        a->coeff= tmp->coeff;

        free(tmp);
        free(old);
}
1896

    
1897
/**
 * Replaces the contents of a with the result of diffVec(a, b); b is not modified.
 * The previous coefficient array of a is freed.
 */
void subVec(SwsVector *a, SwsVector *b){
        SwsVector *tmp= diffVec(a, b);
        double *old= a->coeff;

        // take over the new array, then drop the temporary wrapper and the old data
        a->length= tmp->length;
        a->coeff= tmp->coeff;

        free(tmp);
        free(old);
}
1904

    
1905
/**
 * Replaces the contents of a with the result of getConvVec(a, b); b is not
 * modified. The previous coefficient array of a is freed.
 */
void convVec(SwsVector *a, SwsVector *b){
        SwsVector *tmp= getConvVec(a, b);
        double *old= a->coeff;

        // take over the new array, then drop the temporary wrapper and the old data
        a->length= tmp->length;
        a->coeff= tmp->coeff;

        free(tmp);
        free(old);
}
1912

    
1913
/**
 * Returns a newly allocated copy of a (same length, same coefficients).
 * a is not modified.
 */
SwsVector *cloneVec(SwsVector *a){
        double *copy= memalign(sizeof(double), a->length*sizeof(double));
        SwsVector *result= malloc(sizeof(SwsVector));
        int k;

        result->length= a->length;
        result->coeff= copy;

        for(k= a->length-1; k >= 0; k--)
                copy[k]= a->coeff[k];

        return result;
}
1925

    
1926
/**
 * Prints the coefficients of a to stdout, one per line, each followed by a
 * crude bar graph: a run of spaces terminated by '|', where the number of
 * spaces is the coefficient's position between min and max mapped to 0..60.
 * min and max both start at 0, so the plotted range always contains 0.
 */
void printVec(SwsVector *a){
        int i;
        double max=0;
        double min=0;
        double range;

        for(i=0; i<a->length; i++)
        {
                if(a->coeff[i]>max) max= a->coeff[i];
                if(a->coeff[i]<min) min= a->coeff[i];
        }

        range= max - min;

        for(i=0; i<a->length; i++)
        {
                int x= 0;

                // with a zero range (e.g. all coefficients are 0) the scaling
                // below would be 0/0 and converting that NaN to int is undefined,
                // so such vectors are drawn with the bar at column 0
                if(range > 0)
                        x= (int)((a->coeff[i]-min)*60.0/range +0.5);

                printf("%1.3f ", a->coeff[i]);
                for(;x>0; x--) printf(" ");
                printf("|\n");
        }
}
1948

    
1949
/**
 * Frees a vector together with its coefficient array.
 * Passing NULL is allowed and does nothing.
 */
void freeVec(SwsVector *a){
        if(a != NULL)
        {
                free(a->coeff);         // free(NULL) is a no-op, no guard needed
                a->coeff= NULL;
                a->length= 0;
                free(a);
        }
}
1956

    
1957
/**
 * Frees a scaler context and all buffers it points to.
 * Passing NULL is allowed and does nothing.
 *
 * Only the first vLumBufSize / vChrBufSize entries of the pixel buffer pointer
 * arrays are freed: the allocation code in this file stores every line pointer
 * twice (at i and at i+bufSize), so freeing the second half as well would free
 * each line a second time.
 */
void freeSwsContext(SwsContext *c){
        int line;

        if(c == NULL) return;

        /* luminance line buffers */
        if(c->lumPixBuf != NULL)
        {
                for(line= c->vLumBufSize-1; line >= 0; line--)
                {
                        free(c->lumPixBuf[line]);
                        c->lumPixBuf[line]= NULL;
                }
                free(c->lumPixBuf);
                c->lumPixBuf= NULL;
        }

        /* chrominance line buffers */
        if(c->chrPixBuf != NULL)
        {
                for(line= c->vChrBufSize-1; line >= 0; line--)
                {
                        free(c->chrPixBuf[line]);
                        c->chrPixBuf[line]= NULL;
                }
                free(c->chrPixBuf);
                c->chrPixBuf= NULL;
        }

        /* filter coefficients (free(NULL) is a no-op, so no guards are needed) */
        free(c->vLumFilter);    c->vLumFilter= NULL;
        free(c->vChrFilter);    c->vChrFilter= NULL;
        free(c->hLumFilter);    c->hLumFilter= NULL;
        free(c->hChrFilter);    c->hChrFilter= NULL;

        /* filter positions */
        free(c->vLumFilterPos); c->vLumFilterPos= NULL;
        free(c->vChrFilterPos); c->vChrFilterPos= NULL;
        free(c->hLumFilterPos); c->hLumFilterPos= NULL;
        free(c->hChrFilterPos); c->hChrFilterPos= NULL;

        /* vertical filter data packed for the mmx code */
        free(c->lumMmxFilter);  c->lumMmxFilter= NULL;
        free(c->chrMmxFilter);  c->chrMmxFilter= NULL;

        free(c);
}
2009

    
2010