/* Source: ffmpeg / postproc / swscale.c @ ec22603f (55.1 KB) */

1
/*
2
    Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
3

4
    This program is free software; you can redistribute it and/or modify
5
    it under the terms of the GNU General Public License as published by
6
    the Free Software Foundation; either version 2 of the License, or
7
    (at your option) any later version.
8

9
    This program is distributed in the hope that it will be useful,
10
    but WITHOUT ANY WARRANTY; without even the implied warranty of
11
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
    GNU General Public License for more details.
13

14
    You should have received a copy of the GNU General Public License
15
    along with this program; if not, write to the Free Software
16
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17
*/
18

    
19
/*
20
  supported Input formats: YV12, I420, IYUV, YUY2, BGR32, BGR24, BGR16, BGR15, RGB32, RGB24, Y8, Y800
21
  supported output formats: YV12, I420, IYUV, BGR15, BGR16, BGR24, BGR32 (grayscale soon too)
22
  BGR15/16 support dithering
23
  
24
  unscaled special converters
25
  YV12/I420/IYUV -> BGR15/BGR16/BGR24/BGR32
26
  YV12/I420/IYUV -> YV12/I420/IYUV
27
  YUY2/BGR15/BGR16/BGR24/BGR32/RGB24/RGB32 -> same format
28
  BGR24 -> BGR32 & RGB24 -> RGB32
29
  BGR32 -> BGR24 & RGB32 -> RGB24
30
  BGR15 -> BGR16
31
*/
32

    
33
/*
tested special converters
 YV12/I420 -> BGR16
 YV12 -> YV12
 BGR15 -> BGR16

untested special converters
  YV12/I420/IYUV -> BGR15/BGR24/BGR32 (it's the yuv2rgb stuff, so it should be ok)
  YV12/I420/IYUV -> YV12/I420/IYUV
  YUY2/BGR15/BGR16/BGR24/BGR32/RGB24/RGB32 -> same format
  BGR24 -> BGR32 & RGB24 -> RGB32
  BGR32 -> BGR24 & RGB32 -> RGB24
  BGR24 -> YV12
*/
47

    
48
#include <inttypes.h>
49
#include <string.h>
50
#include <math.h>
51
#include <stdio.h>
52
#include "../config.h"
53
#include "../mangle.h"
54
#ifdef HAVE_MALLOC_H
55
#include <malloc.h>
56
#endif
57
#include "swscale.h"
58
#include "../cpudetect.h"
59
#include "../bswap.h"
60
#include "../libvo/img_format.h"
61
#include "rgb2rgb.h"
62
#include "../libvo/fastmemcpy.h"
63
#undef MOVNTQ
64
#undef PAVGB
65

    
66
/* Developer toggles: uncomment to override what config.h detected. */
//#undef HAVE_MMX2
//#define HAVE_3DNOW
//#undef HAVE_MMX
//#undef ARCH_X86

/* Enables the alternating dither offsets used by the BGR15/BGR16 C output. */
#define DITHER1XBPP

#define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit

#define RET 0xC3 //near return opcode for X86

/*
 * Debug assertion: prints the failed expression and then forces a crash by
 * writing through a NULL pointer. Expands to a lone ';' in non-debug builds.
 * NOTE(review): the expansion is a bare if-statement / bare ';', so it is kept
 * exactly as-is in case call sites rely on writing ASSERT(x) without a
 * trailing semicolon - confirm before converting to do { } while(0).
 */
#ifdef MP_DEBUG
#define ASSERT(x) if(!(x)) { printf("ASSERT " #x " failed\n"); *((int*)0)=0; }
#else
#define ASSERT(x) ;
#endif

/* PI: use the libm constant when the headers expose it. */
#ifdef M_PI
#define PI M_PI
#else
#define PI 3.14159265358979323846
#endif
87

    
88
/*
 * Image format classification helpers. Each macro evaluates to non-zero when
 * the IMGFMT_* code x belongs to the named class. x may be evaluated several
 * times, so do not pass expressions with side effects.
 */
//FIXME replace this with something faster
#define isPlanarYUV(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
#define isYUV(x)       ((x)==IMGFMT_YUY2 || isPlanarYUV(x))
/* chroma is vertically subsampled exactly for the planar YUV formats */
#define isHalfChrV(x)  (isPlanarYUV(x))
/* chroma is horizontally subsampled for YUY2 and the planar YUV formats */
#define isHalfChrH(x)  ((x)==IMGFMT_YUY2 || (x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
#define isPacked(x)    ((x)==IMGFMT_YUY2 || ((x)&IMGFMT_BGR_MASK)==IMGFMT_BGR || ((x)&IMGFMT_RGB_MASK)==IMGFMT_RGB)
#define isGray(x)      ((x)==IMGFMT_Y800)
/* formats accepted as scaler input */
#define isSupportedIn(x)  ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_YUY2 \
                        || (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15\
                        || (x)==IMGFMT_RGB32|| (x)==IMGFMT_RGB24\
                        || (x)==IMGFMT_Y800)
/* formats accepted as scaler output (no gray, YUY2 or RGB-ordered output) */
#define isSupportedOut(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 \
                        || (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15)
#define isBGR(x)       ((x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15)
102

    
103
/*
 * RGB -> YUV conversion matrix in fixed point with RGB2YUV_SHIFT fractional
 * bits. The first letter is the source channel (B/G/R), the second the
 * destination component (Y/U/V).
 * Note: "+0.5" followed by truncation rounds the negative coefficients
 * towards zero rather than to nearest; left unchanged so the output of the
 * existing converters does not shift.
 */
#define RGB2YUV_SHIFT 16
#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
113

    
114
extern int verbose; // defined in mplayer.c
/*
NOTES

known BUGS with known cause (no bug reports please!, but patches are welcome :) )
horizontal fast_bilinear MMX2 scaler reads 1-7 samples too much (might cause a sig11)

Special versions: fast Y 1:1 scaling (no interpolation in y direction)

TODO
more intelligent misalignment avoidance for the horizontal scaler
write special vertical cubic upscale version
Optimize C code (yv12 / minmax)
add support for packed pixel yuv input & output
add support for Y8 output
optimize bgr24 & bgr32
add BGR4 output support
write special BGR->BGR scaler
deglobalize yuv2rgb*.c
*/

/*
 * Small arithmetic helpers. Arguments may be evaluated more than once, so do
 * not pass expressions with side effects.
 */
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#define MAX(a,b) ((a) < (b) ? (b) : (a))

/* The x86 inline-assembly code paths are only built on x86 targets. */
#ifdef ARCH_X86
#define CAN_COMPILE_X86_ASM
#endif
142

    
143
#ifdef CAN_COMPILE_X86_ASM
/*
 * 8-byte aligned 64-bit constants and scratch variables for the x86 code paths.
 * NOTE(review): none of these are used by the C code in this file other than
 * in_asm_used_var_warning_killer(); presumably they are referenced by symbol
 * name from inline asm in swscale_template.c - confirm before renaming,
 * const-qualifying or removing any of them.
 */

/* yuv->rgb coefficients, one 16-bit value replicated four times */
static uint64_t __attribute__((aligned(8))) yCoeff=    0x2568256825682568LL;
static uint64_t __attribute__((aligned(8))) vrCoeff=   0x3343334333433343LL;
static uint64_t __attribute__((aligned(8))) ubCoeff=   0x40cf40cf40cf40cfLL;
static uint64_t __attribute__((aligned(8))) vgCoeff=   0xE5E2E5E2E5E2E5E2LL;
static uint64_t __attribute__((aligned(8))) ugCoeff=   0xF36EF36EF36EF36ELL;
/* byte and word patterns */
static uint64_t __attribute__((aligned(8))) bF8=       0xF8F8F8F8F8F8F8F8LL;
static uint64_t __attribute__((aligned(8))) bFC=       0xFCFCFCFCFCFCFCFCLL;
static uint64_t __attribute__((aligned(8))) w400=      0x0400040004000400LL;
static uint64_t __attribute__((aligned(8))) w80=       0x0080008000800080LL;
static uint64_t __attribute__((aligned(8))) w10=       0x0010001000100010LL;
static uint64_t __attribute__((aligned(8))) w02=       0x0002000200020002LL;
/* byte masks, named after the bytes they keep (most significant byte first) */
static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
static uint64_t __attribute__((aligned(8))) bm01010101=0x00FF00FF00FF00FFLL;

/* current per-channel dither values (written at run time) */
static volatile uint64_t __attribute__((aligned(8))) b5Dither;
static volatile uint64_t __attribute__((aligned(8))) g5Dither;
static volatile uint64_t __attribute__((aligned(8))) g6Dither;
static volatile uint64_t __attribute__((aligned(8))) r5Dither;

static uint64_t __attribute__((aligned(8))) dither4[2]={
        0x0103010301030103LL,
        0x0200020002000200LL,};

static uint64_t __attribute__((aligned(8))) dither8[2]={
        0x0602060206020602LL,
        0x0004000400040004LL,};

/* channel bit masks of four packed 16bpp / 15bpp pixels */
static uint64_t __attribute__((aligned(8))) b16Mask=   0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g16Mask=   0x07E007E007E007E0LL;
static uint64_t __attribute__((aligned(8))) r16Mask=   0xF800F800F800F800LL;
static uint64_t __attribute__((aligned(8))) b15Mask=   0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g15Mask=   0x03E003E003E003E0LL;
static uint64_t __attribute__((aligned(8))) r15Mask=   0x7C007C007C007C00LL;

/* every-third-byte masks */
static uint64_t __attribute__((aligned(8))) M24A=   0x00FF0000FF0000FFLL;
static uint64_t __attribute__((aligned(8))) M24B=   0xFF0000FF0000FF00LL;
static uint64_t __attribute__((aligned(8))) M24C=   0x0000FF0000FF0000LL;

/* bgr->yv12 coefficients; FAST_BGR2YV12 selects the reduced-precision set */
#ifdef FAST_BGR2YV12
static const uint64_t bgr2YCoeff  __attribute__((aligned(8))) = 0x000000210041000DULL;
static const uint64_t bgr2UCoeff  __attribute__((aligned(8))) = 0x0000FFEEFFDC0038ULL;
static const uint64_t bgr2VCoeff  __attribute__((aligned(8))) = 0x00000038FFD2FFF8ULL;
#else
static const uint64_t bgr2YCoeff  __attribute__((aligned(8))) = 0x000020E540830C8BULL;
static const uint64_t bgr2UCoeff  __attribute__((aligned(8))) = 0x0000ED0FDAC23831ULL;
static const uint64_t bgr2VCoeff  __attribute__((aligned(8))) = 0x00003831D0E6F6EAULL;
#endif
static const uint64_t bgr2YOffset __attribute__((aligned(8))) = 0x1010101010101010ULL;
static const uint64_t bgr2UVOffset __attribute__((aligned(8)))= 0x8080808080808080ULL;
static const uint64_t w1111       __attribute__((aligned(8))) = 0x0001000100010001ULL;

// FIXME remove
static uint64_t __attribute__((aligned(8))) asm_yalpha1;
static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
#endif
201

    
202
/*
 * Lookup tables used by the C converters. They are zero-initialised here and
 * must be filled in elsewhere before the converters are used.
 */

// clipping helper table for C implementations:
static unsigned char clip_table[768];

// per-channel clipping tables for 16bpp (b/g/r) and 15bpp (b/g/r) output:
static unsigned short clip_table16b[768];
static unsigned short clip_table16g[768];
static unsigned short clip_table16r[768];
static unsigned short clip_table15b[768];
static unsigned short clip_table15g[768];
static unsigned short clip_table15r[768];

// yuv->rgb conversion tables (256 entries, valid indices 0..255 only):
static    int yuvtab_2568[256];
static    int yuvtab_3343[256];
static    int yuvtab_0c92[256];
static    int yuvtab_1a1e[256];
static    int yuvtab_40cf[256];
// Needed for cubic scaler to catch overflows
// (768 entries; the converters index them with "value + 256")
static    int clip_yuvtab_2568[768];
static    int clip_yuvtab_3343[768];
static    int clip_yuvtab_0c92[768];
static    int clip_yuvtab_1a1e[768];
static    int clip_yuvtab_40cf[768];
224

    
225
//global sws_flags from the command line
226
int sws_flags=2;
227

    
228
//global srcFilter
229
SwsFilter src_filter= {NULL, NULL, NULL, NULL};
230

    
231
float sws_lum_gblur= 0.0;
232
float sws_chr_gblur= 0.0;
233
int sws_chr_vshift= 0;
234
int sws_chr_hshift= 0;
235
float sws_chr_sharpen= 0.0;
236
float sws_lum_sharpen= 0.0;
237

    
238
/* cpuCaps combined from cpudetect and whats actually compiled in
239
   (if there is no support for something compiled in it wont appear here) */
240
static CpuCaps cpuCaps;
241

    
242
void (*swScale)(SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
243
             int srcSliceH, uint8_t* dst[], int dstStride[])=NULL;
244

    
245
static SwsVector *getConvVec(SwsVector *a, SwsVector *b);
246

    
247
#ifdef CAN_COMPILE_X86_ASM
/**
 * Never meant to do useful work: it reads every asm-only constant/variable
 * once from C so the compiler does not report them as unused.
 * The 64-bit sum is truncated into a volatile int and then discarded.
 */
void in_asm_used_var_warning_killer()
{
        volatile int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
                bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+
                asm_yalpha1+asm_uvalpha1+
                M24A+M24B+M24C+w02+b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0]+bm01010101;
        if(i) i=0;
}
#endif
256

    
257
/**
 * Vertical filter with planar YUV output, plain C version.
 *
 * Every output sample is the sum of filterSize source samples (one per
 * source line) weighted by the matching filter coefficient, shifted right
 * by 19 bits and clamped to 0..255.
 *
 * @param lumFilter     lumFilterSize luma coefficients
 * @param lumSrc        lumFilterSize luma line pointers, each with at least dstW samples
 * @param lumFilterSize number of luma taps
 * @param chrFilter     chrFilterSize chroma coefficients
 * @param chrSrc        chrFilterSize chroma line pointers; U samples start at
 *                      index 0 and V samples at index 2048 of every line
 * @param chrFilterSize number of chroma taps
 * @param dest          luma output, dstW bytes
 * @param uDest         U output, dstW>>1 bytes; NULL skips the chroma pass entirely
 * @param vDest         V output, dstW>>1 bytes (only used when uDest is not NULL)
 * @param dstW          output width in luma samples
 */
static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
{
        //FIXME Optimize (just quickly written, not optimized)
        int i;
        int j;

        for(i=0; i<dstW; i++)
        {
                int val=0;
                for(j=0; j<lumFilterSize; j++)
                        val += lumSrc[j][i] * lumFilter[j];

                val >>= 19;
                dest[i]= val<0 ? 0 : (val>255 ? 255 : val);
        }

        if(uDest == NULL)
                return;

        // chroma is horizontally subsampled: half as many samples per line
        for(i=0; i<(dstW>>1); i++)
        {
                int u=0;
                int v=0;
                for(j=0; j<chrFilterSize; j++)
                {
                        u += chrSrc[j][i] * chrFilter[j];
                        v += chrSrc[j][i + 2048] * chrFilter[j]; // V plane lives 2048 samples after U
                }

                u >>= 19;
                v >>= 19;
                uDest[i]= u<0 ? 0 : (u>255 ? 255 : u);
                vDest[i]= v<0 ? 0 : (v>255 ? 255 : v);
        }
}
289

    
290
static inline void yuv2rgbXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
291
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
292
                                    uint8_t *dest, int dstW, int dstFormat)
293
{
294
        if(dstFormat==IMGFMT_BGR32)
295
        {
296
                int i;
297
                for(i=0; i<(dstW>>1); i++){
298
                        int j;
299
                        int Y1=0;
300
                        int Y2=0;
301
                        int U=0;
302
                        int V=0;
303
                        int Cb, Cr, Cg;
304
                        for(j=0; j<lumFilterSize; j++)
305
                        {
306
                                Y1 += lumSrc[j][2*i] * lumFilter[j];
307
                                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
308
                        }
309
                        for(j=0; j<chrFilterSize; j++)
310
                        {
311
                                U += chrSrc[j][i] * chrFilter[j];
312
                                V += chrSrc[j][i+2048] * chrFilter[j];
313
                        }
314
                        Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
315
                        Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
316
                        U >>= 19;
317
                        V >>= 19;
318

    
319
                        Cb= clip_yuvtab_40cf[U+ 256];
320
                        Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
321
                        Cr= clip_yuvtab_3343[V+ 256];
322

    
323
                        dest[8*i+0]=clip_table[((Y1 + Cb) >>13)];
324
                        dest[8*i+1]=clip_table[((Y1 + Cg) >>13)];
325
                        dest[8*i+2]=clip_table[((Y1 + Cr) >>13)];
326

    
327
                        dest[8*i+4]=clip_table[((Y2 + Cb) >>13)];
328
                        dest[8*i+5]=clip_table[((Y2 + Cg) >>13)];
329
                        dest[8*i+6]=clip_table[((Y2 + Cr) >>13)];
330
                }
331
        }
332
        else if(dstFormat==IMGFMT_BGR24)
333
        {
334
                int i;
335
                for(i=0; i<(dstW>>1); i++){
336
                        int j;
337
                        int Y1=0;
338
                        int Y2=0;
339
                        int U=0;
340
                        int V=0;
341
                        int Cb, Cr, Cg;
342
                        for(j=0; j<lumFilterSize; j++)
343
                        {
344
                                Y1 += lumSrc[j][2*i] * lumFilter[j];
345
                                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
346
                        }
347
                        for(j=0; j<chrFilterSize; j++)
348
                        {
349
                                U += chrSrc[j][i] * chrFilter[j];
350
                                V += chrSrc[j][i+2048] * chrFilter[j];
351
                        }
352
                        Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
353
                        Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
354
                        U >>= 19;
355
                        V >>= 19;
356

    
357
                        Cb= clip_yuvtab_40cf[U+ 256];
358
                        Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
359
                        Cr= clip_yuvtab_3343[V+ 256];
360

    
361
                        dest[0]=clip_table[((Y1 + Cb) >>13)];
362
                        dest[1]=clip_table[((Y1 + Cg) >>13)];
363
                        dest[2]=clip_table[((Y1 + Cr) >>13)];
364

    
365
                        dest[3]=clip_table[((Y2 + Cb) >>13)];
366
                        dest[4]=clip_table[((Y2 + Cg) >>13)];
367
                        dest[5]=clip_table[((Y2 + Cr) >>13)];
368
                        dest+=6;
369
                }
370
        }
371
        else if(dstFormat==IMGFMT_BGR16)
372
        {
373
                int i;
374
#ifdef DITHER1XBPP
375
                static int ditherb1=1<<14;
376
                static int ditherg1=1<<13;
377
                static int ditherr1=2<<14;
378
                static int ditherb2=3<<14;
379
                static int ditherg2=3<<13;
380
                static int ditherr2=0<<14;
381

    
382
                ditherb1 ^= (1^2)<<14;
383
                ditherg1 ^= (1^2)<<13;
384
                ditherr1 ^= (1^2)<<14;
385
                ditherb2 ^= (3^0)<<14;
386
                ditherg2 ^= (3^0)<<13;
387
                ditherr2 ^= (3^0)<<14;
388
#else
389
                const int ditherb1=0;
390
                const int ditherg1=0;
391
                const int ditherr1=0;
392
                const int ditherb2=0;
393
                const int ditherg2=0;
394
                const int ditherr2=0;
395
#endif
396
                for(i=0; i<(dstW>>1); i++){
397
                        int j;
398
                        int Y1=0;
399
                        int Y2=0;
400
                        int U=0;
401
                        int V=0;
402
                        int Cb, Cr, Cg;
403
                        for(j=0; j<lumFilterSize; j++)
404
                        {
405
                                Y1 += lumSrc[j][2*i] * lumFilter[j];
406
                                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
407
                        }
408
                        for(j=0; j<chrFilterSize; j++)
409
                        {
410
                                U += chrSrc[j][i] * chrFilter[j];
411
                                V += chrSrc[j][i+2048] * chrFilter[j];
412
                        }
413
                        Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
414
                        Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
415
                        U >>= 19;
416
                        V >>= 19;
417

    
418
                        Cb= clip_yuvtab_40cf[U+ 256];
419
                        Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
420
                        Cr= clip_yuvtab_3343[V+ 256];
421

    
422
                        ((uint16_t*)dest)[2*i] =
423
                                clip_table16b[(Y1 + Cb + ditherb1) >>13] |
424
                                clip_table16g[(Y1 + Cg + ditherg1) >>13] |
425
                                clip_table16r[(Y1 + Cr + ditherr1) >>13];
426

    
427
                        ((uint16_t*)dest)[2*i+1] =
428
                                clip_table16b[(Y2 + Cb + ditherb2) >>13] |
429
                                clip_table16g[(Y2 + Cg + ditherg2) >>13] |
430
                                clip_table16r[(Y2 + Cr + ditherr2) >>13];
431
                }
432
        }
433
        else if(dstFormat==IMGFMT_BGR15)
434
        {
435
                int i;
436
#ifdef DITHER1XBPP
437
                static int ditherb1=1<<14;
438
                static int ditherg1=1<<14;
439
                static int ditherr1=2<<14;
440
                static int ditherb2=3<<14;
441
                static int ditherg2=3<<14;
442
                static int ditherr2=0<<14;
443

    
444
                ditherb1 ^= (1^2)<<14;
445
                ditherg1 ^= (1^2)<<14;
446
                ditherr1 ^= (1^2)<<14;
447
                ditherb2 ^= (3^0)<<14;
448
                ditherg2 ^= (3^0)<<14;
449
                ditherr2 ^= (3^0)<<14;
450
#else
451
                const int ditherb1=0;
452
                const int ditherg1=0;
453
                const int ditherr1=0;
454
                const int ditherb2=0;
455
                const int ditherg2=0;
456
                const int ditherr2=0;
457
#endif
458
                for(i=0; i<(dstW>>1); i++){
459
                        int j;
460
                        int Y1=0;
461
                        int Y2=0;
462
                        int U=0;
463
                        int V=0;
464
                        int Cb, Cr, Cg;
465
                        for(j=0; j<lumFilterSize; j++)
466
                        {
467
                                Y1 += lumSrc[j][2*i] * lumFilter[j];
468
                                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
469
                        }
470
                        for(j=0; j<chrFilterSize; j++)
471
                        {
472
                                U += chrSrc[j][i] * chrFilter[j];
473
                                V += chrSrc[j][i+2048] * chrFilter[j];
474
                        }
475
                        Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
476
                        Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
477
                        U >>= 19;
478
                        V >>= 19;
479

    
480
                        Cb= clip_yuvtab_40cf[U+ 256];
481
                        Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
482
                        Cr= clip_yuvtab_3343[V+ 256];
483

    
484
                        ((uint16_t*)dest)[2*i] =
485
                                clip_table15b[(Y1 + Cb + ditherb1) >>13] |
486
                                clip_table15g[(Y1 + Cg + ditherg1) >>13] |
487
                                clip_table15r[(Y1 + Cr + ditherr1) >>13];
488

    
489
                        ((uint16_t*)dest)[2*i+1] =
490
                                clip_table15b[(Y2 + Cb + ditherb2) >>13] |
491
                                clip_table15g[(Y2 + Cg + ditherg2) >>13] |
492
                                clip_table15r[(Y2 + Cr + ditherr2) >>13];
493
                }
494
        }
495
}
496

    
497

    
498
/*
 * Decide which variants of swscale_template.c get compiled.
 * With RUNTIME_CPUDETECT every variant is built; otherwise exactly the one
 * matching the HAVE_* configuration is.
 */
//Note: we have C, X86, MMX, MMX2, 3DNOW versions, there's no 3DNOW+MMX2 one
//Plain C versions
#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
#define COMPILE_C
#endif

#ifdef CAN_COMPILE_X86_ASM

#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
#define COMPILE_MMX
#endif

#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
#define COMPILE_MMX2
#endif

#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
#define COMPILE_3DNOW
#endif
#endif //CAN_COMPILE_X86_ASM
518

    
519
/*
 * Instantiate swscale_template.c once per selected variant. Before each
 * inclusion the HAVE_* macros are set to describe that variant and RENAME()
 * is defined to append the variant suffix to the template's identifiers.
 */
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW

#ifdef COMPILE_C
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define RENAME(a) a ## _C
#include "swscale_template.c"
#endif

#ifdef CAN_COMPILE_X86_ASM

//X86 versions
/*
#undef RENAME
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _X86
#include "swscale_template.c"
*/
//MMX versions
#ifdef COMPILE_MMX
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define RENAME(a) a ## _MMX
#include "swscale_template.c"
#endif

//MMX2 versions
#ifdef COMPILE_MMX2
#undef RENAME
#define HAVE_MMX
#define HAVE_MMX2
#undef HAVE_3DNOW
#define RENAME(a) a ## _MMX2
#include "swscale_template.c"
#endif

//3DNOW versions
#ifdef COMPILE_3DNOW
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#define HAVE_3DNOW
#define RENAME(a) a ## _3DNow
#include "swscale_template.c"
#endif

#endif //CAN_COMPILE_X86_ASM

// minor note: the HAVE_xyz macros are messed up after this line, so don't use them
576

    
577

    
578
// old global scaler, dont use for new code
579
// will use sws_flags from the command line
580
void SwScale_YV12slice(unsigned char* src[], int srcStride[], int srcSliceY ,
581
                             int srcSliceH, uint8_t* dst[], int dstStride, int dstbpp,
582
                             int srcW, int srcH, int dstW, int dstH){
583

    
584
        static SwsContext *context=NULL;
585
        int dstFormat;
586
        int dstStride3[3]= {dstStride, dstStride>>1, dstStride>>1};
587

    
588
        switch(dstbpp)
589
        {
590
                case 8 : dstFormat= IMGFMT_Y8;                break;
591
                case 12: dstFormat= IMGFMT_YV12;        break;
592
                case 15: dstFormat= IMGFMT_BGR15;        break;
593
                case 16: dstFormat= IMGFMT_BGR16;        break;
594
                case 24: dstFormat= IMGFMT_BGR24;        break;
595
                case 32: dstFormat= IMGFMT_BGR32;        break;
596
                default: return;
597
        }
598

    
599
        if(!context) context=getSwsContextFromCmdLine(srcW, srcH, IMGFMT_YV12, dstW, dstH, dstFormat);
600

    
601
        context->swScale(context, src, srcStride, srcSliceY, srcSliceH, dst, dstStride3);
602
}
603

    
604
// will use sws_flags & src_filter (from cmd line)
605
SwsContext *getSwsContextFromCmdLine(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat)
606
{
607
        int flags=0;
608
        static int firstTime=1;
609

    
610
#ifdef ARCH_X86
611
        if(gCpuCaps.hasMMX)
612
                asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
613
#endif
614
        if(firstTime)
615
        {
616
                firstTime=0;
617
                flags= SWS_PRINT_INFO;
618
        }
619
        else if(verbose>1) flags= SWS_PRINT_INFO;
620

    
621
        if(src_filter.lumH) freeVec(src_filter.lumH);
622
        if(src_filter.lumV) freeVec(src_filter.lumV);
623
        if(src_filter.chrH) freeVec(src_filter.chrH);
624
        if(src_filter.chrV) freeVec(src_filter.chrV);
625

    
626
        if(sws_lum_gblur!=0.0){
627
                src_filter.lumH= getGaussianVec(sws_lum_gblur, 3.0);
628
                src_filter.lumV= getGaussianVec(sws_lum_gblur, 3.0);
629
        }else{
630
                src_filter.lumH= getIdentityVec();
631
                src_filter.lumV= getIdentityVec();
632
        }
633

    
634
        if(sws_chr_gblur!=0.0){
635
                src_filter.chrH= getGaussianVec(sws_chr_gblur, 3.0);
636
                src_filter.chrV= getGaussianVec(sws_chr_gblur, 3.0);
637
        }else{
638
                src_filter.chrH= getIdentityVec();
639
                src_filter.chrV= getIdentityVec();
640
        }
641

    
642
        if(sws_chr_sharpen!=0.0){
643
                SwsVector *g= getConstVec(-1.0, 3);
644
                SwsVector *id= getConstVec(10.0/sws_chr_sharpen, 1);
645
                g->coeff[1]=2.0;
646
                addVec(id, g);
647
                convVec(src_filter.chrH, id);
648
                convVec(src_filter.chrV, id);
649
                freeVec(g);
650
                freeVec(id);
651
        }
652

    
653
        if(sws_lum_sharpen!=0.0){
654
                SwsVector *g= getConstVec(-1.0, 3);
655
                SwsVector *id= getConstVec(10.0/sws_lum_sharpen, 1);
656
                g->coeff[1]=2.0;
657
                addVec(id, g);
658
                convVec(src_filter.lumH, id);
659
                convVec(src_filter.lumV, id);
660
                freeVec(g);
661
                freeVec(id);
662
        }
663

    
664
        if(sws_chr_hshift)
665
                shiftVec(src_filter.chrH, sws_chr_hshift);
666

    
667
        if(sws_chr_vshift)
668
                shiftVec(src_filter.chrV, sws_chr_vshift);
669

    
670
        normalizeVec(src_filter.chrH, 1.0);
671
        normalizeVec(src_filter.chrV, 1.0);
672
        normalizeVec(src_filter.lumH, 1.0);
673
        normalizeVec(src_filter.lumV, 1.0);
674

    
675
        if(verbose > 1) printVec(src_filter.chrH);
676
        if(verbose > 1) printVec(src_filter.lumH);
677

    
678
        switch(sws_flags)
679
        {
680
                case 0: flags|= SWS_FAST_BILINEAR; break;
681
                case 1: flags|= SWS_BILINEAR; break;
682
                case 2: flags|= SWS_BICUBIC; break;
683
                case 3: flags|= SWS_X; break;
684
                case 4: flags|= SWS_POINT; break;
685
                case 5: flags|= SWS_AREA; break;
686
                default:flags|= SWS_BILINEAR; break;
687
        }
688

    
689
        return getSwsContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat, flags, &src_filter, NULL);
690
}
691

    
692

    
693
static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
694
                              int srcW, int dstW, int filterAlign, int one, int flags,
695
                              SwsVector *srcFilter, SwsVector *dstFilter)
696
{
697
        int i;
698
        int filterSize;
699
        int filter2Size;
700
        int minFilterSize;
701
        double *filter=NULL;
702
        double *filter2=NULL;
703
#ifdef ARCH_X86
704
        if(gCpuCaps.hasMMX)
705
                asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
706
#endif
707

    
708
        // Note the +1 is for the MMXscaler which reads over the end
709
        *filterPos = (int16_t*)memalign(8, (dstW+1)*sizeof(int16_t));
710

    
711
        if(ABS(xInc - 0x10000) <10) // unscaled
712
        {
713
                int i;
714
                filterSize= 1;
715
                filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
716
                for(i=0; i<dstW*filterSize; i++) filter[i]=0;
717

    
718
                for(i=0; i<dstW; i++)
719
                {
720
                        filter[i*filterSize]=1;
721
                        (*filterPos)[i]=i;
722
                }
723

    
724
        }
725
        else if(flags&SWS_POINT) // lame looking point sampling mode
726
        {
727
                int i;
728
                int xDstInSrc;
729
                filterSize= 1;
730
                filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
731
                
732
                xDstInSrc= xInc/2 - 0x8000;
733
                for(i=0; i<dstW; i++)
734
                {
735
                        int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
736

    
737
                        (*filterPos)[i]= xx;
738
                        filter[i]= 1.0;
739
                        xDstInSrc+= xInc;
740
                }
741
        }
742
        else if(xInc <= (1<<16) || (flags&SWS_FAST_BILINEAR)) // upscale
743
        {
744
                int i;
745
                int xDstInSrc;
746
                if     (flags&SWS_BICUBIC) filterSize= 4;
747
                else if(flags&SWS_X      ) filterSize= 4;
748
                else                           filterSize= 2; // SWS_BILINEAR / SWS_AREA 
749
//                printf("%d %d %d\n", filterSize, srcW, dstW);
750
                filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
751

    
752
                xDstInSrc= xInc/2 - 0x8000;
753
                for(i=0; i<dstW; i++)
754
                {
755
                        int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
756
                        int j;
757

    
758
                        (*filterPos)[i]= xx;
759
                        if((flags & SWS_BICUBIC) || (flags & SWS_X))
760
                        {
761
                                double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
762
                                double y1,y2,y3,y4;
763
                                double A= -0.6;
764
                                if(flags & SWS_BICUBIC){
765
                                                // Equation is from VirtualDub
766
                                        y1 = (        +     A*d -       2.0*A*d*d +       A*d*d*d);
767
                                        y2 = (+ 1.0             -     (A+3.0)*d*d + (A+2.0)*d*d*d);
768
                                        y3 = (        -     A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d);
769
                                        y4 = (                  +           A*d*d -       A*d*d*d);
770
                                }else{
771
                                                // cubic interpolation (derived it myself)
772
                                        y1 = (    -2.0*d + 3.0*d*d - 1.0*d*d*d)/6.0;
773
                                        y2 = (6.0 -3.0*d - 6.0*d*d + 3.0*d*d*d)/6.0;
774
                                        y3 = (    +6.0*d + 3.0*d*d - 3.0*d*d*d)/6.0;
775
                                        y4 = (    -1.0*d           + 1.0*d*d*d)/6.0;
776
                                }
777

    
778
//                                printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
779
                                filter[i*filterSize + 0]= y1;
780
                                filter[i*filterSize + 1]= y2;
781
                                filter[i*filterSize + 2]= y3;
782
                                filter[i*filterSize + 3]= y4;
783
//                                printf("%1.3f %1.3f %1.3f %1.3f %1.3f\n",d , y1, y2, y3, y4);
784
                        }
785
                        else
786
                        {
787
                                //Bilinear upscale / linear interpolate / Area averaging
788
                                for(j=0; j<filterSize; j++)
789
                                {
790
                                        double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
791
                                        double coeff= 1.0 - d;
792
                                        if(coeff<0) coeff=0;
793
        //                                printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
794
                                        filter[i*filterSize + j]= coeff;
795
                                        xx++;
796
                                }
797
                        }
798
                        xDstInSrc+= xInc;
799
                }
800
        }
801
        else // downscale
802
        {
803
                int xDstInSrc;
804
                if(flags&SWS_BICUBIC)        filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
805
                else if(flags&SWS_X)        filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
806
                else if(flags&SWS_AREA)        filterSize= (int)ceil(1 + 1.0*srcW / (double)dstW);
807
                else /* BILINEAR */        filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW);
808
//                printf("%d %d %d\n", *filterSize, srcW, dstW);
809
                filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
810

    
811
                xDstInSrc= xInc/2 - 0x8000;
812
                for(i=0; i<dstW; i++)
813
                {
814
                        int xx= (int)((double)xDstInSrc/(double)(1<<16) - (filterSize-1)*0.5 + 0.5);
815
                        int j;
816
                        (*filterPos)[i]= xx;
817
                        for(j=0; j<filterSize; j++)
818
                        {
819
                                double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
820
                                double coeff;
821
                                if((flags & SWS_BICUBIC) || (flags & SWS_X))
822
                                {
823
                                        double A= -0.75;
824
//                                        d*=2;
825
                                        // Equation is from VirtualDub
826
                                        if(d<1.0)
827
                                                coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
828
                                        else if(d<2.0)
829
                                                coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
830
                                        else
831
                                                coeff=0.0;
832
                                }
833
                                else if(flags & SWS_AREA)
834
                                {
835
                                        double srcPixelSize= (1<<16)/(double)xInc;
836
                                        if(d + srcPixelSize/2 < 0.5) coeff= 1.0;
837
                                        else if(d - srcPixelSize/2 < 0.5) coeff= (0.5-d)/srcPixelSize + 0.5;
838
                                        else coeff=0.0;
839
                                }
840
                                else
841
                                {
842
                                        coeff= 1.0 - d;
843
                                        if(coeff<0) coeff=0;
844
                                }
845
//                                printf("%1.3f %2.3f %d \n", coeff, d, xDstInSrc);
846
                                filter[i*filterSize + j]= coeff;
847
                                xx++;
848
                        }
849
                        xDstInSrc+= xInc;
850
                }
851
        }
852

    
853
        /* apply src & dst Filter to filter -> filter2
854
           free(filter);
855
        */
856
        filter2Size= filterSize;
857
        if(srcFilter) filter2Size+= srcFilter->length - 1;
858
        if(dstFilter) filter2Size+= dstFilter->length - 1;
859
        filter2= (double*)memalign(8, filter2Size*dstW*sizeof(double));
860

    
861
        for(i=0; i<dstW; i++)
862
        {
863
                int j;
864
                SwsVector scaleFilter;
865
                SwsVector *outVec;
866

    
867
                scaleFilter.coeff= filter + i*filterSize;
868
                scaleFilter.length= filterSize;
869

    
870
                if(srcFilter) outVec= getConvVec(srcFilter, &scaleFilter);
871
                else              outVec= &scaleFilter;
872

    
873
                ASSERT(outVec->length == filter2Size)
874
                //FIXME dstFilter
875

    
876
                for(j=0; j<outVec->length; j++)
877
                {
878
                        filter2[i*filter2Size + j]= outVec->coeff[j];
879
                }
880

    
881
                (*filterPos)[i]+= (filterSize-1)/2 - (filter2Size-1)/2;
882

    
883
                if(outVec != &scaleFilter) freeVec(outVec);
884
        }
885
        free(filter); filter=NULL;
886

    
887
        /* try to reduce the filter-size (step1 find size and shift left) */
888
        // Assume its near normalized (*0.5 or *2.0 is ok but * 0.001 is not)
889
        minFilterSize= 0;
890
        for(i=dstW-1; i>=0; i--)
891
        {
892
                int min= filter2Size;
893
                int j;
894
                double cutOff=0.0;
895

    
896
                /* get rid off near zero elements on the left by shifting left */
897
                for(j=0; j<filter2Size; j++)
898
                {
899
                        int k;
900
                        cutOff += ABS(filter2[i*filter2Size]);
901

    
902
                        if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
903

    
904
                        /* preserve Monotonicity because the core cant handle the filter otherwise */
905
                        if(i<dstW-1 && (*filterPos)[i] >= (*filterPos)[i+1]) break;
906

    
907
                        // Move filter coeffs left
908
                        for(k=1; k<filter2Size; k++)
909
                                filter2[i*filter2Size + k - 1]= filter2[i*filter2Size + k];
910
                        filter2[i*filter2Size + k - 1]= 0.0;
911
                        (*filterPos)[i]++;
912
                }
913

    
914
                cutOff=0.0;
915
                /* count near zeros on the right */
916
                for(j=filter2Size-1; j>0; j--)
917
                {
918
                        cutOff += ABS(filter2[i*filter2Size + j]);
919

    
920
                        if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
921
                        min--;
922
                }
923

    
924
                if(min>minFilterSize) minFilterSize= min;
925
        }
926

    
927
        filterSize= (minFilterSize +(filterAlign-1)) & (~(filterAlign-1));
928
        filter= (double*)memalign(8, filterSize*dstW*sizeof(double));
929
        *outFilterSize= filterSize;
930

    
931
        if((flags&SWS_PRINT_INFO) && verbose)
932
                printf("SwScaler: reducing / aligning filtersize %d -> %d\n", filter2Size, filterSize);
933
        /* try to reduce the filter-size (step2 reduce it) */
934
        for(i=0; i<dstW; i++)
935
        {
936
                int j;
937

    
938
                for(j=0; j<filterSize; j++)
939
                {
940
                        if(j>=filter2Size) filter[i*filterSize + j]= 0.0;
941
                        else                   filter[i*filterSize + j]= filter2[i*filter2Size + j];
942
                }
943
        }
944
        free(filter2); filter2=NULL;
945
        
946
        ASSERT(filterSize > 0)
947

    
948
        //FIXME try to align filterpos if possible
949

    
950
        //fix borders
951
        for(i=0; i<dstW; i++)
952
        {
953
                int j;
954
                if((*filterPos)[i] < 0)
955
                {
956
                        // Move filter coeffs left to compensate for filterPos
957
                        for(j=1; j<filterSize; j++)
958
                        {
959
                                int left= MAX(j + (*filterPos)[i], 0);
960
                                filter[i*filterSize + left] += filter[i*filterSize + j];
961
                                filter[i*filterSize + j]=0;
962
                        }
963
                        (*filterPos)[i]= 0;
964
                }
965

    
966
                if((*filterPos)[i] + filterSize > srcW)
967
                {
968
                        int shift= (*filterPos)[i] + filterSize - srcW;
969
                        // Move filter coeffs right to compensate for filterPos
970
                        for(j=filterSize-2; j>=0; j--)
971
                        {
972
                                int right= MIN(j + shift, filterSize-1);
973
                                filter[i*filterSize +right] += filter[i*filterSize +j];
974
                                filter[i*filterSize +j]=0;
975
                        }
976
                        (*filterPos)[i]= srcW - filterSize;
977
                }
978
        }
979

    
980
        // Note the +1 is for the MMXscaler which reads over the end
981
        *outFilter= (int16_t*)memalign(8, *outFilterSize*(dstW+1)*sizeof(int16_t));
982
        memset(*outFilter, 0, *outFilterSize*(dstW+1)*sizeof(int16_t));
983

    
984
        /* Normalize & Store in outFilter */
985
        for(i=0; i<dstW; i++)
986
        {
987
                int j;
988
                double sum=0;
989
                double scale= one;
990
                for(j=0; j<filterSize; j++)
991
                {
992
                        sum+= filter[i*filterSize + j];
993
                }
994
                scale/= sum;
995
                for(j=0; j<filterSize; j++)
996
                {
997
                        (*outFilter)[i*(*outFilterSize) + j]= (int)(filter[i*filterSize + j]*scale);
998
                }
999
        }
1000
        
1001
        (*filterPos)[dstW]= (*filterPos)[dstW-1]; // the MMX scaler will read over the end
1002
        for(i=0; i<*outFilterSize; i++)
1003
        {
1004
                int j= dstW*(*outFilterSize);
1005
                (*outFilter)[j + i]= (*outFilter)[j + i - (*outFilterSize)];
1006
        }
1007

    
1008
        free(filter);
1009
}
1010

    
1011
#ifdef ARCH_X86
1012
/*
 * Generates the run-time horizontal scaler code ("funny code") into funnyCode.
 *
 *   dstW      - destination width; the loop below runs dstW/8 times and emits
 *               one code fragment on every 4th iteration
 *   xInc      - source step per iteration in 16.16 fixed point (xpos>>16 is
 *               used as the integer source position)
 *   funnyCode - output buffer receiving the copied and patched fragments; its
 *               required size is not visible here
 *               NOTE(review): confirm the buffer is large enough for
 *               (number of fragments)*fragmentLength + 1 bytes
 *
 * The asm statement is only used as a template: the fragment between labels
 * 0 and 9 is jumped over ("jmp 9f") and never executed from this function.
 * The code after label 9 merely reports the fragment address, its length and
 * the offsets of the two pshufw immediate bytes.
 */
static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
1013
{
1014
        uint8_t *fragment;
1015
        int imm8OfPShufW1;
1016
        int imm8OfPShufW2;
1017
        int fragmentLength;
1018

    
1019
        int xpos, i;
1020

    
1021
        // create an optimized horizontal scaling routine
1022

    
1023
        //code fragment
1024

    
1025
        asm volatile(
1026
                "jmp 9f                                \n\t"
1027
        // Begin
1028
                "0:                                \n\t"
1029
                "movq (%%esi), %%mm0                \n\t" //FIXME Alignment
1030
                "movq %%mm0, %%mm1                \n\t"
1031
                "psrlq $8, %%mm0                \n\t"
1032
                "punpcklbw %%mm7, %%mm1        \n\t"
1033
                "movq %%mm2, %%mm3                \n\t"
1034
                "punpcklbw %%mm7, %%mm0        \n\t"
1035
                "addw %%bx, %%cx                \n\t" //2*xalpha += (4*lumXInc)&0xFFFF
1036
                "pshufw $0xFF, %%mm1, %%mm1        \n\t"
1037
                // label 1 sits directly after the first pshufw, so (1b - 1) is its imm8 byte
                "1:                                \n\t"
1038
                "adcl %%edx, %%esi                \n\t" //xx+= (4*lumXInc)>>16 + carry
1039
                "pshufw $0xFF, %%mm0, %%mm0        \n\t"
1040
                // label 2 sits directly after the second pshufw, so (2b - 1) is its imm8 byte
                "2:                                \n\t"
1041
                "psrlw $9, %%mm3                \n\t"
1042
                "psubw %%mm1, %%mm0                \n\t"
1043
                "pmullw %%mm3, %%mm0                \n\t"
1044
                "paddw %%mm6, %%mm2                \n\t" // 2*alpha += xpos&0xFFFF
1045
                "psllw $7, %%mm1                \n\t"
1046
                "paddw %%mm1, %%mm0                \n\t"
1047

    
1048
                "movq %%mm0, (%%edi, %%eax)        \n\t"
1049

    
1050
                "addl $8, %%eax                        \n\t"
1051
        // End
1052
                "9:                                \n\t"
1053
//                "int $3\n\t"
1054
                // %0 = address of the fragment (label 0)
                "leal 0b, %0                        \n\t"
1055
                "leal 1b, %1                        \n\t"
1056
                "leal 2b, %2                        \n\t"
1057
                // step back one byte from each label, then make both relative to the fragment start
                "decl %1                        \n\t"
1058
                "decl %2                        \n\t"
1059
                "subl %0, %1                        \n\t"
1060
                "subl %0, %2                        \n\t"
1061
                // %3 = fragment length in bytes (label 9 - label 0)
                "leal 9b, %3                        \n\t"
1062
                "subl %0, %3                        \n\t"
1063
                :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
1064
                "=r" (fragmentLength)
1065
        );
1066

    
1067
        xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
1068

    
1069
        for(i=0; i<dstW/8; i++)
1070
        {
1071
                int xx=xpos>>16;
1072

    
1073
                // a new fragment is emitted only on every 4th iteration; xpos still advances every iteration
                if((i&3) == 0)
1074
                {
1075
                        // integer source offsets, relative to xx, of this and the next three steps
                        int a=0;
1076
                        int b=((xpos+xInc)>>16) - xx;
1077
                        int c=((xpos+xInc*2)>>16) - xx;
1078
                        int d=((xpos+xInc*3)>>16) - xx;
1079

    
1080
                        // fragment number i/4 is placed at byte offset fragmentLength*i/4
                        memcpy(funnyCode + fragmentLength*i/4, fragment, fragmentLength);
1081

    
1082
                        // both pshufw immediates get the four offsets packed as 2-bit fields
                        funnyCode[fragmentLength*i/4 + imm8OfPShufW1]=
1083
                        funnyCode[fragmentLength*i/4 + imm8OfPShufW2]=
1084
                                a | (b<<2) | (c<<4) | (d<<6);
1085

    
1086
                        // if we dont need to read 8 bytes than dont :), reduces the chance of
1087
                        // crossing a cache line
1088
                        // NOTE(review): byte 1 of the fragment belongs to the leading movq load;
                        // 0x6E presumably turns it into a 4-byte movd load - confirm against the opcode map
                        if(d<3) funnyCode[fragmentLength*i/4 + 1]= 0x6E;
1089

    
1090
                        // terminate the code right behind this fragment; the next fragment's memcpy overwrites it
                        // NOTE(review): RET is defined outside this chunk, presumably the x86 ret opcode - confirm
                        funnyCode[fragmentLength*(i+4)/4]= RET;
1091
                }
1092
                xpos+=xInc;
1093
        }
1094
}
1095
#endif // ARCH_X86
1096

    
1097
//FIXME remove
1098
/* Empty entry point: performs no work and touches no state. */
void SwScale_Init(void){
}
1100

    
1101
static void globalInit(){
1102
    // generating tables:
1103
    int i;
1104
    for(i=0; i<768; i++){
1105
        int c= MIN(MAX(i-256, 0), 255);
1106
        clip_table[i]=c;
1107
        yuvtab_2568[c]= clip_yuvtab_2568[i]=(0x2568*(c-16))+(256<<13);
1108
        yuvtab_3343[c]= clip_yuvtab_3343[i]=0x3343*(c-128);
1109
        yuvtab_0c92[c]= clip_yuvtab_0c92[i]=-0x0c92*(c-128);
1110
        yuvtab_1a1e[c]= clip_yuvtab_1a1e[i]=-0x1a1e*(c-128);
1111
        yuvtab_40cf[c]= clip_yuvtab_40cf[i]=0x40cf*(c-128);
1112
    }
1113

    
1114
    for(i=0; i<768; i++)
1115
    {
1116
        int v= clip_table[i];
1117
        clip_table16b[i]= le2me_16( v>>3);
1118
        clip_table16g[i]= le2me_16((v<<3)&0x07E0);
1119
        clip_table16r[i]= le2me_16((v<<8)&0xF800);
1120
        clip_table15b[i]= le2me_16( v>>3);
1121
        clip_table15g[i]= le2me_16((v<<2)&0x03E0);
1122
        clip_table15r[i]= le2me_16((v<<7)&0x7C00);
1123
    }
1124

    
1125
cpuCaps= gCpuCaps;
1126

    
1127
#ifdef RUNTIME_CPUDETECT
1128
#ifdef CAN_COMPILE_X86_ASM
1129
        // ordered per speed fasterst first
1130
        if(gCpuCaps.hasMMX2)
1131
                swScale= swScale_MMX2;
1132
        else if(gCpuCaps.has3DNow)
1133
                swScale= swScale_3DNow;
1134
        else if(gCpuCaps.hasMMX)
1135
                swScale= swScale_MMX;
1136
        else
1137
                swScale= swScale_C;
1138

    
1139
#else
1140
        swScale= swScale_C;
1141
        cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1142
#endif
1143
#else //RUNTIME_CPUDETECT
1144
#ifdef HAVE_MMX2
1145
        swScale= swScale_MMX2;
1146
        cpuCaps.has3DNow = 0;
1147
#elif defined (HAVE_3DNOW)
1148
        swScale= swScale_3DNow;
1149
        cpuCaps.hasMMX2 = 0;
1150
#elif defined (HAVE_MMX)
1151
        swScale= swScale_MMX;
1152
        cpuCaps.hasMMX2 = cpuCaps.has3DNow = 0;
1153
#else
1154
        swScale= swScale_C;
1155
        cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1156
#endif
1157
#endif //!RUNTIME_CPUDETECT
1158
}
1159

    
1160
/* Wrapper functions for yuv2bgr */
1161
/*
 * Unscaled planar YUV -> BGR conversion of one slice; a thin adapter around
 * yuv2rgb().
 *
 *   c          - context; srcFormat and srcW are read
 *   src        - source plane pointers
 *   srcStride  - source strides; [0] and [1] are forwarded
 *   srcSliceY  - first line of the slice, selects the output line
 *   srcSliceH  - number of lines in the slice
 *   dstParam   - destination pointers, only [0] is used
 *   dstStride  - destination strides, only [0] is used
 */
static void planarYuvToBgr(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dstParam[], int dstStride[]){
        /* YV12 hands src[1], src[2] through in order; every other format
           reaching this function (I420 & IYUV) gets them exchanged */
        const int isYV12= (c->srcFormat==IMGFMT_YV12);
        uint8_t *chromaFirst = isYV12 ? src[1] : src[2];
        uint8_t *chromaSecond= isYV12 ? src[2] : src[1];
        uint8_t *out= dstParam[0] + dstStride[0]*srcSliceY;

        yuv2rgb( out,src[0],chromaFirst,chromaSecond,c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
}
1170

    
1171
/*
 * Unscaled 24 bit -> 32 bit packed conversion of one slice via rgb24to32().
 * Output starts at line srcSliceY of dst[0]; srcSliceH lines are converted.
 */
static void bgr24to32Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
        uint8_t *in = src[0];
        uint8_t *out= dst[0] + dstStride[0]*srcSliceY;
        int line;

        /* strides in an exact 3:4 ratio: the whole slice is converted in a
           single call */
        if(dstStride[0]*3==srcStride[0]*4)
        {
                rgb24to32(in, out, srcSliceH*srcStride[0]);
                return;
        }

        /* otherwise go line by line, c->srcW*3 source bytes each */
        for(line=0; line<srcSliceH; line++)
        {
                rgb24to32(in, out, c->srcW*3);
                in += srcStride[0];
                out+= dstStride[0];
        }
}
1190

    
1191
/*
 * Unscaled 32 bit -> 24 bit packed conversion of one slice via rgb32to24().
 * Output starts at line srcSliceY of dst[0]; srcSliceH lines are converted.
 */
static void bgr32to24Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
        uint8_t *in = src[0];
        uint8_t *out= dst[0] + dstStride[0]*srcSliceY;
        int line;

        /* strides in an exact 4:3 ratio: the whole slice is converted in a
           single call */
        if(dstStride[0]*4==srcStride[0]*3)
        {
                rgb32to24(in, out, srcSliceH*srcStride[0]);
                return;
        }

        /* otherwise go line by line, c->srcW<<2 source bytes each */
        for(line=0; line<srcSliceH; line++)
        {
                rgb32to24(in, out, c->srcW<<2);
                in += srcStride[0];
                out+= dstStride[0];
        }
}
1210

    
1211
/*
 * Unscaled 15 bit -> 16 bit packed conversion of one slice via rgb15to16().
 * Output starts at line srcSliceY of dst[0]; srcSliceH lines are converted.
 */
static void bgr15to16Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
        uint8_t *in = src[0];
        uint8_t *out= dst[0] + dstStride[0]*srcSliceY;
        int line;

        /* identical strides: the whole slice is converted in a single call */
        if(dstStride[0]==srcStride[0])
        {
                rgb15to16(in, out, srcSliceH*srcStride[0]);
                return;
        }

        /* otherwise go line by line, c->srcW<<1 source bytes each */
        for(line=0; line<srcSliceH; line++)
        {
                rgb15to16(in, out, c->srcW<<1);
                in += srcStride[0];
                out+= dstStride[0];
        }
}
1230

    
1231
/*
 * Unscaled 24 bit packed -> planar conversion of one slice via rgb24toyv12().
 * The first destination plane is addressed at line srcSliceY, the other two
 * at line srcSliceY>>1.
 */
static void bgr24toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
        const int halfY= srcSliceY>>1;
        uint8_t *plane0= dst[0]+ srcSliceY*dstStride[0];
        uint8_t *plane1= dst[1]+ halfY    *dstStride[1];
        uint8_t *plane2= dst[2]+ halfY    *dstStride[2];

        /* only dstStride[0] and dstStride[1] are forwarded as strides */
        rgb24toyv12(src[0], plane0, plane1, plane2,
                c->srcW, srcSliceH,
                dstStride[0], dstStride[1], srcStride[0]);
}
1242

    
1243

    
1244
/* unscaled copy like stuff (assumes nearly identical formats) */
1245
/*
 * Unscaled copy of one slice between (nearly) identical formats.
 *
 *   c              - context; srcFormat, dstFormat and srcW are read
 *   srcParam       - source plane pointers in the caller's plane order
 *   srcStrideParam - source strides in the caller's plane order
 *   srcSliceY      - first line of the slice inside the picture
 *   srcSliceH      - number of lines in the slice
 *   dstParam       - destination plane pointers in the caller's plane order
 *   dstStride      - destination strides in the caller's plane order
 *
 * Planes are handled internally in YV12 order: for I420 planes 1 and 2 are
 * exchanged, together with their strides, on the source and on the
 * destination side.
 * Source planes that do not exist (planes 1 and 2 of packed / gray input, or
 * all planes of a format matched by none of the branches) stay NULL and are
 * skipped by the planar copy.
 * NOTE(review): when such a plane is skipped the matching destination plane
 * is left untouched - confirm no caller expects it to be filled.
 */
static void simpleCopy(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
             int srcSliceH, uint8_t* dstParam[], int dstStride[]){

        /* zero / NULL defaults so nothing is read uninitialised */
        int srcStride[3]= {0, 0, 0};
        uint8_t *src[3]= {NULL, NULL, NULL};
        uint8_t *dst[3];
        const int dstIsI420= (c->dstFormat == IMGFMT_I420);

        if(c->srcFormat == IMGFMT_I420){
                src[0]= srcParam[0];
                src[1]= srcParam[2];
                src[2]= srcParam[1];
                srcStride[0]= srcStrideParam[0];
                srcStride[1]= srcStrideParam[2];
                srcStride[2]= srcStrideParam[1];
        }
        else if(c->srcFormat==IMGFMT_YV12){
                src[0]= srcParam[0];
                src[1]= srcParam[1];
                src[2]= srcParam[2];
                srcStride[0]= srcStrideParam[0];
                srcStride[1]= srcStrideParam[1];
                srcStride[2]= srcStrideParam[2];
        }
        else if(isPacked(c->srcFormat) || isGray(c->srcFormat)){
                /* single plane input, planes 1 and 2 keep NULL / 0 */
                src[0]= srcParam[0];
                srcStride[0]= srcStrideParam[0];
        }

        dst[0]= dstParam[0];
        if(dstIsI420){
                dst[1]= dstParam[2];
                dst[2]= dstParam[1];
        }else{
                dst[1]= dstParam[1];
                dst[2]= dstParam[2];
        }

        if(isPacked(c->srcFormat))
        {
                if(dstStride[0]==srcStride[0])
                        memcpy(dst[0] + dstStride[0]*srcSliceY, src[0], srcSliceH*dstStride[0]);
                else
                {
                        int i;
                        uint8_t *srcPtr= src[0];
                        uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
                        int length=0;

                        /* universal length finder: largest multiple of srcW
                           that fits into both strides */
                        while(length+c->srcW <= dstStride[0]
                           && length+c->srcW <= srcStride[0]) length+= c->srcW;
                        ASSERT(length!=0);

                        for(i=0; i<srcSliceH; i++)
                        {
                                memcpy(dstPtr, srcPtr, length);
                                srcPtr+= srcStride[0];
                                dstPtr+= dstStride[0];
                        }
                }
        }
        else
        { /* Planar YUV */
                int plane;
                for(plane=0; plane<3; plane++)
                {
                        /* dst[1] / dst[2] were exchanged for I420, so the
                           stride has to come from the exchanged index too */
                        const int strideIdx= (dstIsI420 && plane>0) ? 3-plane : plane;
                        const int planeDstStride= dstStride[strideIdx];
                        /* planes 1 and 2 are half size in both directions, rounded up */
                        int length= plane==0 ? c->srcW  : ((c->srcW+1)>>1);
                        int y=      plane==0 ? srcSliceY: ((srcSliceY+1)>>1);
                        int height= plane==0 ? srcSliceH: ((srcSliceH+1)>>1);

                        /* nothing to copy from (e.g. chroma of gray input) */
                        if(src[plane]==NULL) continue;

                        if(planeDstStride==srcStride[plane])
                                memcpy(dst[plane] + planeDstStride*y, src[plane], height*planeDstStride);
                        else
                        {
                                int i;
                                uint8_t *srcPtr= src[plane];
                                uint8_t *dstPtr= dst[plane] + planeDstStride*y;
                                for(i=0; i<height; i++)
                                {
                                        memcpy(dstPtr, srcPtr, length);
                                        srcPtr+= srcStride[plane];
                                        dstPtr+= planeDstStride;
                                }
                        }
                }
        }
}
1338

    
1339
SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
1340
                         SwsFilter *srcFilter, SwsFilter *dstFilter){
1341

    
1342
        SwsContext *c;
1343
        int i;
1344
        int usesFilter;
1345
        SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
1346

    
1347
#ifdef ARCH_X86
1348
        if(gCpuCaps.hasMMX)
1349
                asm volatile("emms\n\t"::: "memory");
1350
#endif
1351

    
1352
        if(swScale==NULL) globalInit();
1353

    
1354
        /* avoid dupplicate Formats, so we dont need to check to much */
1355
        if(srcFormat==IMGFMT_IYUV) srcFormat=IMGFMT_I420;
1356
        if(srcFormat==IMGFMT_Y8)   srcFormat=IMGFMT_Y800;
1357
        if(dstFormat==IMGFMT_Y8)   dstFormat=IMGFMT_Y800;
1358

    
1359
        if(!isSupportedIn(srcFormat)) 
1360
        {
1361
                fprintf(stderr, "swScaler: %s is not supported as input format\n", vo_format_name(srcFormat));
1362
                return NULL;
1363
        }
1364
        if(!isSupportedOut(dstFormat))
1365
        {
1366
                fprintf(stderr, "swScaler: %s is not supported as output format\n", vo_format_name(dstFormat));
1367
                return NULL;
1368
        }
1369

    
1370
        /* sanity check */
1371
        if(srcW<4 || srcH<1 || dstW<8 || dstH<1) //FIXME check if these are enough and try to lowwer them after fixing the relevant parts of the code
1372
        {
1373
                fprintf(stderr, "swScaler: %dx%d -> %dx%d is invalid scaling dimension\n", 
1374
                        srcW, srcH, dstW, dstH);
1375
                return NULL;
1376
        }
1377

    
1378
        if(!dstFilter) dstFilter= &dummyFilter;
1379
        if(!srcFilter) srcFilter= &dummyFilter;
1380

    
1381
        c= memalign(64, sizeof(SwsContext));
1382
        memset(c, 0, sizeof(SwsContext));
1383

    
1384
        c->srcW= srcW;
1385
        c->srcH= srcH;
1386
        c->dstW= dstW;
1387
        c->dstH= dstH;
1388
        c->lumXInc= ((srcW<<16) + (dstW>>1))/dstW;
1389
        c->lumYInc= ((srcH<<16) + (dstH>>1))/dstH;
1390
        c->flags= flags;
1391
        c->dstFormat= dstFormat;
1392
        c->srcFormat= srcFormat;
1393

    
1394
        usesFilter=0;
1395
        if(dstFilter->lumV!=NULL && dstFilter->lumV->length>1) usesFilter=1;
1396
        if(dstFilter->lumH!=NULL && dstFilter->lumH->length>1) usesFilter=1;
1397
        if(dstFilter->chrV!=NULL && dstFilter->chrV->length>1) usesFilter=1;
1398
        if(dstFilter->chrH!=NULL && dstFilter->chrH->length>1) usesFilter=1;
1399
        if(srcFilter->lumV!=NULL && srcFilter->lumV->length>1) usesFilter=1;
1400
        if(srcFilter->lumH!=NULL && srcFilter->lumH->length>1) usesFilter=1;
1401
        if(srcFilter->chrV!=NULL && srcFilter->chrV->length>1) usesFilter=1;
1402
        if(srcFilter->chrH!=NULL && srcFilter->chrH->length>1) usesFilter=1;
1403
        
1404
        /* unscaled special Cases */
1405
        if(srcW==dstW && srcH==dstH && !usesFilter)
1406
        {
1407
                /* yuv2bgr */
1408
                if(isPlanarYUV(srcFormat) && isBGR(dstFormat))
1409
                {
1410
                        // FIXME multiple yuv2rgb converters wont work that way cuz that thing is full of globals&statics
1411
                        yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
1412
                        c->swScale= planarYuvToBgr;
1413

    
1414
                        if(flags&SWS_PRINT_INFO)
1415
                                printf("SwScaler: using unscaled %s -> %s special converter\n", 
1416
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1417
                        return c;
1418
                }
1419

    
1420
                /* simple copy */
1421
                if(srcFormat == dstFormat || (isPlanarYUV(srcFormat) && isPlanarYUV(dstFormat)))
1422
                {
1423
                        c->swScale= simpleCopy;
1424

    
1425
                        if(flags&SWS_PRINT_INFO)
1426
                                printf("SwScaler: using unscaled %s -> %s special converter\n", 
1427
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1428
                        return c;
1429
                }
1430
                
1431
                /* bgr32to24 & rgb32to24*/
1432
                if((srcFormat==IMGFMT_BGR32 && dstFormat==IMGFMT_BGR24)
1433
                 ||(srcFormat==IMGFMT_RGB32 && dstFormat==IMGFMT_RGB24))
1434
                {
1435
                        c->swScale= bgr32to24Wrapper;
1436

    
1437
                        if(flags&SWS_PRINT_INFO)
1438
                                printf("SwScaler: using unscaled %s -> %s special converter\n", 
1439
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1440
                        return c;
1441
                }
1442
                
1443
                /* bgr24to32 & rgb24to32*/
1444
                if((srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_BGR32)
1445
                 ||(srcFormat==IMGFMT_RGB24 && dstFormat==IMGFMT_RGB32))
1446
                {
1447
                        c->swScale= bgr24to32Wrapper;
1448

    
1449
                        if(flags&SWS_PRINT_INFO)
1450
                                printf("SwScaler: using unscaled %s -> %s special converter\n", 
1451
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1452
                        return c;
1453
                }
1454

    
1455
                /* bgr15to16 */
1456
                if(srcFormat==IMGFMT_BGR15 && dstFormat==IMGFMT_BGR16)
1457
                {
1458
                        c->swScale= bgr15to16Wrapper;
1459

    
1460
                        if(flags&SWS_PRINT_INFO)
1461
                                printf("SwScaler: using unscaled %s -> %s special converter\n", 
1462
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1463
                        return c;
1464
                }
1465

    
1466
                /* bgr24toYV12 */
1467
                if(srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_YV12)
1468
                {
1469
                        c->swScale= bgr24toyv12Wrapper;
1470

    
1471
                        if(flags&SWS_PRINT_INFO)
1472
                                printf("SwScaler: using unscaled %s -> %s special converter\n", 
1473
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1474
                        return c;
1475
                }
1476
        }
1477

    
1478
        if(cpuCaps.hasMMX2)
1479
        {
1480
                c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
1481
                if(!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR))
1482
                {
1483
                        if(flags&SWS_PRINT_INFO)
1484
                                fprintf(stderr, "SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
1485
                }
1486
        }
1487
        else
1488
                c->canMMX2BeUsed=0;
1489

    
1490

    
1491
        /* dont use full vertical UV input/internaly if the source doesnt even have it */
1492
        if(isHalfChrV(srcFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_V);
1493
        /* dont use full horizontal UV input if the source doesnt even have it */
1494
        if(isHalfChrH(srcFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_H_INP);
1495
        /* dont use full horizontal UV internally if the destination doesnt even have it */
1496
        if(isHalfChrH(dstFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_H_INT);
1497

    
1498
        if(flags&SWS_FULL_CHR_H_INP)        c->chrSrcW= srcW;
1499
        else                                c->chrSrcW= (srcW+1)>>1;
1500

    
1501
        if(flags&SWS_FULL_CHR_H_INT)        c->chrDstW= dstW;
1502
        else                                c->chrDstW= (dstW+1)>>1;
1503

    
1504
        if(flags&SWS_FULL_CHR_V)        c->chrSrcH= srcH;
1505
        else                                c->chrSrcH= (srcH+1)>>1;
1506

    
1507
        if(isHalfChrV(dstFormat))        c->chrDstH= (dstH+1)>>1;
1508
        else                                c->chrDstH= dstH;
1509

    
1510
        c->chrXInc= ((c->chrSrcW<<16) + (c->chrDstW>>1))/c->chrDstW;
1511
        c->chrYInc= ((c->chrSrcH<<16) + (c->chrDstH>>1))/c->chrDstH;
1512

    
1513

    
1514
        // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
1515
        // but only for the FAST_BILINEAR mode otherwise do correct scaling
1516
        // n-2 is the last chrominance sample available
1517
        // this is not perfect, but noone shuld notice the difference, the more correct variant
1518
        // would be like the vertical one, but that would require some special code for the
1519
        // first and last pixel
1520
        if(flags&SWS_FAST_BILINEAR)
1521
        {
1522
                if(c->canMMX2BeUsed)
1523
                {
1524
                        c->lumXInc+= 20;
1525
                        c->chrXInc+= 20;
1526
                }
1527
                //we dont use the x86asm scaler if mmx is available
1528
                else if(cpuCaps.hasMMX)
1529
                {
1530
                        c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
1531
                        c->chrXInc = ((c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20;
1532
                }
1533
        }
1534

    
1535
        /* precalculate horizontal scaler filter coefficients */
1536
        {
1537
                const int filterAlign= cpuCaps.hasMMX ? 4 : 1;
1538

    
1539
                initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
1540
                                 srcW      ,       dstW, filterAlign, 1<<14, flags,
1541
                                 srcFilter->lumH, dstFilter->lumH);
1542
                initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
1543
                                (srcW+1)>>1, c->chrDstW, filterAlign, 1<<14, flags,
1544
                                 srcFilter->chrH, dstFilter->chrH);
1545

    
1546
#ifdef ARCH_X86
1547
// cant downscale !!!
1548
                if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
1549
                {
1550
                        initMMX2HScaler(      dstW, c->lumXInc, c->funnyYCode);
1551
                        initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode);
1552
                }
1553
#endif
1554
        } // Init Horizontal stuff
1555

    
1556

    
1557

    
1558
        /* precalculate vertical scaler filter coefficients */
1559
        initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
1560
                        srcH      ,        dstH, 1, (1<<12)-4, flags,
1561
                        srcFilter->lumV, dstFilter->lumV);
1562
        initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
1563
                        (srcH+1)>>1, c->chrDstH, 1, (1<<12)-4, flags,
1564
                         srcFilter->chrV, dstFilter->chrV);
1565

    
1566
        // Calculate Buffer Sizes so that they wont run out while handling these damn slices
1567
        c->vLumBufSize= c->vLumFilterSize;
1568
        c->vChrBufSize= c->vChrFilterSize;
1569
        for(i=0; i<dstH; i++)
1570
        {
1571
                int chrI= i*c->chrDstH / dstH;
1572
                int nextSlice= MAX(c->vLumFilterPos[i   ] + c->vLumFilterSize - 1,
1573
                                 ((c->vChrFilterPos[chrI] + c->vChrFilterSize - 1)<<1));
1574
                nextSlice&= ~1; // Slices start at even boundaries
1575
                if(c->vLumFilterPos[i   ] + c->vLumBufSize < nextSlice)
1576
                        c->vLumBufSize= nextSlice - c->vLumFilterPos[i   ];
1577
                if(c->vChrFilterPos[chrI] + c->vChrBufSize < (nextSlice>>1))
1578
                        c->vChrBufSize= (nextSlice>>1) - c->vChrFilterPos[chrI];
1579
        }
1580

    
1581
        // allocate pixbufs (we use dynamic allocation because otherwise we would need to
1582
        c->lumPixBuf= (int16_t**)memalign(4, c->vLumBufSize*2*sizeof(int16_t*));
1583
        c->chrPixBuf= (int16_t**)memalign(4, c->vChrBufSize*2*sizeof(int16_t*));
1584
        //Note we need at least one pixel more at the end because of the mmx code (just in case someone wanna replace the 4000/8000)
1585
        for(i=0; i<c->vLumBufSize; i++)
1586
                c->lumPixBuf[i]= c->lumPixBuf[i+c->vLumBufSize]= (uint16_t*)memalign(8, 4000);
1587
        for(i=0; i<c->vChrBufSize; i++)
1588
                c->chrPixBuf[i]= c->chrPixBuf[i+c->vChrBufSize]= (uint16_t*)memalign(8, 8000);
1589

    
1590
        //try to avoid drawing green stuff between the right end and the stride end
1591
        for(i=0; i<c->vLumBufSize; i++) memset(c->lumPixBuf[i], 0, 4000);
1592
        for(i=0; i<c->vChrBufSize; i++) memset(c->chrPixBuf[i], 64, 8000);
1593

    
1594
        ASSERT(c->chrDstH <= dstH)
1595

    
1596
        // pack filter data for mmx code
1597
        if(cpuCaps.hasMMX)
1598
        {
1599
                c->lumMmxFilter= (int16_t*)memalign(8, c->vLumFilterSize*      dstH*4*sizeof(int16_t));
1600
                c->chrMmxFilter= (int16_t*)memalign(8, c->vChrFilterSize*c->chrDstH*4*sizeof(int16_t));
1601
                for(i=0; i<c->vLumFilterSize*dstH; i++)
1602
                        c->lumMmxFilter[4*i]=c->lumMmxFilter[4*i+1]=c->lumMmxFilter[4*i+2]=c->lumMmxFilter[4*i+3]=
1603
                                c->vLumFilter[i];
1604
                for(i=0; i<c->vChrFilterSize*c->chrDstH; i++)
1605
                        c->chrMmxFilter[4*i]=c->chrMmxFilter[4*i+1]=c->chrMmxFilter[4*i+2]=c->chrMmxFilter[4*i+3]=
1606
                                c->vChrFilter[i];
1607
        }
1608

    
1609
        if(flags&SWS_PRINT_INFO)
1610
        {
1611
#ifdef DITHER1XBPP
1612
                char *dither= " dithered";
1613
#else
1614
                char *dither= "";
1615
#endif
1616
                if(flags&SWS_FAST_BILINEAR)
1617
                        fprintf(stderr, "\nSwScaler: FAST_BILINEAR scaler, ");
1618
                else if(flags&SWS_BILINEAR)
1619
                        fprintf(stderr, "\nSwScaler: BILINEAR scaler, ");
1620
                else if(flags&SWS_BICUBIC)
1621
                        fprintf(stderr, "\nSwScaler: BICUBIC scaler, ");
1622
                else if(flags&SWS_X)
1623
                        fprintf(stderr, "\nSwScaler: Experimental scaler, ");
1624
                else if(flags&SWS_POINT)
1625
                        fprintf(stderr, "\nSwScaler: Nearest Neighbor / POINT scaler, ");
1626
                else if(flags&SWS_AREA)
1627
                        fprintf(stderr, "\nSwScaler: Area Averageing scaler, ");
1628
                else
1629
                        fprintf(stderr, "\nSwScaler: ehh flags invalid?! ");
1630

    
1631
                if(dstFormat==IMGFMT_BGR15 || dstFormat==IMGFMT_BGR16)
1632
                        fprintf(stderr, "from %s to%s %s ", 
1633
                                vo_format_name(srcFormat), dither, vo_format_name(dstFormat));
1634
                else
1635
                        fprintf(stderr, "from %s to %s ", 
1636
                                vo_format_name(srcFormat), vo_format_name(dstFormat));
1637

    
1638
                if(cpuCaps.hasMMX2)
1639
                        fprintf(stderr, "using MMX2\n");
1640
                else if(cpuCaps.has3DNow)
1641
                        fprintf(stderr, "using 3DNOW\n");
1642
                else if(cpuCaps.hasMMX)
1643
                        fprintf(stderr, "using MMX\n");
1644
                else
1645
                        fprintf(stderr, "using C\n");
1646
        }
1647

    
1648
        if((flags & SWS_PRINT_INFO) && verbose)
1649
        {
1650
                if(cpuCaps.hasMMX)
1651
                {
1652
                        if(c->canMMX2BeUsed && (flags&SWS_FAST_BILINEAR))
1653
                                printf("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
1654
                        else
1655
                        {
1656
                                if(c->hLumFilterSize==4)
1657
                                        printf("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
1658
                                else if(c->hLumFilterSize==8)
1659
                                        printf("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
1660
                                else
1661
                                        printf("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
1662

    
1663
                                if(c->hChrFilterSize==4)
1664
                                        printf("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
1665
                                else if(c->hChrFilterSize==8)
1666
                                        printf("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
1667
                                else
1668
                                        printf("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
1669
                        }
1670
                }
1671
                else
1672
                {
1673
#ifdef ARCH_X86
1674
                        printf("SwScaler: using X86-Asm scaler for horizontal scaling\n");
1675
#else
1676
                        if(flags & SWS_FAST_BILINEAR)
1677
                                printf("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
1678
                        else
1679
                                printf("SwScaler: using C scaler for horizontal scaling\n");
1680
#endif
1681
                }
1682
                if(isPlanarYUV(dstFormat))
1683
                {
1684
                        if(c->vLumFilterSize==1)
1685
                                printf("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
1686
                        else
1687
                                printf("SwScaler: using n-tap %s scaler for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
1688
                }
1689
                else
1690
                {
1691
                        if(c->vLumFilterSize==1 && c->vChrFilterSize==2)
1692
                                printf("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
1693
                                       "SwScaler:       2-tap scaler for vertical chrominance scaling (BGR)\n",cpuCaps.hasMMX ? "MMX" : "C");
1694
                        else if(c->vLumFilterSize==2 && c->vChrFilterSize==2)
1695
                                printf("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
1696
                        else
1697
                                printf("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
1698
                }
1699

    
1700
                if(dstFormat==IMGFMT_BGR24)
1701
                        printf("SwScaler: using %s YV12->BGR24 Converter\n",
1702
                                cpuCaps.hasMMX2 ? "MMX2" : (cpuCaps.hasMMX ? "MMX" : "C"));
1703
                else if(dstFormat==IMGFMT_BGR32)
1704
                        printf("SwScaler: using %s YV12->BGR32 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1705
                else if(dstFormat==IMGFMT_BGR16)
1706
                        printf("SwScaler: using %s YV12->BGR16 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1707
                else if(dstFormat==IMGFMT_BGR15)
1708
                        printf("SwScaler: using %s YV12->BGR15 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1709

    
1710
                printf("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
1711
        }
1712
        if((flags & SWS_PRINT_INFO) && verbose>1)
1713
        {
1714
                printf("SwScaler:Lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
1715
                        c->srcW, c->srcH, c->dstW, c->dstH, c->lumXInc, c->lumYInc);
1716
                printf("SwScaler:Chr srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
1717
                        c->chrSrcW, c->chrSrcH, c->chrDstW, c->chrDstH, c->chrXInc, c->chrYInc);
1718
        }
1719

    
1720
        c->swScale= swScale;
1721
        return c;
1722
}
1723

    
1724
/**
1725
 * returns a normalized gaussian curve used to filter stuff
1726
 * quality=3 is high quality, lowwer is lowwer quality
1727
 */
1728

    
1729
SwsVector *getGaussianVec(double variance, double quality){
1730
        const int length= (int)(variance*quality + 0.5) | 1;
1731
        int i;
1732
        double *coeff= memalign(sizeof(double), length*sizeof(double));
1733
        double middle= (length-1)*0.5;
1734
        SwsVector *vec= malloc(sizeof(SwsVector));
1735

    
1736
        vec->coeff= coeff;
1737
        vec->length= length;
1738

    
1739
        for(i=0; i<length; i++)
1740
        {
1741
                double dist= i-middle;
1742
                coeff[i]= exp( -dist*dist/(2*variance*variance) ) / sqrt(2*variance*PI);
1743
        }
1744

    
1745
        normalizeVec(vec, 1.0);
1746

    
1747
        return vec;
1748
}
1749

    
1750
SwsVector *getConstVec(double c, int length){
1751
        int i;
1752
        double *coeff= memalign(sizeof(double), length*sizeof(double));
1753
        SwsVector *vec= malloc(sizeof(SwsVector));
1754

    
1755
        vec->coeff= coeff;
1756
        vec->length= length;
1757

    
1758
        for(i=0; i<length; i++)
1759
                coeff[i]= c;
1760

    
1761
        return vec;
1762
}
1763

    
1764

    
1765
SwsVector *getIdentityVec(void){
1766
        double *coeff= memalign(sizeof(double), sizeof(double));
1767
        SwsVector *vec= malloc(sizeof(SwsVector));
1768
        coeff[0]= 1.0;
1769

    
1770
        vec->coeff= coeff;
1771
        vec->length= 1;
1772

    
1773
        return vec;
1774
}
1775

    
1776
/**
 * Rescales the coefficients of "a" in place so that they sum up to "height".
 *
 * Every coefficient is multiplied by height/sum, where sum is the current sum
 * of all coefficients. (The previous code computed that factor but then
 * multiplied by "height" itself, so nothing was actually normalized.)
 *
 * Note: a non-empty vector whose coefficients sum to 0 yields a division by
 * zero in floating point, i.e. infinite / NaN coefficients; this is not
 * guarded against.
 */
void normalizeVec(SwsVector *a, double height){
        int i;
        double sum=0;
        double inv;

        for(i=0; i<a->length; i++)
                sum+= a->coeff[i];

        // factor that maps the current sum onto the requested one
        inv= height/sum;

        for(i=0; i<a->length; i++)
                a->coeff[i]*= inv;
}
1789

    
1790
/**
 * Multiplies every coefficient of "a" by "scalar", in place.
 */
void scaleVec(SwsVector *a, double scalar){
        int k= 0;

        while(k < a->length)
        {
                a->coeff[k]*= scalar;
                k++;
        }
}
1796

    
1797
/**
 * Returns a newly allocated vector holding the discrete convolution of "a"
 * and "b"; its length is a->length + b->length - 1.
 * Neither input is modified. The allocations are not checked.
 */
static SwsVector *getConvVec(SwsVector *a, SwsVector *b){
        const int outLen= a->length + b->length - 1;
        double *out= memalign(sizeof(double), outLen*sizeof(double));
        SwsVector *vec= malloc(sizeof(SwsVector));
        int ia, ib, k;

        vec->length= outLen;
        vec->coeff= out;

        for(k=0; k<outLen; k++)
                out[k]= 0.0;

        // scatter every product a[ia]*b[ib] onto output tap ia+ib
        for(ia=0; ia<a->length; ia++)
        {
                const double left= a->coeff[ia];

                for(ib=0; ib<b->length; ib++)
                        out[ia+ib]+= left*b->coeff[ib];
        }

        return vec;
}
1818

    
1819
/**
 * Returns a newly allocated vector holding a + b.
 * The result is as long as the longer input; each input is placed so that
 * its middle tap lines up with the middle tap of the result.
 * Neither input is modified. The allocations are not checked.
 */
static SwsVector *sumVec(SwsVector *a, SwsVector *b){
        const int outLen= MAX(a->length, b->length);
        double *out= memalign(sizeof(double), outLen*sizeof(double));
        SwsVector *vec= malloc(sizeof(SwsVector));
        // index of the first tap of each input inside the result
        const int aStart= (outLen-1)/2 - (a->length-1)/2;
        const int bStart= (outLen-1)/2 - (b->length-1)/2;
        int k;

        vec->length= outLen;
        vec->coeff= out;

        for(k=0; k<outLen; k++)
                out[k]= 0.0;

        for(k=0; k<a->length; k++)
                out[k + aStart]+= a->coeff[k];
        for(k=0; k<b->length; k++)
                out[k + bStart]+= b->coeff[k];

        return vec;
}
1835

    
1836
/**
 * Returns a newly allocated vector holding a - b.
 * The result is as long as the longer input; each input is placed so that
 * its middle tap lines up with the middle tap of the result.
 * Neither input is modified. The allocations are not checked.
 */
static SwsVector *diffVec(SwsVector *a, SwsVector *b){
        const int outLen= MAX(a->length, b->length);
        double *out= memalign(sizeof(double), outLen*sizeof(double));
        SwsVector *vec= malloc(sizeof(SwsVector));
        // index of the first tap of each input inside the result
        const int aStart= (outLen-1)/2 - (a->length-1)/2;
        const int bStart= (outLen-1)/2 - (b->length-1)/2;
        int k;

        vec->length= outLen;
        vec->coeff= out;

        for(k=0; k<outLen; k++)
                out[k]= 0.0;

        for(k=0; k<a->length; k++)
                out[k + aStart]+= a->coeff[k];
        for(k=0; k<b->length; k++)
                out[k + bStart]-= b->coeff[k];

        return vec;
}
1852

    
1853
/* shift left / or right if "shift" is negative */
1854
static SwsVector *getShiftedVec(SwsVector *a, int shift){
1855
        int length= a->length + ABS(shift)*2;
1856
        double *coeff= memalign(sizeof(double), length*sizeof(double));
1857
        int i;
1858
        SwsVector *vec= malloc(sizeof(SwsVector));
1859

    
1860
        vec->coeff= coeff;
1861
        vec->length= length;
1862

    
1863
        for(i=0; i<length; i++) coeff[i]= 0.0;
1864

    
1865
        for(i=0; i<a->length; i++)
1866
        {
1867
                coeff[i + (length-1)/2 - (a->length-1)/2 - shift]= a->coeff[i];
1868
        }
1869

    
1870
        return vec;
1871
}
1872

    
1873
/**
 * Shifts "a" in place by "shift" taps (see getShiftedVec()).
 * The old coefficient array is freed and replaced by the shifted one,
 * so a->coeff and a->length both change.
 */
void shiftVec(SwsVector *a, int shift){
        SwsVector *tmp= getShiftedVec(a, shift);
        double *old= a->coeff;

        // take over the new array, then drop the temporary shell and the old data
        a->length= tmp->length;
        a->coeff= tmp->coeff;
        free(tmp);
        free(old);
}
1880

    
1881
/**
 * Adds "b" to "a" in place (a = a + b, see sumVec()).
 * The old coefficient array of "a" is freed and replaced, so a->coeff and
 * a->length may both change. "b" is left untouched.
 */
void addVec(SwsVector *a, SwsVector *b){
        SwsVector *tmp= sumVec(a, b);
        double *old= a->coeff;

        // take over the new array, then drop the temporary shell and the old data
        a->length= tmp->length;
        a->coeff= tmp->coeff;
        free(tmp);
        free(old);
}
1888

    
1889
/**
 * Subtracts "b" from "a" in place (a = a - b, see diffVec()).
 * The old coefficient array of "a" is freed and replaced, so a->coeff and
 * a->length may both change. "b" is left untouched.
 */
void subVec(SwsVector *a, SwsVector *b){
        SwsVector *tmp= diffVec(a, b);
        double *old= a->coeff;

        // take over the new array, then drop the temporary shell and the old data
        a->length= tmp->length;
        a->coeff= tmp->coeff;
        free(tmp);
        free(old);
}
1896

    
1897
/**
 * Convolves "a" with "b" in place (see getConvVec()).
 * The old coefficient array of "a" is freed and replaced, so a->coeff and
 * a->length both change. "b" is left untouched.
 */
void convVec(SwsVector *a, SwsVector *b){
        SwsVector *tmp= getConvVec(a, b);
        double *old= a->coeff;

        // take over the new array, then drop the temporary shell and the old data
        a->length= tmp->length;
        a->coeff= tmp->coeff;
        free(tmp);
        free(old);
}
1904

    
1905
/**
 * Returns a newly allocated copy of "a" with its own coefficient array.
 * The allocations are not checked; release the result with freeVec().
 */
SwsVector *cloneVec(SwsVector *a){
        const int count= a->length;
        double *copy= memalign(sizeof(double), count*sizeof(double));
        SwsVector *vec= malloc(sizeof(SwsVector));
        int k= 0;

        vec->length= count;
        vec->coeff= copy;

        while(k < count)
        {
                copy[k]= a->coeff[k];
                k++;
        }

        return vec;
}
1917

    
1918
/**
 * Prints the coefficients of "a" to stdout, one per line, each followed by a
 * '|' whose column (0..60 spaces of padding) shows the value relative to the
 * smallest and largest coefficient. 0 is always part of that range, as both
 * extremes start out at 0.
 *
 * If all coefficients are 0 the range is 0 as well; the bar is then printed
 * without padding instead of dividing by zero (which produced a NaN whose
 * conversion to int is undefined).
 */
void printVec(SwsVector *a){
        int i;
        double max=0;
        double min=0;
        double range;

        for(i=0; i<a->length; i++)
        {
                if(a->coeff[i]>max) max= a->coeff[i];
                if(a->coeff[i]<min) min= a->coeff[i];
        }

        range= max - min;

        for(i=0; i<a->length; i++)
        {
                int x= 0;

                // only scale when there is something to scale against
                if(range > 0.0)
                        x= (int)((a->coeff[i]-min)*60.0/range +0.5);

                printf("%1.3f ", a->coeff[i]);
                for(;x>0; x--) printf(" ");
                printf("|\n");
        }
}
1940

    
1941
/**
 * Frees a vector together with its coefficient array.
 * Passing NULL is allowed and does nothing.
 */
void freeVec(SwsVector *a){
        if(a == NULL) return;

        free(a->coeff); // free(NULL) is a no-op, no extra check needed
        a->length= 0;
        a->coeff= NULL;
        free(a);
}
1948

    
1949
/**
 * Frees a scaler context and all buffers hanging off it.
 * Passing NULL is allowed and does nothing.
 *
 * The pixel buffer tables hold 2*v{Lum,Chr}BufSize pointers, but entry
 * i+size is set up as an alias of entry i when the context is created,
 * so only the first half of each table is walked and freed here.
 */
void freeSwsContext(SwsContext *c){
        int row;

        if(c == NULL) return;

        if(c->lumPixBuf != NULL)
        {
                for(row=0; row<c->vLumBufSize; row++)
                {
                        free(c->lumPixBuf[row]);
                        c->lumPixBuf[row]= NULL;
                }
                free(c->lumPixBuf);
                c->lumPixBuf= NULL;
        }

        if(c->chrPixBuf != NULL)
        {
                for(row=0; row<c->vChrBufSize; row++)
                {
                        free(c->chrPixBuf[row]);
                        c->chrPixBuf[row]= NULL;
                }
                free(c->chrPixBuf);
                c->chrPixBuf= NULL;
        }

        // filter coefficients (free(NULL) is a no-op, so no checks are needed)
        free(c->vLumFilter);
        c->vLumFilter= NULL;
        free(c->vChrFilter);
        c->vChrFilter= NULL;
        free(c->hLumFilter);
        c->hLumFilter= NULL;
        free(c->hChrFilter);
        c->hChrFilter= NULL;

        // filter positions
        free(c->vLumFilterPos);
        c->vLumFilterPos= NULL;
        free(c->vChrFilterPos);
        c->vChrFilterPos= NULL;
        free(c->hLumFilterPos);
        c->hLumFilterPos= NULL;
        free(c->hChrFilterPos);
        c->hChrFilterPos= NULL;

        // vertical filters repacked for the mmx code
        free(c->lumMmxFilter);
        c->lumMmxFilter= NULL;
        free(c->chrMmxFilter);
        c->chrMmxFilter= NULL;

        free(c);
}
2001

    
2002