Statistics
| Branch: | Revision:

ffmpeg / postproc / swscale.c @ c7a810cc

History | View | Annotate | Download (70.8 KB)

1
/*
2
    Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
3

4
    This program is free software; you can redistribute it and/or modify
5
    it under the terms of the GNU General Public License as published by
6
    the Free Software Foundation; either version 2 of the License, or
7
    (at your option) any later version.
8

9
    This program is distributed in the hope that it will be useful,
10
    but WITHOUT ANY WARRANTY; without even the implied warranty of
11
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
    GNU General Public License for more details.
13

14
    You should have received a copy of the GNU General Public License
15
    along with this program; if not, write to the Free Software
16
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17
*/
18

    
19
/*
20
  supported Input formats: YV12, I420, IYUV, YUY2, BGR32, BGR24, BGR16, BGR15, RGB32, RGB24, Y8, Y800
21
  supported output formats: YV12, I420, IYUV, BGR15, BGR16, BGR24, BGR32, Y8, Y800
22
  BGR15/16 support dithering
23
  
24
  unscaled special converters
25
  YV12/I420/IYUV -> BGR15/BGR16/BGR24/BGR32
26
  YV12/I420/IYUV -> YV12/I420/IYUV
27
  YUY2/BGR15/BGR16/BGR24/BGR32/RGB24/RGB32 -> same format
28
  BGR24 -> BGR32 & RGB24 -> RGB32
29
  BGR32 -> BGR24 & RGB32 -> RGB24
30
  BGR15 -> BGR16
31
*/
32

    
33
/* 
34
tested special converters
35
 YV12/I420 -> BGR16
36
 YV12 -> YV12
37
 BGR15 -> BGR16
38
 BGR16 -> BGR16
39

40
untested special converters
41
  YV12/I420 -> BGR15/BGR24/BGR32 (it's the yuv2rgb stuff, so it should be ok)
42
  YV12/I420 -> YV12/I420
43
  YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
44
  BGR24 -> BGR32 & RGB24 -> RGB32
45
  BGR32 -> BGR24 & RGB32 -> RGB24
46
  BGR24 -> YV12
47
*/
48

    
49
#include <inttypes.h>
50
#include <string.h>
51
#include <math.h>
52
#include <stdio.h>
53
#include "../config.h"
54
#include "../mangle.h"
55
#include <assert.h>
56
#ifdef HAVE_MALLOC_H
57
#include <malloc.h>
58
#else
59
#include <stdlib.h>
60
#endif
61
#include "swscale.h"
62
#include "../cpudetect.h"
63
#include "../bswap.h"
64
#include "../libvo/img_format.h"
65
#include "rgb2rgb.h"
66
#include "../libvo/fastmemcpy.h"
67
#include "../mp_msg.h"
68

    
69
#define MSG_WARN(args...) mp_msg(MSGT_SWS,MSGL_WARN, ##args )
70
#define MSG_FATAL(args...) mp_msg(MSGT_SWS,MSGL_FATAL, ##args )
71
#define MSG_ERR(args...) mp_msg(MSGT_SWS,MSGL_ERR, ##args )
72
#define MSG_V(args...) mp_msg(MSGT_SWS,MSGL_V, ##args )
73
#define MSG_DBG2(args...) mp_msg(MSGT_SWS,MSGL_DBG2, ##args )
74
#define MSG_INFO(args...) mp_msg(MSGT_SWS,MSGL_INFO, ##args )
75

    
76
#undef MOVNTQ
77
#undef PAVGB
78

    
79
//#undef HAVE_MMX2
80
//#define HAVE_3DNOW
81
//#undef HAVE_MMX
82
//#undef ARCH_X86
83
//#define WORDS_BIGENDIAN
84
#define DITHER1XBPP
85

    
86
#define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit
87

    
88
#define RET 0xC3 //near return opcode for X86
89

    
90
#ifdef MP_DEBUG
91
#define ASSERT(x) assert(x);
92
#else
93
#define ASSERT(x) ;
94
#endif
95

    
96
#ifdef M_PI
97
#define PI M_PI
98
#else
99
#define PI 3.14159265358979323846
100
#endif
101

    
102
//FIXME replace this with something faster
103
#define isPlanarYUV(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_YVU9)
104
#define isYUV(x)       ((x)==IMGFMT_YUY2 || isPlanarYUV(x))
105
#define isHalfChrV(x)  ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
106
#define isHalfChrH(x)  ((x)==IMGFMT_YUY2 || (x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
107
#define isGray(x)      ((x)==IMGFMT_Y800)
108
#define isSupportedIn(x)  ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_YUY2 \
109
                        || (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15\
110
                        || (x)==IMGFMT_RGB32|| (x)==IMGFMT_RGB24\
111
                        || (x)==IMGFMT_Y800)
112
#define isSupportedOut(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 \
113
                        || (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15\
114
                        || (x)==IMGFMT_Y800)
115
#define isRGB(x)       (((x)&IMGFMT_RGB_MASK)==IMGFMT_RGB)
116
#define isBGR(x)       (((x)&IMGFMT_BGR_MASK)==IMGFMT_BGR)
117
#define isPacked(x)    ((x)==IMGFMT_YUY2 || isRGB(x) || isBGR(x))
118

    
119
#define RGB2YUV_SHIFT 16
120
#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
121
#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
122
#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
123
#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
124
#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
125
#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
126
#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
127
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
128
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
129

    
130
extern int verbose; // defined in mplayer.c
131
/*
132
NOTES
133
Special versions: fast Y 1:1 scaling (no interpolation in y direction)
134

135
TODO
136
more intelligent misalignment avoidance for the horizontal scaler
137
write special vertical cubic upscale version
138
Optimize C code (yv12 / minmax)
139
add support for packed pixel yuv input & output
140
add support for Y8 output
141
optimize bgr24 & bgr32
142
add BGR4 output support
143
write special BGR->BGR scaler
144
deglobalize yuv2rgb*.c
145
*/
146

    
147
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
148
#define MIN(a,b) ((a) > (b) ? (b) : (a))
149
#define MAX(a,b) ((a) < (b) ? (b) : (a))
150

    
151
#ifdef ARCH_X86
152
#define CAN_COMPILE_X86_ASM
153
#endif
154

    
155
#ifdef CAN_COMPILE_X86_ASM
156
static uint64_t __attribute__((aligned(8))) yCoeff=    0x2568256825682568LL;
157
static uint64_t __attribute__((aligned(8))) vrCoeff=   0x3343334333433343LL;
158
static uint64_t __attribute__((aligned(8))) ubCoeff=   0x40cf40cf40cf40cfLL;
159
static uint64_t __attribute__((aligned(8))) vgCoeff=   0xE5E2E5E2E5E2E5E2LL;
160
static uint64_t __attribute__((aligned(8))) ugCoeff=   0xF36EF36EF36EF36ELL;
161
static uint64_t __attribute__((aligned(8))) bF8=       0xF8F8F8F8F8F8F8F8LL;
162
static uint64_t __attribute__((aligned(8))) bFC=       0xFCFCFCFCFCFCFCFCLL;
163
static uint64_t __attribute__((aligned(8))) w400=      0x0400040004000400LL;
164
static uint64_t __attribute__((aligned(8))) w80=       0x0080008000800080LL;
165
static uint64_t __attribute__((aligned(8))) w10=       0x0010001000100010LL;
166
static uint64_t __attribute__((aligned(8))) w02=       0x0002000200020002LL;
167
static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
168
static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
169
static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
170
static uint64_t __attribute__((aligned(8))) bm01010101=0x00FF00FF00FF00FFLL;
171

    
172
static volatile uint64_t __attribute__((aligned(8))) b5Dither;
173
static volatile uint64_t __attribute__((aligned(8))) g5Dither;
174
static volatile uint64_t __attribute__((aligned(8))) g6Dither;
175
static volatile uint64_t __attribute__((aligned(8))) r5Dither;
176

    
177
static uint64_t __attribute__((aligned(8))) dither4[2]={
178
        0x0103010301030103LL,
179
        0x0200020002000200LL,};
180

    
181
static uint64_t __attribute__((aligned(8))) dither8[2]={
182
        0x0602060206020602LL,
183
        0x0004000400040004LL,};
184

    
185
static uint64_t __attribute__((aligned(8))) b16Mask=   0x001F001F001F001FLL;
186
static uint64_t __attribute__((aligned(8))) g16Mask=   0x07E007E007E007E0LL;
187
static uint64_t __attribute__((aligned(8))) r16Mask=   0xF800F800F800F800LL;
188
static uint64_t __attribute__((aligned(8))) b15Mask=   0x001F001F001F001FLL;
189
static uint64_t __attribute__((aligned(8))) g15Mask=   0x03E003E003E003E0LL;
190
static uint64_t __attribute__((aligned(8))) r15Mask=   0x7C007C007C007C00LL;
191

    
192
static uint64_t __attribute__((aligned(8))) M24A=   0x00FF0000FF0000FFLL;
193
static uint64_t __attribute__((aligned(8))) M24B=   0xFF0000FF0000FF00LL;
194
static uint64_t __attribute__((aligned(8))) M24C=   0x0000FF0000FF0000LL;
195

    
196
#ifdef FAST_BGR2YV12
197
static const uint64_t bgr2YCoeff  __attribute__((aligned(8))) = 0x000000210041000DULL;
198
static const uint64_t bgr2UCoeff  __attribute__((aligned(8))) = 0x0000FFEEFFDC0038ULL;
199
static const uint64_t bgr2VCoeff  __attribute__((aligned(8))) = 0x00000038FFD2FFF8ULL;
200
#else
201
static const uint64_t bgr2YCoeff  __attribute__((aligned(8))) = 0x000020E540830C8BULL;
202
static const uint64_t bgr2UCoeff  __attribute__((aligned(8))) = 0x0000ED0FDAC23831ULL;
203
static const uint64_t bgr2VCoeff  __attribute__((aligned(8))) = 0x00003831D0E6F6EAULL;
204
#endif
205
static const uint64_t bgr2YOffset __attribute__((aligned(8))) = 0x1010101010101010ULL;
206
static const uint64_t bgr2UVOffset __attribute__((aligned(8)))= 0x8080808080808080ULL;
207
static const uint64_t w1111       __attribute__((aligned(8))) = 0x0001000100010001ULL;
208

    
209
// FIXME remove
210
static uint64_t __attribute__((aligned(8))) asm_yalpha1;
211
static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
212
#endif
213

    
214
// clipping helper table for C implementations:
215
static unsigned char clip_table[768];
216

    
217
static unsigned short clip_table16b[768];
218
static unsigned short clip_table16g[768];
219
static unsigned short clip_table16r[768];
220
static unsigned short clip_table15b[768];
221
static unsigned short clip_table15g[768];
222
static unsigned short clip_table15r[768];
223

    
224
// yuv->rgb conversion tables:
225
static    int yuvtab_2568[256];
226
static    int yuvtab_3343[256];
227
static    int yuvtab_0c92[256];
228
static    int yuvtab_1a1e[256];
229
static    int yuvtab_40cf[256];
230
// Needed for cubic scaler to catch overflows
231
static    int clip_yuvtab_2568[768];
232
static    int clip_yuvtab_3343[768];
233
static    int clip_yuvtab_0c92[768];
234
static    int clip_yuvtab_1a1e[768];
235
static    int clip_yuvtab_40cf[768];
236

    
237
//global sws_flags from the command line
238
int sws_flags=2;
239

    
240
//global srcFilter
241
SwsFilter src_filter= {NULL, NULL, NULL, NULL};
242

    
243
float sws_lum_gblur= 0.0;
244
float sws_chr_gblur= 0.0;
245
int sws_chr_vshift= 0;
246
int sws_chr_hshift= 0;
247
float sws_chr_sharpen= 0.0;
248
float sws_lum_sharpen= 0.0;
249

    
250
/* cpuCaps combined from cpudetect and what's actually compiled in
251
   (if there is no support for something compiled in it won't appear here) */
252
static CpuCaps cpuCaps;
253

    
254
void (*swScale)(SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
255
             int srcSliceH, uint8_t* dst[], int dstStride[])=NULL;
256

    
257
static SwsVector *getConvVec(SwsVector *a, SwsVector *b);
258

    
259
#ifdef CAN_COMPILE_X86_ASM
260
void in_asm_used_var_warning_killer()
261
{
262
 volatile int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
263
 bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+asm_yalpha1+ asm_uvalpha1+
264
 M24A+M24B+M24C+w02 + b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0]+bm01010101;
265
 if(i) i=0;
266
}
267
#endif
268

    
269
static int testFormat[]={
270
IMGFMT_YV12,
271
//IMGFMT_IYUV,
272
IMGFMT_I420,
273
IMGFMT_BGR15,
274
IMGFMT_BGR16,
275
IMGFMT_BGR24,
276
IMGFMT_BGR32,
277
//IMGFMT_Y8,
278
IMGFMT_Y800,
279
//IMGFMT_YUY2,
280
0
281
};
282

    
283
/**
 * Sum of squared differences between two planes of 8-bit samples.
 *
 * @param src1    first plane
 * @param src2    second plane
 * @param stride1 distance in bytes between consecutive rows of src1
 * @param stride2 distance in bytes between consecutive rows of src2
 * @param w       number of samples compared per row
 * @param h       number of rows compared
 * @return the sum over all w*h positions of (src1 - src2)^2; 0 if w or h is <= 0
 */
static uint64_t getSSD(uint8_t *src1, uint8_t *src2, int stride1, int stride2, int w, int h){
        int x,y;
        uint64_t ssd=0;

        for(y=0; y<h; y++){
                // The row offset is computed in a pointer-sized integer so that
                // y*stride cannot overflow int on very large planes.
                const uint8_t *row1= src1 + (intptr_t)y*stride1;
                const uint8_t *row2= src2 + (intptr_t)y*stride2;

                for(x=0; x<w; x++){
                        const int d= row1[x] - row2[x];
                        ssd+= (uint64_t)(d*d); // |d| <= 255, so d*d always fits in int
                }
        }
        return ssd;
}
295

    
296
// test by ref -> src -> dst -> out & compare out against ref
297
// ref & out are YV12
298
static void doTest(uint8_t *ref[3], int refStride[3], int w, int h, int srcFormat, int dstFormat, 
299
                   int srcW, int srcH, int dstW, int dstH, int flags){
300
        uint8_t *src[3];
301
        uint8_t *dst[3];
302
        uint8_t *out[3];
303
        int srcStride[3], dstStride[3];
304
        int i;
305
        uint64_t ssdY, ssdU, ssdV;
306
        SwsContext *srcContext, *dstContext, *outContext;
307
        
308
        for(i=0; i<3; i++){
309
                srcStride[i]= srcW*4;
310
                dstStride[i]= dstW*4;
311
                src[i]= malloc(srcStride[i]*srcH);
312
                dst[i]= malloc(dstStride[i]*dstH);
313
                out[i]= malloc(refStride[i]*h);
314
        }
315

    
316
        srcContext= getSwsContext(w, h, IMGFMT_YV12, srcW, srcH, srcFormat, flags, NULL, NULL);
317
        dstContext= getSwsContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat, flags, NULL, NULL);
318
        outContext= getSwsContext(dstW, dstH, dstFormat, w, h, IMGFMT_YV12, flags, NULL, NULL);
319
        if(srcContext==NULL ||dstContext==NULL ||outContext==NULL){
320
                printf("Failed allocating swsContext\n");
321
                goto end;
322
        }
323
//        printf("test %X %X %X -> %X %X %X\n", (int)ref[0], (int)ref[1], (int)ref[2],
324
//                (int)src[0], (int)src[1], (int)src[2]);
325

    
326
        srcContext->swScale(srcContext, ref, refStride, 0, h   , src, srcStride);
327
        dstContext->swScale(dstContext, src, srcStride, 0, srcH, dst, dstStride);
328
        outContext->swScale(outContext, dst, dstStride, 0, dstH, out, refStride);
329
             
330
        ssdY= getSSD(ref[0], out[0], refStride[0], refStride[0], w, h);
331
        ssdU= getSSD(ref[1], out[1], refStride[1], refStride[1], (w+1)>>1, (h+1)>>1);
332
        ssdV= getSSD(ref[2], out[2], refStride[2], refStride[2], (w+1)>>1, (h+1)>>1);
333
        
334
        if(isGray(srcFormat) || isGray(dstFormat)) ssdU=ssdV=0; //FIXME check that output is really gray
335
        
336
        ssdY/= w*h;
337
        ssdU/= w*h/4;
338
        ssdV/= w*h/4;
339
        
340
        if(ssdY>10 || ssdU>10 || ssdV>10){
341
                printf(" %s %dx%d -> %s %4dx%4d flags=%2d SSD=%5lld,%5lld,%5lld\n", 
342
                        vo_format_name(srcFormat), srcW, srcH, 
343
                        vo_format_name(dstFormat), dstW, dstH,
344
                        flags,
345
                        ssdY, ssdU, ssdV);
346
        }
347

    
348
        end:
349
        
350
        freeSwsContext(srcContext);
351
        freeSwsContext(dstContext);
352
        freeSwsContext(outContext);
353

    
354
        for(i=0; i<3; i++){
355
                free(src[i]);
356
                free(dst[i]);
357
                free(out[i]);
358
        }
359
}
360

    
361
static void selfTest(uint8_t *src[3], int stride[3], int w, int h){
362
        int srcFormat, dstFormat, srcFormatIndex, dstFormatIndex;
363
        int srcW, srcH, dstW, dstH;
364
        int flags;
365

    
366
        for(srcFormatIndex=0; ;srcFormatIndex++){
367
                srcFormat= testFormat[srcFormatIndex];
368
                if(!srcFormat) break;
369
                for(dstFormatIndex=0; ;dstFormatIndex++){
370
                        dstFormat= testFormat[dstFormatIndex];
371
                        if(!dstFormat) break;
372
                        if(!isSupportedOut(dstFormat)) continue;
373

    
374
                        srcW= w+w/3;
375
                        srcH= h+h/3;
376
                        for(dstW=w; dstW<w*2; dstW+= dstW/3){
377
                                for(dstH=h; dstH<h*2; dstH+= dstH/3){
378
                                        for(flags=1; flags<33; flags*=2)
379
                                                doTest(src, stride, w, h, srcFormat, dstFormat,
380
                                                        srcW, srcH, dstW, dstH, flags);
381
                                }
382
                        }
383
                }
384
        }
385
}
386

    
387
/*
 * Plain C vertical scaler producing planar YUV output.
 *
 * Every output sample is the sum of filterSize input lines weighted by the
 * matching filter coefficient, shifted right by 19 and clamped to 0..255.
 *
 * lumFilter / lumSrc / lumFilterSize : luma coefficients and the input lines they apply to
 * chrFilter / chrSrc / chrFilterSize : same for chroma; in each chroma line the
 *                                      V samples are read 2048 entries after the U samples
 * dest         : luma output, dstW samples
 * uDest, vDest : chroma outputs, dstW>>1 samples each; chroma is skipped when uDest is NULL
 */
static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
{
        const int chrW= dstW>>1;
        int x, tap;

        //FIXME Optimize (written quickly, not optimized)
        for(x=0; x<dstW; x++)
        {
                int acc=0;

                for(tap=0; tap<lumFilterSize; tap++)
                        acc += lumSrc[tap][x] * lumFilter[tap];

                acc >>= 19;
                dest[x]= MIN(MAX(acc, 0), 255);
        }

        if(uDest == NULL)
                return;

        for(x=0; x<chrW; x++)
        {
                int accU=0;
                int accV=0;

                for(tap=0; tap<chrFilterSize; tap++)
                {
                        const int16_t *line= chrSrc[tap];
                        const int coeff= chrFilter[tap];

                        accU += line[x] * coeff;
                        accV += line[x + 2048] * coeff;
                }

                accU >>= 19;
                accV >>= 19;
                uDest[x]= MIN(MAX(accU, 0), 255);
                vDest[x]= MIN(MAX(accV, 0), 255);
        }
}
419

    
420
static inline void yuv2rgbXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
421
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
422
                                    uint8_t *dest, int dstW, int dstFormat)
423
{
424
        if(dstFormat==IMGFMT_BGR32)
425
        {
426
                int i;
427
#ifdef WORDS_BIGENDIAN
428
        dest++;
429
#endif
430
                for(i=0; i<(dstW>>1); i++){
431
                        int j;
432
                        int Y1=0;
433
                        int Y2=0;
434
                        int U=0;
435
                        int V=0;
436
                        int Cb, Cr, Cg;
437
                        for(j=0; j<lumFilterSize; j++)
438
                        {
439
                                Y1 += lumSrc[j][2*i] * lumFilter[j];
440
                                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
441
                        }
442
                        for(j=0; j<chrFilterSize; j++)
443
                        {
444
                                U += chrSrc[j][i] * chrFilter[j];
445
                                V += chrSrc[j][i+2048] * chrFilter[j];
446
                        }
447
                        Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
448
                        Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
449
                        U >>= 19;
450
                        V >>= 19;
451

    
452
                        Cb= clip_yuvtab_40cf[U+ 256];
453
                        Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
454
                        Cr= clip_yuvtab_3343[V+ 256];
455

    
456
                        dest[8*i+0]=clip_table[((Y1 + Cb) >>13)];
457
                        dest[8*i+1]=clip_table[((Y1 + Cg) >>13)];
458
                        dest[8*i+2]=clip_table[((Y1 + Cr) >>13)];
459

    
460
                        dest[8*i+4]=clip_table[((Y2 + Cb) >>13)];
461
                        dest[8*i+5]=clip_table[((Y2 + Cg) >>13)];
462
                        dest[8*i+6]=clip_table[((Y2 + Cr) >>13)];
463
                }
464
        }
465
        else if(dstFormat==IMGFMT_BGR24)
466
        {
467
                int i;
468
                for(i=0; i<(dstW>>1); i++){
469
                        int j;
470
                        int Y1=0;
471
                        int Y2=0;
472
                        int U=0;
473
                        int V=0;
474
                        int Cb, Cr, Cg;
475
                        for(j=0; j<lumFilterSize; j++)
476
                        {
477
                                Y1 += lumSrc[j][2*i] * lumFilter[j];
478
                                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
479
                        }
480
                        for(j=0; j<chrFilterSize; j++)
481
                        {
482
                                U += chrSrc[j][i] * chrFilter[j];
483
                                V += chrSrc[j][i+2048] * chrFilter[j];
484
                        }
485
                        Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
486
                        Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
487
                        U >>= 19;
488
                        V >>= 19;
489

    
490
                        Cb= clip_yuvtab_40cf[U+ 256];
491
                        Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
492
                        Cr= clip_yuvtab_3343[V+ 256];
493

    
494
                        dest[0]=clip_table[((Y1 + Cb) >>13)];
495
                        dest[1]=clip_table[((Y1 + Cg) >>13)];
496
                        dest[2]=clip_table[((Y1 + Cr) >>13)];
497

    
498
                        dest[3]=clip_table[((Y2 + Cb) >>13)];
499
                        dest[4]=clip_table[((Y2 + Cg) >>13)];
500
                        dest[5]=clip_table[((Y2 + Cr) >>13)];
501
                        dest+=6;
502
                }
503
        }
504
        else if(dstFormat==IMGFMT_BGR16)
505
        {
506
                int i;
507
#ifdef DITHER1XBPP
508
                static int ditherb1=1<<14;
509
                static int ditherg1=1<<13;
510
                static int ditherr1=2<<14;
511
                static int ditherb2=3<<14;
512
                static int ditherg2=3<<13;
513
                static int ditherr2=0<<14;
514

    
515
                ditherb1 ^= (1^2)<<14;
516
                ditherg1 ^= (1^2)<<13;
517
                ditherr1 ^= (1^2)<<14;
518
                ditherb2 ^= (3^0)<<14;
519
                ditherg2 ^= (3^0)<<13;
520
                ditherr2 ^= (3^0)<<14;
521
#else
522
                const int ditherb1=0;
523
                const int ditherg1=0;
524
                const int ditherr1=0;
525
                const int ditherb2=0;
526
                const int ditherg2=0;
527
                const int ditherr2=0;
528
#endif
529
                for(i=0; i<(dstW>>1); i++){
530
                        int j;
531
                        int Y1=0;
532
                        int Y2=0;
533
                        int U=0;
534
                        int V=0;
535
                        int Cb, Cr, Cg;
536
                        for(j=0; j<lumFilterSize; j++)
537
                        {
538
                                Y1 += lumSrc[j][2*i] * lumFilter[j];
539
                                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
540
                        }
541
                        for(j=0; j<chrFilterSize; j++)
542
                        {
543
                                U += chrSrc[j][i] * chrFilter[j];
544
                                V += chrSrc[j][i+2048] * chrFilter[j];
545
                        }
546
                        Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
547
                        Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
548
                        U >>= 19;
549
                        V >>= 19;
550

    
551
                        Cb= clip_yuvtab_40cf[U+ 256];
552
                        Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
553
                        Cr= clip_yuvtab_3343[V+ 256];
554

    
555
                        ((uint16_t*)dest)[2*i] =
556
                                clip_table16b[(Y1 + Cb + ditherb1) >>13] |
557
                                clip_table16g[(Y1 + Cg + ditherg1) >>13] |
558
                                clip_table16r[(Y1 + Cr + ditherr1) >>13];
559

    
560
                        ((uint16_t*)dest)[2*i+1] =
561
                                clip_table16b[(Y2 + Cb + ditherb2) >>13] |
562
                                clip_table16g[(Y2 + Cg + ditherg2) >>13] |
563
                                clip_table16r[(Y2 + Cr + ditherr2) >>13];
564
                }
565
        }
566
        else if(dstFormat==IMGFMT_BGR15)
567
        {
568
                int i;
569
#ifdef DITHER1XBPP
570
                static int ditherb1=1<<14;
571
                static int ditherg1=1<<14;
572
                static int ditherr1=2<<14;
573
                static int ditherb2=3<<14;
574
                static int ditherg2=3<<14;
575
                static int ditherr2=0<<14;
576

    
577
                ditherb1 ^= (1^2)<<14;
578
                ditherg1 ^= (1^2)<<14;
579
                ditherr1 ^= (1^2)<<14;
580
                ditherb2 ^= (3^0)<<14;
581
                ditherg2 ^= (3^0)<<14;
582
                ditherr2 ^= (3^0)<<14;
583
#else
584
                const int ditherb1=0;
585
                const int ditherg1=0;
586
                const int ditherr1=0;
587
                const int ditherb2=0;
588
                const int ditherg2=0;
589
                const int ditherr2=0;
590
#endif
591
                for(i=0; i<(dstW>>1); i++){
592
                        int j;
593
                        int Y1=0;
594
                        int Y2=0;
595
                        int U=0;
596
                        int V=0;
597
                        int Cb, Cr, Cg;
598
                        for(j=0; j<lumFilterSize; j++)
599
                        {
600
                                Y1 += lumSrc[j][2*i] * lumFilter[j];
601
                                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
602
                        }
603
                        for(j=0; j<chrFilterSize; j++)
604
                        {
605
                                U += chrSrc[j][i] * chrFilter[j];
606
                                V += chrSrc[j][i+2048] * chrFilter[j];
607
                        }
608
                        Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
609
                        Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
610
                        U >>= 19;
611
                        V >>= 19;
612

    
613
                        Cb= clip_yuvtab_40cf[U+ 256];
614
                        Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
615
                        Cr= clip_yuvtab_3343[V+ 256];
616

    
617
                        ((uint16_t*)dest)[2*i] =
618
                                clip_table15b[(Y1 + Cb + ditherb1) >>13] |
619
                                clip_table15g[(Y1 + Cg + ditherg1) >>13] |
620
                                clip_table15r[(Y1 + Cr + ditherr1) >>13];
621

    
622
                        ((uint16_t*)dest)[2*i+1] =
623
                                clip_table15b[(Y2 + Cb + ditherb2) >>13] |
624
                                clip_table15g[(Y2 + Cg + ditherg2) >>13] |
625
                                clip_table15r[(Y2 + Cr + ditherr2) >>13];
626
                }
627
        }
628
}
629

    
630

    
631
//Note: we have C, X86, MMX, MMX2 and 3DNOW versions; there is no 3DNOW+MMX2 one
632
//Plain C versions
633
#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
634
#define COMPILE_C
635
#endif
636

    
637
#ifdef CAN_COMPILE_X86_ASM
638

    
639
#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
640
#define COMPILE_MMX
641
#endif
642

    
643
#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
644
#define COMPILE_MMX2
645
#endif
646

    
647
#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
648
#define COMPILE_3DNOW
649
#endif
650
#endif //CAN_COMPILE_X86_ASM
651

    
652
#undef HAVE_MMX
653
#undef HAVE_MMX2
654
#undef HAVE_3DNOW
655

    
656
#ifdef COMPILE_C
657
#undef HAVE_MMX
658
#undef HAVE_MMX2
659
#undef HAVE_3DNOW
660
#define RENAME(a) a ## _C
661
#include "swscale_template.c"
662
#endif
663

    
664
#ifdef CAN_COMPILE_X86_ASM
665

    
666
//X86 versions
667
/*
668
#undef RENAME
669
#undef HAVE_MMX
670
#undef HAVE_MMX2
671
#undef HAVE_3DNOW
672
#define ARCH_X86
673
#define RENAME(a) a ## _X86
674
#include "swscale_template.c"
675
*/
676
//MMX versions
677
#ifdef COMPILE_MMX
678
#undef RENAME
679
#define HAVE_MMX
680
#undef HAVE_MMX2
681
#undef HAVE_3DNOW
682
#define RENAME(a) a ## _MMX
683
#include "swscale_template.c"
684
#endif
685

    
686
//MMX2 versions
687
#ifdef COMPILE_MMX2
688
#undef RENAME
689
#define HAVE_MMX
690
#define HAVE_MMX2
691
#undef HAVE_3DNOW
692
#define RENAME(a) a ## _MMX2
693
#include "swscale_template.c"
694
#endif
695

    
696
//3DNOW versions
697
#ifdef COMPILE_3DNOW
698
#undef RENAME
699
#define HAVE_MMX
700
#undef HAVE_MMX2
701
#define HAVE_3DNOW
702
#define RENAME(a) a ## _3DNow
703
#include "swscale_template.c"
704
#endif
705

    
706
#endif //CAN_COMPILE_X86_ASM
707

    
708
// minor note: the HAVE_xyz macros are messed up after this point, so don't use them
709

    
710

    
711
// old global scaler, dont use for new code
712
// will use sws_flags from the command line
713
void SwScale_YV12slice(unsigned char* src[], int srcStride[], int srcSliceY ,
714
                             int srcSliceH, uint8_t* dst[], int dstStride, int dstbpp,
715
                             int srcW, int srcH, int dstW, int dstH){
716

    
717
        static SwsContext *context=NULL;
718
        int dstFormat;
719
        int dstStride3[3]= {dstStride, dstStride>>1, dstStride>>1};
720

    
721
        switch(dstbpp)
722
        {
723
                case 8 : dstFormat= IMGFMT_Y8;                break;
724
                case 12: dstFormat= IMGFMT_YV12;        break;
725
                case 15: dstFormat= IMGFMT_BGR15;        break;
726
                case 16: dstFormat= IMGFMT_BGR16;        break;
727
                case 24: dstFormat= IMGFMT_BGR24;        break;
728
                case 32: dstFormat= IMGFMT_BGR32;        break;
729
                default: return;
730
        }
731

    
732
        if(!context) context=getSwsContextFromCmdLine(srcW, srcH, IMGFMT_YV12, dstW, dstH, dstFormat);
733

    
734
        context->swScale(context, src, srcStride, srcSliceY, srcSliceH, dst, dstStride3);
735
}
736

    
737
// will use sws_flags & src_filter (from cmd line)
738
SwsContext *getSwsContextFromCmdLine(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat)
739
{
740
        int flags=0;
741
        static int firstTime=1;
742

    
743
#ifdef ARCH_X86
744
        if(gCpuCaps.hasMMX)
745
                asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
746
#endif
747
        if(firstTime)
748
        {
749
                firstTime=0;
750
                flags= SWS_PRINT_INFO;
751
        }
752
        else if(verbose>1) flags= SWS_PRINT_INFO;
753

    
754
        if(src_filter.lumH) freeVec(src_filter.lumH);
755
        if(src_filter.lumV) freeVec(src_filter.lumV);
756
        if(src_filter.chrH) freeVec(src_filter.chrH);
757
        if(src_filter.chrV) freeVec(src_filter.chrV);
758

    
759
        if(sws_lum_gblur!=0.0){
760
                src_filter.lumH= getGaussianVec(sws_lum_gblur, 3.0);
761
                src_filter.lumV= getGaussianVec(sws_lum_gblur, 3.0);
762
        }else{
763
                src_filter.lumH= getIdentityVec();
764
                src_filter.lumV= getIdentityVec();
765
        }
766

    
767
        if(sws_chr_gblur!=0.0){
768
                src_filter.chrH= getGaussianVec(sws_chr_gblur, 3.0);
769
                src_filter.chrV= getGaussianVec(sws_chr_gblur, 3.0);
770
        }else{
771
                src_filter.chrH= getIdentityVec();
772
                src_filter.chrV= getIdentityVec();
773
        }
774

    
775
        if(sws_chr_sharpen!=0.0){
776
                SwsVector *g= getConstVec(-1.0, 3);
777
                SwsVector *id= getConstVec(10.0/sws_chr_sharpen, 1);
778
                g->coeff[1]=2.0;
779
                addVec(id, g);
780
                convVec(src_filter.chrH, id);
781
                convVec(src_filter.chrV, id);
782
                freeVec(g);
783
                freeVec(id);
784
        }
785

    
786
        if(sws_lum_sharpen!=0.0){
787
                SwsVector *g= getConstVec(-1.0, 3);
788
                SwsVector *id= getConstVec(10.0/sws_lum_sharpen, 1);
789
                g->coeff[1]=2.0;
790
                addVec(id, g);
791
                convVec(src_filter.lumH, id);
792
                convVec(src_filter.lumV, id);
793
                freeVec(g);
794
                freeVec(id);
795
        }
796

    
797
        if(sws_chr_hshift)
798
                shiftVec(src_filter.chrH, sws_chr_hshift);
799

    
800
        if(sws_chr_vshift)
801
                shiftVec(src_filter.chrV, sws_chr_vshift);
802

    
803
        normalizeVec(src_filter.chrH, 1.0);
804
        normalizeVec(src_filter.chrV, 1.0);
805
        normalizeVec(src_filter.lumH, 1.0);
806
        normalizeVec(src_filter.lumV, 1.0);
807

    
808
        if(verbose > 1) printVec(src_filter.chrH);
809
        if(verbose > 1) printVec(src_filter.lumH);
810

    
811
        switch(sws_flags)
812
        {
813
                case 0: flags|= SWS_FAST_BILINEAR; break;
814
                case 1: flags|= SWS_BILINEAR; break;
815
                case 2: flags|= SWS_BICUBIC; break;
816
                case 3: flags|= SWS_X; break;
817
                case 4: flags|= SWS_POINT; break;
818
                case 5: flags|= SWS_AREA; break;
819
                default:flags|= SWS_BILINEAR; break;
820
        }
821

    
822
        return getSwsContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat, flags, &src_filter, NULL);
823
}
824

    
825

    
826
/**
 * Build the per-output-sample filter table for one scaling direction.
 *
 * Steps:
 *  1. generate a floating point scaling filter (unscaled / point / upscale /
 *     downscale variants, chosen from xInc and flags),
 *  2. convolve every row with srcFilter (dstFilter is not applied yet, see
 *     the FIXME below),
 *  3. shift near-zero leading coefficients out and shrink the filter to the
 *     smallest size that fits all rows, rounded up to filterAlign,
 *  4. fold coefficients that would address samples outside 0..srcW-1 back
 *     onto the border,
 *  5. normalise every row so that it sums to (about) `one` and store it as
 *     int16_t.
 *
 * @param outFilter     receives a newly allocated table of
 *                      (dstW+1) * (*outFilterSize) coefficients
 * @param filterPos     receives a newly allocated array of dstW+1 source
 *                      start positions, one per output sample
 * @param outFilterSize receives the number of coefficients per output sample
 * @param xInc          source step per output sample in 16.16 fixed point
 * @param filterAlign   alignment of the filter size; used as a bit mask, so
 *                      it is presumably expected to be a power of two
 * @param one           integer value representing 1.0 in the output table
 * @param flags         SWS_* flags selecting the scaler and info printing
 * @param srcFilter     optional vector convolved into every row (may be NULL)
 * @param dstFilter     optional; currently only accounted for in the size
 */
static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
			      int srcW, int dstW, int filterAlign, int one, int flags,
			      SwsVector *srcFilter, SwsVector *dstFilter)
{
	int i;
	int filterSize;
	int filter2Size;
	int minFilterSize;
	double *filter=NULL;
	double *filter2=NULL;
#ifdef ARCH_X86
	if(gCpuCaps.hasMMX)
		asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
#endif

	// Note the +1 is for the MMXscaler which reads over the end
	*filterPos = (int16_t*)memalign(8, (dstW+1)*sizeof(int16_t));

	if(ABS(xInc - 0x10000) <10) // unscaled
	{
		filterSize= 1;
		filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
		for(i=0; i<dstW*filterSize; i++) filter[i]=0;

		for(i=0; i<dstW; i++)
		{
			filter[i*filterSize]=1;
			(*filterPos)[i]=i;
		}

	}
	else if(flags&SWS_POINT) // lame looking point sampling mode
	{
		int xDstInSrc;
		filterSize= 1;
		filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);

		xDstInSrc= xInc/2 - 0x8000;
		for(i=0; i<dstW; i++)
		{
			int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;

			(*filterPos)[i]= xx;
			filter[i]= 1.0;
			xDstInSrc+= xInc;
		}
	}
	else if(xInc <= (1<<16) || (flags&SWS_FAST_BILINEAR)) // upscale
	{
		int xDstInSrc;
		if     (flags&SWS_BICUBIC) filterSize= 4;
		else if(flags&SWS_X      ) filterSize= 4;
		else			   filterSize= 2; // SWS_BILINEAR / SWS_AREA
		filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);

		xDstInSrc= xInc/2 - 0x8000;
		for(i=0; i<dstW; i++)
		{
			int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
			int j;

			(*filterPos)[i]= xx;
			if((flags & SWS_BICUBIC) || (flags & SWS_X))
			{
				double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
				double y1,y2,y3,y4;
				double A= -0.6;
				if(flags & SWS_BICUBIC){
						// Equation is from VirtualDub
					y1 = (        +     A*d -       2.0*A*d*d +       A*d*d*d);
					y2 = (+ 1.0             -     (A+3.0)*d*d + (A+2.0)*d*d*d);
					y3 = (        -     A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d);
					y4 = (                  +           A*d*d -       A*d*d*d);
				}else{
						// cubic interpolation (derived it myself)
					y1 = (    -2.0*d + 3.0*d*d - 1.0*d*d*d)/6.0;
					y2 = (6.0 -3.0*d - 6.0*d*d + 3.0*d*d*d)/6.0;
					y3 = (    +6.0*d + 3.0*d*d - 3.0*d*d*d)/6.0;
					y4 = (    -1.0*d           + 1.0*d*d*d)/6.0;
				}

				filter[i*filterSize + 0]= y1;
				filter[i*filterSize + 1]= y2;
				filter[i*filterSize + 2]= y3;
				filter[i*filterSize + 3]= y4;
			}
			else
			{
				//Bilinear upscale / linear interpolate / Area averaging
				for(j=0; j<filterSize; j++)
				{
					double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
					double coeff= 1.0 - d;
					if(coeff<0) coeff=0;
					filter[i*filterSize + j]= coeff;
					xx++;
				}
			}
			xDstInSrc+= xInc;
		}
	}
	else // downscale
	{
		int xDstInSrc;
		ASSERT(dstW <= srcW)

		if(flags&SWS_BICUBIC)	filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
		else if(flags&SWS_X)	filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
		else if(flags&SWS_AREA)	filterSize= (int)ceil(1 + 1.0*srcW / (double)dstW);
		else /* BILINEAR */	filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW);
		filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);

		xDstInSrc= xInc/2 - 0x8000;
		for(i=0; i<dstW; i++)
		{
			int xx= (int)((double)xDstInSrc/(double)(1<<16) - (filterSize-1)*0.5 + 0.5);
			int j;
			(*filterPos)[i]= xx;
			for(j=0; j<filterSize; j++)
			{
				double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
				double coeff;
				if((flags & SWS_BICUBIC) || (flags & SWS_X))
				{
					double A= -0.75;
//					d*=2;
					// Equation is from VirtualDub
					if(d<1.0)
						coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
					else if(d<2.0)
						coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
					else
						coeff=0.0;
				}
				else if(flags & SWS_AREA)
				{
					double srcPixelSize= (1<<16)/(double)xInc;
					if(d + srcPixelSize/2 < 0.5) coeff= 1.0;
					else if(d - srcPixelSize/2 < 0.5) coeff= (0.5-d)/srcPixelSize + 0.5;
					else coeff=0.0;
				}
				else
				{
					coeff= 1.0 - d;
					if(coeff<0) coeff=0;
				}
				filter[i*filterSize + j]= coeff;
				xx++;
			}
			xDstInSrc+= xInc;
		}
	}

	/* apply src & dst Filter to filter -> filter2
	   free(filter);
	*/
	ASSERT(filterSize>0)
	filter2Size= filterSize;
	if(srcFilter) filter2Size+= srcFilter->length - 1;
	if(dstFilter) filter2Size+= dstFilter->length - 1;
	ASSERT(filter2Size>0)
	filter2= (double*)memalign(8, filter2Size*dstW*sizeof(double));

	for(i=0; i<dstW; i++)
	{
		int j;
		SwsVector scaleFilter;
		SwsVector *outVec;

		// view of row i of the scaling filter, not a copy
		scaleFilter.coeff= filter + i*filterSize;
		scaleFilter.length= filterSize;

		if(srcFilter) outVec= getConvVec(srcFilter, &scaleFilter);
		else	      outVec= &scaleFilter;

		ASSERT(outVec->length == filter2Size)
		//FIXME dstFilter

		for(j=0; j<outVec->length; j++)
		{
			filter2[i*filter2Size + j]= outVec->coeff[j];
		}

		// the convolution widened the row; keep it centred on the same sample
		(*filterPos)[i]+= (filterSize-1)/2 - (filter2Size-1)/2;

		if(outVec != &scaleFilter) freeVec(outVec);
	}
	free(filter); filter=NULL;

	/* try to reduce the filter-size (step1 find size and shift left) */
	// Assume its near normalized (*0.5 or *2.0 is ok but * 0.001 is not)
	minFilterSize= 0;
	for(i=dstW-1; i>=0; i--)
	{
		int min= filter2Size;
		int j;
		double cutOff=0.0;

		/* get rid off near zero elements on the left by shifting left */
		for(j=0; j<filter2Size; j++)
		{
			int k;
			// element 0 is always the current leftmost one, because the
			// row is shifted left at the end of every iteration
			cutOff += ABS(filter2[i*filter2Size]);

			if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;

			/* preserve Monotonicity because the core cant handle the filter otherwise */
			if(i<dstW-1 && (*filterPos)[i] >= (*filterPos)[i+1]) break;

			// Move filter coeffs left
			for(k=1; k<filter2Size; k++)
				filter2[i*filter2Size + k - 1]= filter2[i*filter2Size + k];
			filter2[i*filter2Size + k - 1]= 0.0;
			(*filterPos)[i]++;
		}

		cutOff=0.0;
		/* count near zeros on the right */
		for(j=filter2Size-1; j>0; j--)
		{
			cutOff += ABS(filter2[i*filter2Size + j]);

			if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
			min--;
		}

		if(min>minFilterSize) minFilterSize= min;
	}

	ASSERT(minFilterSize > 0)
	filterSize= (minFilterSize +(filterAlign-1)) & (~(filterAlign-1));
	ASSERT(filterSize > 0)
	filter= (double*)memalign(8, filterSize*dstW*sizeof(double));
	*outFilterSize= filterSize;

	if(flags&SWS_PRINT_INFO)
		MSG_INFO("SwScaler: reducing / aligning filtersize %d -> %d\n", filter2Size, filterSize);
	/* try to reduce the filter-size (step2 reduce it) */
	for(i=0; i<dstW; i++)
	{
		int j;

		for(j=0; j<filterSize; j++)
		{
			if(j>=filter2Size) filter[i*filterSize + j]= 0.0;
			else		   filter[i*filterSize + j]= filter2[i*filter2Size + j];
		}
	}
	free(filter2); filter2=NULL;


	//FIXME try to align filterpos if possible

	//fix borders
	for(i=0; i<dstW; i++)
	{
		int j;
		if((*filterPos)[i] < 0)
		{
			// Move filter coeffs left to compensate for filterPos
			for(j=1; j<filterSize; j++)
			{
				int left= MAX(j + (*filterPos)[i], 0);
				filter[i*filterSize + left] += filter[i*filterSize + j];
				filter[i*filterSize + j]=0;
			}
			(*filterPos)[i]= 0;
		}

		if((*filterPos)[i] + filterSize > srcW)
		{
			int shift= (*filterPos)[i] + filterSize - srcW;
			// Move filter coeffs right to compensate for filterPos
			for(j=filterSize-2; j>=0; j--)
			{
				int right= MIN(j + shift, filterSize-1);
				filter[i*filterSize +right] += filter[i*filterSize +j];
				filter[i*filterSize +j]=0;
			}
			(*filterPos)[i]= srcW - filterSize;
		}
	}

	// Note the +1 is for the MMXscaler which reads over the end
	*outFilter= (int16_t*)memalign(8, *outFilterSize*(dstW+1)*sizeof(int16_t));
	memset(*outFilter, 0, *outFilterSize*(dstW+1)*sizeof(int16_t));

	/* Normalize & Store in outFilter */
	for(i=0; i<dstW; i++)
	{
		int j;
		double sum=0;
		double scale= one;
		for(j=0; j<filterSize; j++)
		{
			sum+= filter[i*filterSize + j];
		}
		if(sum == 0.0)
		{
			// The coefficients cancelled out (possible with negative taps).
			// Dividing by zero would give inf/NaN and make the conversion
			// to int below undefined, so store the row unnormalised.
			MSG_INFO("SwScaler: zero vector in scaling\n");
			sum= 1.0;
		}
		scale/= sum;
		for(j=0; j<filterSize; j++)
		{
			(*outFilter)[i*(*outFilterSize) + j]= (int)(filter[i*filterSize + j]*scale);
		}
	}

	(*filterPos)[dstW]= (*filterPos)[dstW-1]; // the MMX scaler will read over the end
	for(i=0; i<*outFilterSize; i++)
	{
		// duplicate the last row into the extra (dstW-th) row
		int j= dstW*(*outFilterSize);
		(*outFilter)[j + i]= (*outFilter)[j + i - (*outFilterSize)];
	}

	free(filter);
}
1142

    
1143
#ifdef ARCH_X86
1144
/*
 * Generate the run-time specialised MMX2 horizontal scaler into funnyCode.
 *
 * Two template instruction sequences ("fragments" A and B) are embedded in
 * the inline asm statements below. Each asm statement jumps over its fragment
 * (jmp 9f) and only records:
 *   - the address of the fragment (label 0),
 *   - the offsets, relative to label 0, of the byte just before labels 1 and
 *     2 (presumably the imm8 operands of the pshufw instructions that precede
 *     those labels, as the variable names say),
 *   - the fragment length (label 9 - label 0).
 *
 * For every group of four output pixels one fragment is copied to funnyCode
 * and its two recorded bytes are patched; filter[] receives four 7-bit weights
 * taken from the fractional part of the 16.16 fixed-point position and
 * filterPos[i/2] receives the integer source position of the group.
 *
 * dstW/numSplits output pixels are processed; xInc is the source step per
 * output pixel in 16.16 fixed point.
 * NOTE(review): funnyCode, filter and filterPos must be large enough for the
 * generated data; the required sizes are not visible here.
 */
static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *filter, int32_t *filterPos, int numSplits)
1145
{
1146
        uint8_t *fragmentA;
1147
        int imm8OfPShufW1A;
1148
        int imm8OfPShufW2A;
1149
        int fragmentLengthA;
1150
        uint8_t *fragmentB;
1151
        int imm8OfPShufW1B;
1152
        int imm8OfPShufW2B;
1153
        int fragmentLengthB;
1154
        int fragmentPos;
1155

    
1156
        int xpos, i;
1157

    
1158
        // create an optimized horizontal scaling routine
1159

    
1160
        //code fragment
1161

    
1162
        // Fragment A: loads source bytes from offset 0 and from offset 1
        asm volatile(
1163
                "jmp 9f                                \n\t"
1164
        // Begin
1165
                "0:                                \n\t"
1166
                "movq (%%edx, %%eax), %%mm3        \n\t" 
1167
                "movd (%%ecx, %%esi), %%mm0        \n\t" 
1168
                "movd 1(%%ecx, %%esi), %%mm1        \n\t"
1169
                "punpcklbw %%mm7, %%mm1                \n\t"
1170
                "punpcklbw %%mm7, %%mm0                \n\t"
1171
                "pshufw $0xFF, %%mm1, %%mm1        \n\t"
1172
                "1:                                \n\t"
1173
                "pshufw $0xFF, %%mm0, %%mm0        \n\t"
1174
                "2:                                \n\t"
1175
                "psubw %%mm1, %%mm0                \n\t"
1176
                "movl 8(%%ebx, %%eax), %%esi        \n\t"
1177
                "pmullw %%mm3, %%mm0                \n\t"
1178
                "psllw $7, %%mm1                \n\t"
1179
                "paddw %%mm1, %%mm0                \n\t"
1180

    
1181
                "movq %%mm0, (%%edi, %%eax)        \n\t"
1182

    
1183
                "addl $8, %%eax                        \n\t"
1184
        // End
1185
                "9:                                \n\t"
1186
//                "int $3\n\t"
        // %0 = fragment start, %1/%2 = offset of the byte before label 1/2,
        // %3 = fragment length
                "leal 0b, %0                        \n\t"
1187
                "leal 1b, %1                        \n\t"
1188
                "leal 2b, %2                        \n\t"
1189
                "decl %1                        \n\t"
1190
                "decl %2                        \n\t"
1191
                "subl %0, %1                        \n\t"
1192
                "subl %0, %2                        \n\t"
1193
                "leal 9b, %3                        \n\t"
1194
                "subl %0, %3                        \n\t"
1195

    
1196

    
1197
                :"=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
1198
                "=r" (fragmentLengthA)
1199
        );
1200

    
1201
        // Fragment B: single load from offset 0, both shuffles read mm0
        asm volatile(
1202
                "jmp 9f                                \n\t"
1203
        // Begin
1204
                "0:                                \n\t"
1205
                "movq (%%edx, %%eax), %%mm3        \n\t" 
1206
                "movd (%%ecx, %%esi), %%mm0        \n\t" 
1207
                "punpcklbw %%mm7, %%mm0                \n\t"
1208
                "pshufw $0xFF, %%mm0, %%mm1        \n\t"
1209
                "1:                                \n\t"
1210
                "pshufw $0xFF, %%mm0, %%mm0        \n\t"
1211
                "2:                                \n\t"
1212
                "psubw %%mm1, %%mm0                \n\t"
1213
                "movl 8(%%ebx, %%eax), %%esi        \n\t"
1214
                "pmullw %%mm3, %%mm0                \n\t"
1215
                "psllw $7, %%mm1                \n\t"
1216
                "paddw %%mm1, %%mm0                \n\t"
1217

    
1218
                "movq %%mm0, (%%edi, %%eax)        \n\t"
1219

    
1220
                "addl $8, %%eax                        \n\t"
1221
        // End
1222
                "9:                                \n\t"
1223
//                "int $3\n\t"
1224
                "leal 0b, %0                        \n\t"
1225
                "leal 1b, %1                        \n\t"
1226
                "leal 2b, %2                        \n\t"
1227
                "decl %1                        \n\t"
1228
                "decl %2                        \n\t"
1229
                "subl %0, %1                        \n\t"
1230
                "subl %0, %2                        \n\t"
1231
                "leal 9b, %3                        \n\t"
1232
                "subl %0, %3                        \n\t"
1233

    
1234

    
1235
                :"=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
1236
                "=r" (fragmentLengthB)
1237
        );
1238

    
1239
        xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
1240
        fragmentPos=0;
1241
        
1242
        for(i=0; i<dstW/numSplits; i++)
1243
        {
1244
                int xx=xpos>>16;
1245

    
1246
                // one fragment is emitted per group of four output pixels
                if((i&3) == 0)
1247
                {
1248
                        // a..d: source offsets of the four pixels relative to xx
                        int a=0;
1249
                        int b=((xpos+xInc)>>16) - xx;
1250
                        int c=((xpos+xInc*2)>>16) - xx;
1251
                        int d=((xpos+xInc*3)>>16) - xx;
1252

    
1253
                        // inverted fractional part of the position, reduced to 7 bits
                        filter[i  ] = (( xpos         & 0xFFFF) ^ 0xFFFF)>>9;
1254
                        filter[i+1] = (((xpos+xInc  ) & 0xFFFF) ^ 0xFFFF)>>9;
1255
                        filter[i+2] = (((xpos+xInc*2) & 0xFFFF) ^ 0xFFFF)>>9;
1256
                        filter[i+3] = (((xpos+xInc*3) & 0xFFFF) ^ 0xFFFF)>>9;
1257
                        filterPos[i/2]= xx;
1258

    
1259
                        // d<3: offsets d and d+1 both fit into a 2-bit field, so the
                        // single-load fragment B is used; otherwise fragment A
                        if(d+1<4)
1260
                        {
1261
                                int maxShift= 3-(d+1);
1262
                                int shift=0;
1263

    
1264
                                memcpy(funnyCode + fragmentPos, fragmentB, fragmentLengthB);
1265

    
1266
                                // patch the two recorded bytes of the copy with the
                                // packed 2-bit offsets
                                funnyCode[fragmentPos + imm8OfPShufW1B]=
1267
                                        (a+1) | ((b+1)<<2) | ((c+1)<<4) | ((d+1)<<6);
1268
                                funnyCode[fragmentPos + imm8OfPShufW2B]=
1269
                                        a | (b<<2) | (c<<4) | (d<<6);
1270

    
1271
                                if(i+3>=dstW) shift=maxShift; //avoid overread
1272
                                else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //Align
1273

    
1274
                                // adding 0x55*shift raises each 2-bit field by shift,
                                // compensating for the lowered filterPos
                                if(shift && i>=shift)
1275
                                {
1276
                                        funnyCode[fragmentPos + imm8OfPShufW1B]+= 0x55*shift;
1277
                                        funnyCode[fragmentPos + imm8OfPShufW2B]+= 0x55*shift;
1278
                                        filterPos[i/2]-=shift;
1279
                                }
1280

    
1281
                                fragmentPos+= fragmentLengthB;
1282
                        }
1283
                        else
1284
                        {
1285
                                int maxShift= 3-d;
1286
                                int shift=0;
1287

    
1288
                                memcpy(funnyCode + fragmentPos, fragmentA, fragmentLengthA);
1289

    
1290
                                // both recorded bytes get the same packed offsets here
                                funnyCode[fragmentPos + imm8OfPShufW1A]=
1291
                                funnyCode[fragmentPos + imm8OfPShufW2A]=
1292
                                        a | (b<<2) | (c<<4) | (d<<6);
1293

    
1294
                                if(i+4>=dstW) shift=maxShift; //avoid overread
1295
                                else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //partial align
1296

    
1297
                                if(shift && i>=shift)
1298
                                {
1299
                                        funnyCode[fragmentPos + imm8OfPShufW1A]+= 0x55*shift;
1300
                                        funnyCode[fragmentPos + imm8OfPShufW2A]+= 0x55*shift;
1301
                                        filterPos[i/2]-=shift;
1302
                                }
1303

    
1304
                                fragmentPos+= fragmentLengthA;
1305
                        }
1306

    
1307
                        // terminate the generated code; fragmentPos is not advanced,
                        // so the next fragment (if any) overwrites this byte
                        funnyCode[fragmentPos]= RET;
1308
                }
1309
                xpos+=xInc;
1310
        }
1311
        filterPos[i/2]= xpos>>16; // needed to jump to the next part
1312
}
1314
#endif // ARCH_X86
1315

    
1316
//FIXME remove
1317
/* Intentionally empty; kept only so that existing callers still link (see the FIXME above). */
void SwScale_Init(void){
}
1319

    
1320
static void globalInit(){
1321
    // generating tables:
1322
    int i;
1323
    for(i=0; i<768; i++){
1324
        int c= MIN(MAX(i-256, 0), 255);
1325
        clip_table[i]=c;
1326
        yuvtab_2568[c]= clip_yuvtab_2568[i]=(0x2568*(c-16))+(256<<13);
1327
        yuvtab_3343[c]= clip_yuvtab_3343[i]=0x3343*(c-128);
1328
        yuvtab_0c92[c]= clip_yuvtab_0c92[i]=-0x0c92*(c-128);
1329
        yuvtab_1a1e[c]= clip_yuvtab_1a1e[i]=-0x1a1e*(c-128);
1330
        yuvtab_40cf[c]= clip_yuvtab_40cf[i]=0x40cf*(c-128);
1331
    }
1332

    
1333
    for(i=0; i<768; i++)
1334
    {
1335
        int v= clip_table[i];
1336
        clip_table16b[i]=  v>>3;
1337
        clip_table16g[i]= (v<<3)&0x07E0;
1338
        clip_table16r[i]= (v<<8)&0xF800;
1339
        clip_table15b[i]=  v>>3;
1340
        clip_table15g[i]= (v<<2)&0x03E0;
1341
        clip_table15r[i]= (v<<7)&0x7C00;
1342
    }
1343

    
1344
cpuCaps= gCpuCaps;
1345

    
1346
#ifdef RUNTIME_CPUDETECT
1347
#ifdef CAN_COMPILE_X86_ASM
1348
        // ordered per speed fasterst first
1349
        if(gCpuCaps.hasMMX2)
1350
                swScale= swScale_MMX2;
1351
        else if(gCpuCaps.has3DNow)
1352
                swScale= swScale_3DNow;
1353
        else if(gCpuCaps.hasMMX)
1354
                swScale= swScale_MMX;
1355
        else
1356
                swScale= swScale_C;
1357

    
1358
#else
1359
        swScale= swScale_C;
1360
        cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1361
#endif
1362
#else //RUNTIME_CPUDETECT
1363
#ifdef HAVE_MMX2
1364
        swScale= swScale_MMX2;
1365
        cpuCaps.has3DNow = 0;
1366
#elif defined (HAVE_3DNOW)
1367
        swScale= swScale_3DNow;
1368
        cpuCaps.hasMMX2 = 0;
1369
#elif defined (HAVE_MMX)
1370
        swScale= swScale_MMX;
1371
        cpuCaps.hasMMX2 = cpuCaps.has3DNow = 0;
1372
#else
1373
        swScale= swScale_C;
1374
        cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1375
#endif
1376
#endif //!RUNTIME_CPUDETECT
1377
}
1378

    
1379
/**
 * Unscaled planar YUV 4:2:0 (YV12 / I420 / IYUV) -> NV12 conversion of one
 * slice: the Y plane is copied, the two chroma planes are interleaved into
 * destination plane 1.
 *
 * srcSliceY / srcSliceH are given in luma lines. The chroma planes of the
 * planar 4:2:0 source have half the luma width and height, so the chroma
 * offset, width and height are halved (rounding up).
 */
static void PlanarToNV12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
	     int srcSliceH, uint8_t* dstParam[], int dstStride[]){
	uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
	const int chrW= (c->srcW + 1)>>1;
	const int chrSliceH= (srcSliceH + 1)>>1;

	/* Copy Y plane */
	if(dstStride[0]==srcStride[0])
		memcpy(dst, src[0], srcSliceH*dstStride[0]);
	else
	{
		int i;
		uint8_t *srcPtr= src[0];
		uint8_t *dstPtr= dst;
		for(i=0; i<srcSliceH; i++)
		{
			// copy only the visible pixels: using the source stride as the
			// length would overrun the destination row whenever
			// srcStride[0] > dstStride[0]
			memcpy(dstPtr, srcPtr, c->srcW);
			srcPtr+= srcStride[0];
			dstPtr+= dstStride[0];
		}
	}

	/* Interleave the chroma planes; one chroma line per two luma lines */
	dst = dstParam[1] + dstStride[1]*(srcSliceY>>1);
	if(c->srcFormat==IMGFMT_YV12)
		interleaveBytes( src[1],src[2],dst,chrW,chrSliceH,srcStride[1],srcStride[2],dstStride[1] );
	else /* I420 & IYUV */
		interleaveBytes( src[2],src[1],dst,chrW,chrSliceH,srcStride[2],srcStride[1],dstStride[1] );
}
1403

    
1404

    
1405
/* Wrapper functions for yuv2bgr */
1406
/**
 * Unscaled planar YUV -> BGR conversion of one slice through yuv2rgb().
 * The output starts at line srcSliceY of destination plane 0. For any source
 * format other than YV12 the two chroma pointers are passed in swapped order.
 */
static void planarYuvToBgr(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
	     int srcSliceH, uint8_t* dstParam[], int dstStride[]){
	uint8_t *out= dstParam[0] + dstStride[0]*srcSliceY;
	const int isYV12= (c->srcFormat==IMGFMT_YV12);
	uint8_t *firstChroma=  isYV12 ? src[1] : src[2];
	uint8_t *secondChroma= isYV12 ? src[2] : src[1]; /* swapped for I420 & IYUV */

	yuv2rgb( out,src[0],firstChroma,secondChroma,c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
}
1415

    
1416
/**
 * Unscaled planar YUV -> YUY2 conversion of one slice through yv12toyuy2().
 * The output starts at line srcSliceY of destination plane 0. For any source
 * format other than YV12 the two chroma pointers are passed in swapped order.
 */
static void PlanarToYuy2Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
	     int srcSliceH, uint8_t* dstParam[], int dstStride[]){
	uint8_t *out= dstParam[0] + dstStride[0]*srcSliceY;
	const int isYV12= (c->srcFormat==IMGFMT_YV12);
	uint8_t *firstChroma=  isYV12 ? src[1] : src[2];
	uint8_t *secondChroma= isYV12 ? src[2] : src[1]; /* swapped for I420 & IYUV */

	yv12toyuy2( src[0],firstChroma,secondChroma,out,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0] );
}
1425

    
1426
/**
 * Unscaled 24 bit -> 32 bit conversion of one slice through rgb24to32().
 * When the strides are in exact 3:4 ratio the whole slice is converted with
 * a single call, otherwise it is converted line by line.
 */
static void bgr24to32Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
	     int srcSliceH, uint8_t* dst[], int dstStride[]){
	uint8_t *in=  src[0];
	uint8_t *out= dst[0] + dstStride[0]*srcSliceY;
	int linesLeft;

	if(dstStride[0]*3 == srcStride[0]*4)
	{
		/* no padding difference between the lines: convert in one go */
		rgb24to32(in, out, srcSliceH*srcStride[0]);
		return;
	}

	/* c->srcW*3 source bytes per line */
	for(linesLeft=srcSliceH; linesLeft>0; linesLeft--)
	{
		rgb24to32(in, out, c->srcW*3);
		in += srcStride[0];
		out+= dstStride[0];
	}
}
1445

    
1446
/*
 * Unscaled packed 24bpp -> 16bpp conversion of one slice (adaptor around rgb24to16()).
 *
 * src[0] is read from the first line of the slice; output goes to dst[0]
 * starting at destination line srcSliceY. srcSliceH lines are converted,
 * c->srcW*3 source bytes per line in the line-by-line path.
 */
static void bgr24to16Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
        uint8_t *srcPtr= src[0];
        uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;

        /* Whole-slice shortcut: usable only if both strides cover the same number
           of pixels per line AND the source stride is positive. With a negative
           stride srcSliceH*srcStride[0] would be a negative byte count, so such
           images must go through the per-line loop. */
        if(dstStride[0]*3==srcStride[0]*2 && srcStride[0]>0)
                rgb24to16(srcPtr, dstPtr, srcSliceH*srcStride[0]);
        else
        {
                int i;

                for(i=0; i<srcSliceH; i++)
                {
                        rgb24to16(srcPtr, dstPtr, c->srcW*3);
                        srcPtr+= srcStride[0];
                        dstPtr+= dstStride[0];
                }
        }
}
1465

    
1466
/*
 * Unscaled packed 24bpp -> 15bpp conversion of one slice (adaptor around rgb24to15()).
 *
 * src[0] is read from the first line of the slice; output goes to dst[0]
 * starting at destination line srcSliceY. srcSliceH lines are converted,
 * c->srcW*3 source bytes per line in the line-by-line path.
 */
static void bgr24to15Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
        uint8_t *srcPtr= src[0];
        uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;

        /* Whole-slice shortcut: usable only if both strides cover the same number
           of pixels per line AND the source stride is positive. With a negative
           stride srcSliceH*srcStride[0] would be a negative byte count, so such
           images must go through the per-line loop. */
        if(dstStride[0]*3==srcStride[0]*2 && srcStride[0]>0)
                rgb24to15(srcPtr, dstPtr, srcSliceH*srcStride[0]);
        else
        {
                int i;

                for(i=0; i<srcSliceH; i++)
                {
                        rgb24to15(srcPtr, dstPtr, c->srcW*3);
                        srcPtr+= srcStride[0];
                        dstPtr+= dstStride[0];
                }
        }
}
1485

    
1486
/*
 * Unscaled packed 32bpp -> 24bpp conversion of one slice (adaptor around rgb32to24()).
 *
 * src[0] is read from the first line of the slice; output goes to dst[0]
 * starting at destination line srcSliceY. srcSliceH lines are converted,
 * c->srcW<<2 source bytes per line in the line-by-line path.
 */
static void bgr32to24Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
        uint8_t *srcPtr= src[0];
        uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;

        /* Whole-slice shortcut: usable only if both strides cover the same number
           of pixels per line AND the source stride is positive. With a negative
           stride srcSliceH*srcStride[0] would be a negative byte count, so such
           images must go through the per-line loop. */
        if(dstStride[0]*4==srcStride[0]*3 && srcStride[0]>0)
                rgb32to24(srcPtr, dstPtr, srcSliceH*srcStride[0]);
        else
        {
                int i;

                for(i=0; i<srcSliceH; i++)
                {
                        rgb32to24(srcPtr, dstPtr, c->srcW<<2);
                        srcPtr+= srcStride[0];
                        dstPtr+= dstStride[0];
                }
        }
}
1505

    
1506
/*
 * Unscaled packed 32bpp -> 16bpp conversion of one slice (adaptor around rgb32to16()).
 *
 * src[0] is read from the first line of the slice; output goes to dst[0]
 * starting at destination line srcSliceY. srcSliceH lines are converted,
 * c->srcW<<2 source bytes per line in the line-by-line path.
 */
static void bgr32to16Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
        uint8_t *srcPtr= src[0];
        uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;

        /* Whole-slice shortcut: usable only if both strides cover the same number
           of pixels per line AND the source stride is positive. With a negative
           stride srcSliceH*srcStride[0] would be a negative byte count, so such
           images must go through the per-line loop. */
        if(dstStride[0]*4==srcStride[0]*2 && srcStride[0]>0)
                rgb32to16(srcPtr, dstPtr, srcSliceH*srcStride[0]);
        else
        {
                int i;

                for(i=0; i<srcSliceH; i++)
                {
                        rgb32to16(srcPtr, dstPtr, c->srcW<<2);
                        srcPtr+= srcStride[0];
                        dstPtr+= dstStride[0];
                }
        }
}
1525

    
1526
/*
 * Unscaled packed 32bpp -> 15bpp conversion of one slice (adaptor around rgb32to15()).
 *
 * src[0] is read from the first line of the slice; output goes to dst[0]
 * starting at destination line srcSliceY. srcSliceH lines are converted,
 * c->srcW<<2 source bytes per line in the line-by-line path.
 */
static void bgr32to15Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
        uint8_t *srcPtr= src[0];
        uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;

        /* Whole-slice shortcut: usable only if both strides cover the same number
           of pixels per line AND the source stride is positive. With a negative
           stride srcSliceH*srcStride[0] would be a negative byte count, so such
           images must go through the per-line loop. */
        if(dstStride[0]*4==srcStride[0]*2 && srcStride[0]>0)
                rgb32to15(srcPtr, dstPtr, srcSliceH*srcStride[0]);
        else
        {
                int i;

                for(i=0; i<srcSliceH; i++)
                {
                        rgb32to15(srcPtr, dstPtr, c->srcW<<2);
                        srcPtr+= srcStride[0];
                        dstPtr+= dstStride[0];
                }
        }
}
1545

    
1546
/*
 * Unscaled packed 15bpp -> 16bpp conversion of one slice (adaptor around rgb15to16()).
 *
 * src[0] is read from the first line of the slice; output goes to dst[0]
 * starting at destination line srcSliceY. srcSliceH lines are converted,
 * c->srcW<<1 source bytes per line in the line-by-line path.
 */
static void bgr15to16Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
        uint8_t *srcPtr= src[0];
        uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;

        /* Whole-slice shortcut: usable only if both strides are equal AND positive.
           With a negative stride srcSliceH*srcStride[0] would be a negative byte
           count, so such images must go through the per-line loop. */
        if(dstStride[0]==srcStride[0] && srcStride[0]>0)
                rgb15to16(srcPtr, dstPtr, srcSliceH*srcStride[0]);
        else
        {
                int i;

                for(i=0; i<srcSliceH; i++)
                {
                        rgb15to16(srcPtr, dstPtr, c->srcW<<1);
                        srcPtr+= srcStride[0];
                        dstPtr+= dstStride[0];
                }
        }
}
1565

    
1566
/*
 * Unscaled packed 15bpp -> 24bpp conversion of one slice (adaptor around rgb15to24()).
 *
 * src[0] is read from the first line of the slice; output goes to dst[0]
 * starting at destination line srcSliceY. srcSliceH lines are converted,
 * c->srcW<<1 source bytes per line in the line-by-line path.
 */
static void bgr15to24Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
        uint8_t *srcPtr= src[0];
        uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;

        /* Whole-slice shortcut: usable only if both strides cover the same number
           of pixels per line AND the source stride is positive. With a negative
           stride srcSliceH*srcStride[0] would be a negative byte count, so such
           images must go through the per-line loop. */
        if(dstStride[0]*2==srcStride[0]*3 && srcStride[0]>0)
                rgb15to24(srcPtr, dstPtr, srcSliceH*srcStride[0]);
        else
        {
                int i;

                for(i=0; i<srcSliceH; i++)
                {
                        rgb15to24(srcPtr, dstPtr, c->srcW<<1);
                        srcPtr+= srcStride[0];
                        dstPtr+= dstStride[0];
                }
        }
}
1585

    
1586
/*
 * Unscaled packed 15bpp -> 32bpp conversion of one slice (adaptor around rgb15to32()).
 *
 * src[0] is read from the first line of the slice; output goes to dst[0]
 * starting at destination line srcSliceY. srcSliceH lines are converted,
 * c->srcW<<1 source bytes per line in the line-by-line path.
 */
static void bgr15to32Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
        uint8_t *srcPtr= src[0];
        uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;

        /* Whole-slice shortcut: usable only if both strides cover the same number
           of pixels per line AND the source stride is positive. With a negative
           stride srcSliceH*srcStride[0] would be a negative byte count, so such
           images must go through the per-line loop. */
        if(dstStride[0]*2==srcStride[0]*4 && srcStride[0]>0)
                rgb15to32(srcPtr, dstPtr, srcSliceH*srcStride[0]);
        else
        {
                int i;

                for(i=0; i<srcSliceH; i++)
                {
                        rgb15to32(srcPtr, dstPtr, c->srcW<<1);
                        srcPtr+= srcStride[0];
                        dstPtr+= dstStride[0];
                }
        }
}
1605

    
1606
/*
 * Unscaled packed 16bpp -> 24bpp conversion of one slice (adaptor around rgb16to24()).
 *
 * src[0] is read from the first line of the slice; output goes to dst[0]
 * starting at destination line srcSliceY. srcSliceH lines are converted,
 * c->srcW<<1 source bytes per line in the line-by-line path.
 */
static void bgr16to24Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
        uint8_t *srcPtr= src[0];
        uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;

        /* Whole-slice shortcut: usable only if both strides cover the same number
           of pixels per line AND the source stride is positive. With a negative
           stride srcSliceH*srcStride[0] would be a negative byte count, so such
           images must go through the per-line loop. */
        if(dstStride[0]*2==srcStride[0]*3 && srcStride[0]>0)
                rgb16to24(srcPtr, dstPtr, srcSliceH*srcStride[0]);
        else
        {
                int i;

                for(i=0; i<srcSliceH; i++)
                {
                        rgb16to24(srcPtr, dstPtr, c->srcW<<1);
                        srcPtr+= srcStride[0];
                        dstPtr+= dstStride[0];
                }
        }
}
1625

    
1626
/*
 * Unscaled packed 16bpp -> 32bpp conversion of one slice (adaptor around rgb16to32()).
 *
 * src[0] is read from the first line of the slice; output goes to dst[0]
 * starting at destination line srcSliceY. srcSliceH lines are converted,
 * c->srcW<<1 source bytes per line in the line-by-line path.
 */
static void bgr16to32Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
        uint8_t *srcPtr= src[0];
        uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;

        /* Whole-slice shortcut: usable only if both strides cover the same number
           of pixels per line AND the source stride is positive. With a negative
           stride srcSliceH*srcStride[0] would be a negative byte count, so such
           images must go through the per-line loop. */
        if(dstStride[0]*2==srcStride[0]*4 && srcStride[0]>0)
                rgb16to32(srcPtr, dstPtr, srcSliceH*srcStride[0]);
        else
        {
                int i;

                for(i=0; i<srcSliceH; i++)
                {
                        rgb16to32(srcPtr, dstPtr, c->srcW<<1);
                        srcPtr+= srcStride[0];
                        dstPtr+= dstStride[0];
                }
        }
}
1645

    
1646
/*
 * Unscaled packed 24bpp -> planar YV12 slice conversion; adaptor around rgb24toyv12().
 *
 * Plane 0 of the destination is offset by srcSliceY lines, planes 1 and 2 by
 * srcSliceY>>1 lines (each using its own stride for the offset). The call
 * itself receives dstStride[0] and dstStride[1] only, plus srcStride[0].
 */
static void bgr24toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
        const int chromaLine= srcSliceY>>1;
        uint8_t *plane0Out= dst[0] + srcSliceY *dstStride[0];
        uint8_t *plane1Out= dst[1] + chromaLine*dstStride[1];
        uint8_t *plane2Out= dst[2] + chromaLine*dstStride[2];

        rgb24toyv12(src[0], plane0Out, plane1Out, plane2Out,
                    c->srcW, srcSliceH,
                    dstStride[0], dstStride[1], srcStride[0]);
}
1657

    
1658
/**
1659
 * bring pointers in YUV order instead of YVU
1660
 */
1661
static inline void orderYUV(int format, uint8_t * sortedP[], int sortedStride[], uint8_t * p[], int stride[]){
1662
        if(format == IMGFMT_YV12){
1663
                sortedP[0]= p[0];
1664
                sortedP[1]= p[1];
1665
                sortedP[2]= p[2];
1666
                sortedStride[0]= stride[0];
1667
                sortedStride[1]= stride[1];
1668
                sortedStride[2]= stride[2];
1669
        }
1670
        else if(isPacked(format) || isGray(format))
1671
        {
1672
                sortedP[0]= p[0];
1673
                sortedP[1]= 
1674
                sortedP[2]= NULL;
1675
                sortedStride[0]= stride[0];
1676
                sortedStride[1]= 
1677
                sortedStride[2]= 0;
1678
        }
1679
        else /* I420 */
1680
        {
1681
                sortedP[0]= p[0];
1682
                sortedP[1]= p[2];
1683
                sortedP[2]= p[1];
1684
                sortedStride[0]= stride[0];
1685
                sortedStride[1]= stride[2];
1686
                sortedStride[2]= stride[1];
1687
        }
1688
}
1689

    
1690
/* unscaled copy like stuff (assumes nearly identical formats) */
1691
static void simpleCopy(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
1692
             int srcSliceH, uint8_t* dstParam[], int dstStrideParam[]){
1693

    
1694
        int srcStride[3];
1695
        int dstStride[3];
1696
        uint8_t *src[3];
1697
        uint8_t *dst[3];
1698

    
1699
        orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
1700
        orderYUV(c->dstFormat, dst, dstStride, dstParam, dstStrideParam);
1701

    
1702
        if(isPacked(c->srcFormat))
1703
        {
1704
                if(dstStride[0]==srcStride[0])
1705
                        memcpy(dst[0] + dstStride[0]*srcSliceY, src[0], srcSliceH*dstStride[0]);
1706
                else
1707
                {
1708
                        int i;
1709
                        uint8_t *srcPtr= src[0];
1710
                        uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1711
                        int length=0;
1712

    
1713
                        /* universal length finder */
1714
                        while(length+c->srcW <= ABS(dstStride[0]) 
1715
                           && length+c->srcW <= ABS(srcStride[0])) length+= c->srcW;
1716
                        ASSERT(length!=0);
1717

    
1718
                        for(i=0; i<srcSliceH; i++)
1719
                        {
1720
                                memcpy(dstPtr, srcPtr, length);
1721
                                srcPtr+= srcStride[0];
1722
                                dstPtr+= dstStride[0];
1723
                        }
1724
                }
1725
        }
1726
        else 
1727
        { /* Planar YUV or gray */
1728
                int plane;
1729
                for(plane=0; plane<3; plane++)
1730
                {
1731
                        int length= plane==0 ? c->srcW  : ((c->srcW+1)>>1);
1732
                        int y=      plane==0 ? srcSliceY: ((srcSliceY+1)>>1);
1733
                        int height= plane==0 ? srcSliceH: ((srcSliceH+1)>>1);
1734

    
1735
                        if((isGray(c->srcFormat) || isGray(c->dstFormat)) && plane>0)
1736
                        {
1737
                                if(!isGray(c->dstFormat))
1738
                                        memset(dst[plane], 0, dstStride[plane]*height);
1739
                        }
1740
                        else
1741
                        {
1742
                                if(dstStride[plane]==srcStride[plane])
1743
                                        memcpy(dst[plane] + dstStride[plane]*y, src[plane], height*dstStride[plane]);
1744
                                else
1745
                                {
1746
                                        int i;
1747
                                        uint8_t *srcPtr= src[plane];
1748
                                        uint8_t *dstPtr= dst[plane] + dstStride[plane]*y;
1749
                                        for(i=0; i<height; i++)
1750
                                        {
1751
                                                memcpy(dstPtr, srcPtr, length);
1752
                                                srcPtr+= srcStride[plane];
1753
                                                dstPtr+= dstStride[plane];
1754
                                        }
1755
                                }
1756
                        }
1757
                }
1758
        }
1759
}
1760

    
1761
static int remove_dup_fourcc(int fourcc)
1762
{
1763
        switch(fourcc)
1764
        {
1765
            case IMGFMT_IYUV: return IMGFMT_I420;
1766
            case IMGFMT_Y8  : return IMGFMT_Y800;
1767
            default: return fourcc;
1768
        }
1769
}
1770

    
1771
static void getSubSampleFactors(int *h, int *v, int format){
1772
        switch(format){
1773
        case IMGFMT_YUY2:
1774
                *h=1;
1775
                *v=0;
1776
                break;
1777
        case IMGFMT_YV12:
1778
        case IMGFMT_I420:
1779
                *h=1;
1780
                *v=1;
1781
                break;
1782
        case IMGFMT_YVU9:
1783
                *h=2;
1784
                *v=2;
1785
                break;
1786
        default:
1787
                *h=0;
1788
                *v=0;
1789
                break;
1790
        }
1791
}
1792

    
1793
SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
1794
                         SwsFilter *srcFilter, SwsFilter *dstFilter){
1795

    
1796
        SwsContext *c;
1797
        int i;
1798
        int usesFilter;
1799
        int unscaled;
1800
        SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
1801
#ifdef ARCH_X86
1802
        if(gCpuCaps.hasMMX)
1803
                asm volatile("emms\n\t"::: "memory");
1804
#endif
1805
        if(swScale==NULL) globalInit();
1806

    
1807
        /* avoid dupplicate Formats, so we dont need to check to much */
1808
        srcFormat = remove_dup_fourcc(srcFormat);
1809
        dstFormat = remove_dup_fourcc(dstFormat);
1810

    
1811
        unscaled = (srcW == dstW && srcH == dstH);
1812

    
1813
        if(!isSupportedIn(srcFormat)) 
1814
        {
1815
                MSG_ERR("swScaler: %s is not supported as input format\n", vo_format_name(srcFormat));
1816
                return NULL;
1817
        }
1818
        if(!isSupportedOut(dstFormat))
1819
        {
1820
                MSG_ERR("swScaler: %s is not supported as output format\n", vo_format_name(dstFormat));
1821
                return NULL;
1822
        }
1823

    
1824
        /* sanity check */
1825
        if(srcW<4 || srcH<1 || dstW<8 || dstH<1) //FIXME check if these are enough and try to lowwer them after fixing the relevant parts of the code
1826
        {
1827
                 MSG_ERR("swScaler: %dx%d -> %dx%d is invalid scaling dimension\n", 
1828
                        srcW, srcH, dstW, dstH);
1829
                return NULL;
1830
        }
1831

    
1832
        if(!dstFilter) dstFilter= &dummyFilter;
1833
        if(!srcFilter) srcFilter= &dummyFilter;
1834

    
1835
        c= memalign(64, sizeof(SwsContext));
1836
        memset(c, 0, sizeof(SwsContext));
1837

    
1838
        c->srcW= srcW;
1839
        c->srcH= srcH;
1840
        c->dstW= dstW;
1841
        c->dstH= dstH;
1842
        c->lumXInc= ((srcW<<16) + (dstW>>1))/dstW;
1843
        c->lumYInc= ((srcH<<16) + (dstH>>1))/dstH;
1844
        c->flags= flags;
1845
        c->dstFormat= dstFormat;
1846
        c->srcFormat= srcFormat;
1847

    
1848
        usesFilter=0;
1849
        if(dstFilter->lumV!=NULL && dstFilter->lumV->length>1) usesFilter=1;
1850
        if(dstFilter->lumH!=NULL && dstFilter->lumH->length>1) usesFilter=1;
1851
        if(dstFilter->chrV!=NULL && dstFilter->chrV->length>1) usesFilter=1;
1852
        if(dstFilter->chrH!=NULL && dstFilter->chrH->length>1) usesFilter=1;
1853
        if(srcFilter->lumV!=NULL && srcFilter->lumV->length>1) usesFilter=1;
1854
        if(srcFilter->lumH!=NULL && srcFilter->lumH->length>1) usesFilter=1;
1855
        if(srcFilter->chrV!=NULL && srcFilter->chrV->length>1) usesFilter=1;
1856
        if(srcFilter->chrH!=NULL && srcFilter->chrH->length>1) usesFilter=1;
1857
        
1858
        /* unscaled special Cases */
1859
        if(unscaled && !usesFilter)
1860
        {
1861
                /* yv12_to_nv12 */
1862
                if((srcFormat == IMGFMT_YV12||srcFormat==IMGFMT_I420)&&dstFormat == IMGFMT_NV12)
1863
                {
1864
                        c->swScale= PlanarToNV12Wrapper;
1865

    
1866
                        if(flags&SWS_PRINT_INFO)
1867
                                MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
1868
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1869
                        return c;
1870
                }
1871
                /* yv12_to_yuy2 */
1872
                if((srcFormat == IMGFMT_YV12||srcFormat==IMGFMT_I420)&&dstFormat == IMGFMT_YUY2)
1873
                {
1874
                        c->swScale= PlanarToYuy2Wrapper;
1875

    
1876
                        if(flags&SWS_PRINT_INFO)
1877
                                MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
1878
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1879
                        return c;
1880
                }
1881
                /* yuv2bgr */
1882
                if(isPlanarYUV(srcFormat) && isBGR(dstFormat))
1883
                {
1884
                        // FIXME multiple yuv2rgb converters wont work that way cuz that thing is full of globals&statics
1885
#ifdef WORDS_BIGENDIAN
1886
                        if(dstFormat==IMGFMT_BGR32)
1887
                                yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_BGR);
1888
                        else
1889
                                yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
1890
#else
1891
                        yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
1892
#endif
1893
                        c->swScale= planarYuvToBgr;
1894

    
1895
                        if(flags&SWS_PRINT_INFO)
1896
                                MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
1897
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1898
                        return c;
1899
                }
1900

    
1901
                /* simple copy */
1902
                if(srcFormat == dstFormat 
1903
                   || ((isPlanarYUV(srcFormat)||isGray(srcFormat)) && (isPlanarYUV(dstFormat)||isGray(dstFormat))))
1904
                {
1905
                        c->swScale= simpleCopy;
1906

    
1907
                        if(flags&SWS_PRINT_INFO)
1908
                                MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
1909
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1910
                        return c;
1911
                }
1912

    
1913
                /* bgr32to24 & rgb32to24*/
1914
                if((srcFormat==IMGFMT_BGR32 && dstFormat==IMGFMT_BGR24)
1915
                 ||(srcFormat==IMGFMT_RGB32 && dstFormat==IMGFMT_RGB24))
1916
                {
1917
                        c->swScale= bgr32to24Wrapper;
1918

    
1919
                        if(flags&SWS_PRINT_INFO)
1920
                                MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
1921
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1922
                        return c;
1923
                }
1924

    
1925
                /* bgr32to16 & rgb32to16*/
1926
                if((srcFormat==IMGFMT_BGR32 && dstFormat==IMGFMT_BGR16)
1927
                 ||(srcFormat==IMGFMT_RGB32 && dstFormat==IMGFMT_RGB16))
1928
                {
1929
                        c->swScale= bgr32to16Wrapper;
1930

    
1931
                        if(flags&SWS_PRINT_INFO)
1932
                                MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
1933
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1934
                        return c;
1935
                }
1936

    
1937
                /* bgr32to15 & rgb32to15*/
1938
                if((srcFormat==IMGFMT_BGR32 && dstFormat==IMGFMT_BGR15)
1939
                 ||(srcFormat==IMGFMT_RGB32 && dstFormat==IMGFMT_RGB15))
1940
                {
1941
                        c->swScale= bgr32to15Wrapper;
1942

    
1943
                        if(flags&SWS_PRINT_INFO)
1944
                                MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
1945
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1946
                        return c;
1947
                }
1948
                
1949
                /* bgr24to32 & rgb24to32*/
1950
                if((srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_BGR32)
1951
                 ||(srcFormat==IMGFMT_RGB24 && dstFormat==IMGFMT_RGB32))
1952
                {
1953
                        c->swScale= bgr24to32Wrapper;
1954

    
1955
                        if(flags&SWS_PRINT_INFO)
1956
                                MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
1957
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1958
                        return c;
1959
                }
1960

    
1961
                /* bgr24to16 & rgb24to16*/
1962
                if((srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_BGR16)
1963
                 ||(srcFormat==IMGFMT_RGB24 && dstFormat==IMGFMT_RGB16))
1964
                {
1965
                        c->swScale= bgr24to16Wrapper;
1966

    
1967
                        if(flags&SWS_PRINT_INFO)
1968
                                MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
1969
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1970
                        return c;
1971
                }
1972

    
1973
                /* bgr24to15 & rgb24to15*/
1974
                if((srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_BGR15)
1975
                 ||(srcFormat==IMGFMT_RGB24 && dstFormat==IMGFMT_RGB15))
1976
                {
1977
                        c->swScale= bgr24to15Wrapper;
1978

    
1979
                        if(flags&SWS_PRINT_INFO)
1980
                                MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
1981
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1982
                        return c;
1983
                }
1984

    
1985
                /* bgr15to16 */
1986
                if(srcFormat==IMGFMT_BGR15 && dstFormat==IMGFMT_BGR16)
1987
                {
1988
                        c->swScale= bgr15to16Wrapper;
1989

    
1990
                        if(flags&SWS_PRINT_INFO)
1991
                                MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
1992
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
1993
                        return c;
1994
                }
1995

    
1996
                /* bgr15to24 */
1997
                if((srcFormat==IMGFMT_BGR15 && dstFormat==IMGFMT_BGR24)
1998
                 ||(srcFormat==IMGFMT_RGB15 && dstFormat==IMGFMT_RGB24))
1999
                {
2000
                        c->swScale= bgr15to24Wrapper;
2001

    
2002
                        if(flags&SWS_PRINT_INFO)
2003
                                MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
2004
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
2005
                        return c;
2006
                }
2007

    
2008
                /* bgr15to32 */
2009
                if((srcFormat==IMGFMT_BGR15 && dstFormat==IMGFMT_BGR32)
2010
                 ||(srcFormat==IMGFMT_RGB15 && dstFormat==IMGFMT_RGB32))
2011
                {
2012
                        c->swScale= bgr15to32Wrapper;
2013

    
2014
                        if(flags&SWS_PRINT_INFO)
2015
                                MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
2016
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
2017
                        return c;
2018
                }
2019

    
2020
                /* bgr16to24 */
2021
                if((srcFormat==IMGFMT_BGR16 && dstFormat==IMGFMT_BGR24)
2022
                 ||(srcFormat==IMGFMT_RGB16 && dstFormat==IMGFMT_RGB24))
2023
                {
2024
                        c->swScale= bgr16to24Wrapper;
2025

    
2026
                        if(flags&SWS_PRINT_INFO)
2027
                                MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
2028
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
2029
                        return c;
2030
                }
2031

    
2032
                /* bgr16to32 */
2033
                if((srcFormat==IMGFMT_BGR16 && dstFormat==IMGFMT_BGR32)
2034
                 ||(srcFormat==IMGFMT_RGB16 && dstFormat==IMGFMT_RGB32))
2035
                {
2036
                        c->swScale= bgr16to32Wrapper;
2037

    
2038
                        if(flags&SWS_PRINT_INFO)
2039
                                MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
2040
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
2041
                        return c;
2042
                }
2043

    
2044
                /* bgr24toYV12 */
2045
                if(srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_YV12)
2046
                {
2047
                        c->swScale= bgr24toyv12Wrapper;
2048

    
2049
                        if(flags&SWS_PRINT_INFO)
2050
                                MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
2051
                                        vo_format_name(srcFormat), vo_format_name(dstFormat));
2052
                        return c;
2053
                }
2054
        }
2055

    
2056
        if(cpuCaps.hasMMX2)
2057
        {
2058
                c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
2059
                if(!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR))
2060
                {
2061
                        if(flags&SWS_PRINT_INFO)
2062
                                MSG_INFO("SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
2063
                }
2064
        }
2065
        else
2066
                c->canMMX2BeUsed=0;
2067

    
2068
        getSubSampleFactors(&c->chrSrcHSubSample, &c->chrSrcVSubSample, srcFormat);
2069
        getSubSampleFactors(&c->chrDstHSubSample, &c->chrDstVSubSample, dstFormat);
2070

    
2071
        // reuse chroma for 2 pixles rgb/bgr unless user wants full chroma interpolation
2072
        if((isBGR(dstFormat) || isRGB(dstFormat)) && !(flags&SWS_FULL_CHR_H_INT)) c->chrDstHSubSample=1;
2073

    
2074
        // drop eery 2. pixel for chroma calculation unless user wants full chroma
2075
        if((isBGR(srcFormat) || isRGB(srcFormat) || srcFormat==IMGFMT_YUY2) && !(flags&SWS_FULL_CHR_V)) 
2076
                c->chrSrcVSubSample=1;
2077

    
2078
        // drop eery 2. pixel for chroma calculation unless user wants full chroma
2079
        if((isBGR(srcFormat) || isRGB(srcFormat)) && !(flags&SWS_FULL_CHR_H_INP)) 
2080
                c->chrSrcHSubSample=1;
2081

    
2082
        c->chrIntHSubSample= c->chrDstHSubSample;
2083
        c->chrIntVSubSample= c->chrSrcVSubSample;
2084
        
2085
        // note the -((-x)>>y) is so that we allways round toward +inf
2086
        c->chrSrcW= -((-srcW) >> c->chrSrcHSubSample);
2087
        c->chrSrcH= -((-srcH) >> c->chrSrcVSubSample);
2088
        c->chrDstW= -((-dstW) >> c->chrDstHSubSample);
2089
        c->chrDstH= -((-dstH) >> c->chrDstVSubSample);
2090
/*        printf("%d %d %d %d / %d %d %d %d //\n", 
2091
        c->chrSrcW,
2092
c->chrSrcH,
2093
c->chrDstW,
2094
c->chrDstH,
2095
srcW,
2096
srcH,
2097
dstW,
2098
dstH);*/
2099
        c->chrXInc= ((c->chrSrcW<<16) + (c->chrDstW>>1))/c->chrDstW;
2100
        c->chrYInc= ((c->chrSrcH<<16) + (c->chrDstH>>1))/c->chrDstH;
2101

    
2102
        // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
2103
        // but only for the FAST_BILINEAR mode otherwise do correct scaling
2104
        // n-2 is the last chrominance sample available
2105
        // this is not perfect, but noone shuld notice the difference, the more correct variant
2106
        // would be like the vertical one, but that would require some special code for the
2107
        // first and last pixel
2108
        if(flags&SWS_FAST_BILINEAR)
2109
        {
2110
                if(c->canMMX2BeUsed)
2111
                {
2112
                        c->lumXInc+= 20;
2113
                        c->chrXInc+= 20;
2114
                }
2115
                //we dont use the x86asm scaler if mmx is available
2116
                else if(cpuCaps.hasMMX)
2117
                {
2118
                        c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
2119
                        c->chrXInc = ((c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20;
2120
                }
2121
        }
2122

    
2123
        /* precalculate horizontal scaler filter coefficients */
2124
        {
2125
                const int filterAlign= cpuCaps.hasMMX ? 4 : 1;
2126

    
2127
                initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
2128
                                 srcW      ,       dstW, filterAlign, 1<<14, flags,
2129
                                 srcFilter->lumH, dstFilter->lumH);
2130
                initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
2131
                                (srcW+1)>>1, c->chrDstW, filterAlign, 1<<14, flags,
2132
                                 srcFilter->chrH, dstFilter->chrH);
2133

    
2134
#ifdef ARCH_X86
2135
// cant downscale !!!
2136
                if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
2137
                {
2138
                        c->lumMmx2Filter   = (int16_t*)memalign(8, (dstW        /8+8)*sizeof(int16_t));
2139
                        c->chrMmx2Filter   = (int16_t*)memalign(8, (c->chrDstW  /4+8)*sizeof(int16_t));
2140
                        c->lumMmx2FilterPos= (int32_t*)memalign(8, (dstW      /2/8+8)*sizeof(int32_t));
2141
                        c->chrMmx2FilterPos= (int32_t*)memalign(8, (c->chrDstW/2/4+8)*sizeof(int32_t));
2142

    
2143
                        initMMX2HScaler(      dstW, c->lumXInc, c->funnyYCode , c->lumMmx2Filter, c->lumMmx2FilterPos, 8);
2144
                        initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode, c->chrMmx2Filter, c->chrMmx2FilterPos, 4);
2145
                }
2146
#endif
2147
        } // Init Horizontal stuff
2148

    
2149

    
2150

    
2151
        /* precalculate vertical scaler filter coefficients */
2152
        initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
2153
                        srcH      ,        dstH, 1, (1<<12)-4, flags,
2154
                        srcFilter->lumV, dstFilter->lumV);
2155
        initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
2156
                        (srcH+1)>>1, c->chrDstH, 1, (1<<12)-4, flags,
2157
                         srcFilter->chrV, dstFilter->chrV);
2158

    
2159
        // Calculate Buffer Sizes so that they wont run out while handling these damn slices
2160
        c->vLumBufSize= c->vLumFilterSize;
2161
        c->vChrBufSize= c->vChrFilterSize;
2162
        for(i=0; i<dstH; i++)
2163
        {
2164
                int chrI= i*c->chrDstH / dstH;
2165
                int nextSlice= MAX(c->vLumFilterPos[i   ] + c->vLumFilterSize - 1,
2166
                                 ((c->vChrFilterPos[chrI] + c->vChrFilterSize - 1)<<1));
2167
                nextSlice&= ~1; // Slices start at even boundaries
2168
                if(c->vLumFilterPos[i   ] + c->vLumBufSize < nextSlice)
2169
                        c->vLumBufSize= nextSlice - c->vLumFilterPos[i   ];
2170
                if(c->vChrFilterPos[chrI] + c->vChrBufSize < (nextSlice>>1))
2171
                        c->vChrBufSize= (nextSlice>>1) - c->vChrFilterPos[chrI];
2172
        }
2173

    
2174
        // allocate pixbufs (we use dynamic allocation because otherwise we would need to
2175
        c->lumPixBuf= (int16_t**)memalign(4, c->vLumBufSize*2*sizeof(int16_t*));
2176
        c->chrPixBuf= (int16_t**)memalign(4, c->vChrBufSize*2*sizeof(int16_t*));
2177
        //Note we need at least one pixel more at the end because of the mmx code (just in case someone wanna replace the 4000/8000)
2178
        for(i=0; i<c->vLumBufSize; i++)
2179
                c->lumPixBuf[i]= c->lumPixBuf[i+c->vLumBufSize]= (uint16_t*)memalign(8, 4000);
2180
        for(i=0; i<c->vChrBufSize; i++)
2181
                c->chrPixBuf[i]= c->chrPixBuf[i+c->vChrBufSize]= (uint16_t*)memalign(8, 8000);
2182

    
2183
        //try to avoid drawing green stuff between the right end and the stride end
2184
        for(i=0; i<c->vLumBufSize; i++) memset(c->lumPixBuf[i], 0, 4000);
2185
        for(i=0; i<c->vChrBufSize; i++) memset(c->chrPixBuf[i], 64, 8000);
2186

    
2187
        ASSERT(c->chrDstH <= dstH)
2188

    
2189
        // pack filter data for mmx code
2190
        if(cpuCaps.hasMMX)
2191
        {
2192
                c->lumMmxFilter= (int16_t*)memalign(8, c->vLumFilterSize*      dstH*4*sizeof(int16_t));
2193
                c->chrMmxFilter= (int16_t*)memalign(8, c->vChrFilterSize*c->chrDstH*4*sizeof(int16_t));
2194
                for(i=0; i<c->vLumFilterSize*dstH; i++)
2195
                        c->lumMmxFilter[4*i]=c->lumMmxFilter[4*i+1]=c->lumMmxFilter[4*i+2]=c->lumMmxFilter[4*i+3]=
2196
                                c->vLumFilter[i];
2197
                for(i=0; i<c->vChrFilterSize*c->chrDstH; i++)
2198
                        c->chrMmxFilter[4*i]=c->chrMmxFilter[4*i+1]=c->chrMmxFilter[4*i+2]=c->chrMmxFilter[4*i+3]=
2199
                                c->vChrFilter[i];
2200
        }
2201

    
2202
        if(flags&SWS_PRINT_INFO)
2203
        {
2204
#ifdef DITHER1XBPP
2205
                char *dither= " dithered";
2206
#else
2207
                char *dither= "";
2208
#endif
2209
                if(flags&SWS_FAST_BILINEAR)
2210
                        MSG_INFO("\nSwScaler: FAST_BILINEAR scaler, ");
2211
                else if(flags&SWS_BILINEAR)
2212
                        MSG_INFO("\nSwScaler: BILINEAR scaler, ");
2213
                else if(flags&SWS_BICUBIC)
2214
                        MSG_INFO("\nSwScaler: BICUBIC scaler, ");
2215
                else if(flags&SWS_X)
2216
                        MSG_INFO("\nSwScaler: Experimental scaler, ");
2217
                else if(flags&SWS_POINT)
2218
                        MSG_INFO("\nSwScaler: Nearest Neighbor / POINT scaler, ");
2219
                else if(flags&SWS_AREA)
2220
                        MSG_INFO("\nSwScaler: Area Averageing scaler, ");
2221
                else
2222
                        MSG_INFO("\nSwScaler: ehh flags invalid?! ");
2223

    
2224
                if(dstFormat==IMGFMT_BGR15 || dstFormat==IMGFMT_BGR16)
2225
                        MSG_INFO("from %s to%s %s ", 
2226
                                vo_format_name(srcFormat), dither, vo_format_name(dstFormat));
2227
                else
2228
                        MSG_INFO("from %s to %s ", 
2229
                                vo_format_name(srcFormat), vo_format_name(dstFormat));
2230

    
2231
                if(cpuCaps.hasMMX2)
2232
                        MSG_INFO("using MMX2\n");
2233
                else if(cpuCaps.has3DNow)
2234
                        MSG_INFO("using 3DNOW\n");
2235
                else if(cpuCaps.hasMMX)
2236
                        MSG_INFO("using MMX\n");
2237
                else
2238
                        MSG_INFO("using C\n");
2239
        }
2240

    
2241
        if((flags & SWS_PRINT_INFO) && verbose)
2242
        {
2243
                if(cpuCaps.hasMMX)
2244
                {
2245
                        if(c->canMMX2BeUsed && (flags&SWS_FAST_BILINEAR))
2246
                                MSG_V("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
2247
                        else
2248
                        {
2249
                                if(c->hLumFilterSize==4)
2250
                                        MSG_V("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
2251
                                else if(c->hLumFilterSize==8)
2252
                                        MSG_V("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
2253
                                else
2254
                                        MSG_V("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
2255

    
2256
                                if(c->hChrFilterSize==4)
2257
                                        MSG_V("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
2258
                                else if(c->hChrFilterSize==8)
2259
                                        MSG_V("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
2260
                                else
2261
                                        MSG_V("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
2262
                        }
2263
                }
2264
                else
2265
                {
2266
#ifdef ARCH_X86
2267
                        MSG_V("SwScaler: using X86-Asm scaler for horizontal scaling\n");
2268
#else
2269
                        if(flags & SWS_FAST_BILINEAR)
2270
                                MSG_V("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
2271
                        else
2272
                                MSG_V("SwScaler: using C scaler for horizontal scaling\n");
2273
#endif
2274
                }
2275
                if(isPlanarYUV(dstFormat))
2276
                {
2277
                        if(c->vLumFilterSize==1)
2278
                                MSG_V("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
2279
                        else
2280
                                MSG_V("SwScaler: using n-tap %s scaler for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
2281
                }
2282
                else
2283
                {
2284
                        if(c->vLumFilterSize==1 && c->vChrFilterSize==2)
2285
                                MSG_V("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
2286
                                       "SwScaler:       2-tap scaler for vertical chrominance scaling (BGR)\n",cpuCaps.hasMMX ? "MMX" : "C");
2287
                        else if(c->vLumFilterSize==2 && c->vChrFilterSize==2)
2288
                                MSG_V("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
2289
                        else
2290
                                MSG_V("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
2291
                }
2292

    
2293
                if(dstFormat==IMGFMT_BGR24)
2294
                        MSG_V("SwScaler: using %s YV12->BGR24 Converter\n",
2295
                                cpuCaps.hasMMX2 ? "MMX2" : (cpuCaps.hasMMX ? "MMX" : "C"));
2296
                else if(dstFormat==IMGFMT_BGR32)
2297
                        MSG_V("SwScaler: using %s YV12->BGR32 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
2298
                else if(dstFormat==IMGFMT_BGR16)
2299
                        MSG_V("SwScaler: using %s YV12->BGR16 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
2300
                else if(dstFormat==IMGFMT_BGR15)
2301
                        MSG_V("SwScaler: using %s YV12->BGR15 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
2302

    
2303
                MSG_V("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
2304
        }
2305
        if((flags & SWS_PRINT_INFO) && verbose>1)
2306
        {
2307
                MSG_DBG2("SwScaler:Lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
2308
                        c->srcW, c->srcH, c->dstW, c->dstH, c->lumXInc, c->lumYInc);
2309
                MSG_DBG2("SwScaler:Chr srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
2310
                        c->chrSrcW, c->chrSrcH, c->chrDstW, c->chrDstH, c->chrXInc, c->chrYInc);
2311
        }
2312

    
2313
        c->swScale= swScale;
2314
        return c;
2315
}
2316

    
2317
/**
2318
 * returns a normalized gaussian curve used to filter stuff
2319
 * quality=3 is high quality, lowwer is lowwer quality
2320
 */
2321

    
2322
SwsVector *getGaussianVec(double variance, double quality){
2323
        const int length= (int)(variance*quality + 0.5) | 1;
2324
        int i;
2325
        double *coeff= memalign(sizeof(double), length*sizeof(double));
2326
        double middle= (length-1)*0.5;
2327
        SwsVector *vec= malloc(sizeof(SwsVector));
2328

    
2329
        vec->coeff= coeff;
2330
        vec->length= length;
2331

    
2332
        for(i=0; i<length; i++)
2333
        {
2334
                double dist= i-middle;
2335
                coeff[i]= exp( -dist*dist/(2*variance*variance) ) / sqrt(2*variance*PI);
2336
        }
2337

    
2338
        normalizeVec(vec, 1.0);
2339

    
2340
        return vec;
2341
}
2342

    
2343
SwsVector *getConstVec(double c, int length){
2344
        int i;
2345
        double *coeff= memalign(sizeof(double), length*sizeof(double));
2346
        SwsVector *vec= malloc(sizeof(SwsVector));
2347

    
2348
        vec->coeff= coeff;
2349
        vec->length= length;
2350

    
2351
        for(i=0; i<length; i++)
2352
                coeff[i]= c;
2353

    
2354
        return vec;
2355
}
2356

    
2357

    
2358
SwsVector *getIdentityVec(void){
2359
        double *coeff= memalign(sizeof(double), sizeof(double));
2360
        SwsVector *vec= malloc(sizeof(SwsVector));
2361
        coeff[0]= 1.0;
2362

    
2363
        vec->coeff= coeff;
2364
        vec->length= 1;
2365

    
2366
        return vec;
2367
}
2368

    
2369
/**
 * Scales the vector in place so that the sum of its coefficients equals height.
 * @param a      vector to normalize (modified in place)
 * @param height desired sum of all coefficients after the call
 *
 * Note: no check is made for a coefficient sum of 0; in that case the scale
 * factor is not finite and neither are the resulting coefficients.
 */
void normalizeVec(SwsVector *a, double height){
        int i;
        double sum=0;
        double inv;

        for(i=0; i<a->length; i++)
                sum+= a->coeff[i];

        /* factor that maps the current sum onto the requested one */
        inv= height/sum;

        /* multiply by the factor, not by height itself, otherwise the vector is merely scaled */
        for(i=0; i<a->length; i++)
                a->coeff[i]*= inv;
}
2382

    
2383
/**
 * Multiplies every coefficient of the vector by scalar, in place.
 */
void scaleVec(SwsVector *a, double scalar){
        int pos= 0;

        while(pos < a->length){
                a->coeff[pos]*= scalar;
                pos++;
        }
}
2389

    
2390
/*
 * Returns a newly allocated vector holding the discrete convolution of a and b.
 * The result has a->length + b->length - 1 coefficients; a and b are not modified.
 */
static SwsVector *getConvVec(SwsVector *a, SwsVector *b){
        const int length= a->length + b->length - 1;
        double *coeff= memalign(sizeof(double), length*sizeof(double));
        SwsVector *vec= malloc(sizeof(SwsVector));
        int ai, bi, k;

        for(k=0; k<length; k++)
                coeff[k]= 0.0;

        /* every product a[ai]*b[bi] contributes to output tap ai+bi */
        for(ai=0; ai<a->length; ai++){
                const double aCoeff= a->coeff[ai];
                double *out= coeff + ai;

                for(bi=0; bi<b->length; bi++)
                        out[bi]+= aCoeff*b->coeff[bi];
        }

        vec->length= length;
        vec->coeff= coeff;

        return vec;
}
2411

    
2412
/*
 * Returns a newly allocated vector holding a + b.
 * The result is as long as the longer input; each input is placed centered
 * inside it before being accumulated. a and b are not modified.
 */
static SwsVector *sumVec(SwsVector *a, SwsVector *b){
        const int length= MAX(a->length, b->length);
        double *coeff= memalign(sizeof(double), length*sizeof(double));
        SwsVector *vec= malloc(sizeof(SwsVector));
        /* index in the result where the first tap of each input lands */
        const int aStart= (length-1)/2 - (a->length-1)/2;
        const int bStart= (length-1)/2 - (b->length-1)/2;
        int k;

        for(k=0; k<length; k++)
                coeff[k]= 0.0;

        for(k=0; k<a->length; k++)
                coeff[aStart + k]+= a->coeff[k];
        for(k=0; k<b->length; k++)
                coeff[bStart + k]+= b->coeff[k];

        vec->length= length;
        vec->coeff= coeff;

        return vec;
}
2428

    
2429
/*
 * Returns a newly allocated vector holding a - b.
 * The result is as long as the longer input; each input is placed centered
 * inside it before a is added and b is subtracted. a and b are not modified.
 */
static SwsVector *diffVec(SwsVector *a, SwsVector *b){
        const int length= MAX(a->length, b->length);
        double *coeff= memalign(sizeof(double), length*sizeof(double));
        SwsVector *vec= malloc(sizeof(SwsVector));
        /* index in the result where the first tap of each input lands */
        const int aStart= (length-1)/2 - (a->length-1)/2;
        const int bStart= (length-1)/2 - (b->length-1)/2;
        int k;

        for(k=0; k<length; k++)
                coeff[k]= 0.0;

        for(k=0; k<a->length; k++)
                coeff[aStart + k]+= a->coeff[k];
        for(k=0; k<b->length; k++)
                coeff[bStart + k]-= b->coeff[k];

        vec->length= length;
        vec->coeff= coeff;

        return vec;
}
2445

    
2446
/* shift left / or right if "shift" is negative */
2447
static SwsVector *getShiftedVec(SwsVector *a, int shift){
2448
        int length= a->length + ABS(shift)*2;
2449
        double *coeff= memalign(sizeof(double), length*sizeof(double));
2450
        int i;
2451
        SwsVector *vec= malloc(sizeof(SwsVector));
2452

    
2453
        vec->coeff= coeff;
2454
        vec->length= length;
2455

    
2456
        for(i=0; i<length; i++) coeff[i]= 0.0;
2457

    
2458
        for(i=0; i<a->length; i++)
2459
        {
2460
                coeff[i + (length-1)/2 - (a->length-1)/2 - shift]= a->coeff[i];
2461
        }
2462

    
2463
        return vec;
2464
}
2465

    
2466
/**
 * Shifts the vector in place: a takes over the coefficients and length
 * produced by getShiftedVec(a, shift), its previous coefficient array is freed.
 */
void shiftVec(SwsVector *a, int shift){
        SwsVector *result= getShiftedVec(a, shift);
        double *previous= a->coeff;

        a->length= result->length;
        a->coeff= result->coeff;

        /* only the temporary struct is released, its coefficients now belong to a */
        free(result);
        free(previous);
}
2473

    
2474
/**
 * Adds b to a in place: a takes over the coefficients and length produced by
 * sumVec(a, b), its previous coefficient array is freed. b is not modified.
 */
void addVec(SwsVector *a, SwsVector *b){
        SwsVector *result= sumVec(a, b);
        double *previous= a->coeff;

        a->length= result->length;
        a->coeff= result->coeff;

        /* only the temporary struct is released, its coefficients now belong to a */
        free(result);
        free(previous);
}
2481

    
2482
/**
 * Subtracts b from a in place: a takes over the coefficients and length
 * produced by diffVec(a, b), its previous coefficient array is freed.
 * b is not modified.
 */
void subVec(SwsVector *a, SwsVector *b){
        SwsVector *result= diffVec(a, b);
        double *previous= a->coeff;

        a->length= result->length;
        a->coeff= result->coeff;

        /* only the temporary struct is released, its coefficients now belong to a */
        free(result);
        free(previous);
}
2489

    
2490
/**
 * Convolves a with b in place: a takes over the coefficients and length
 * produced by getConvVec(a, b), its previous coefficient array is freed.
 * b is not modified.
 */
void convVec(SwsVector *a, SwsVector *b){
        SwsVector *result= getConvVec(a, b);
        double *previous= a->coeff;

        a->length= result->length;
        a->coeff= result->coeff;

        /* only the temporary struct is released, its coefficients now belong to a */
        free(result);
        free(previous);
}
2497

    
2498
/**
 * Returns a newly allocated copy of a (same length, same coefficients).
 * The copy can be released with freeVec().
 */
SwsVector *cloneVec(SwsVector *a){
        double *coeff= memalign(sizeof(double), a->length*sizeof(double));
        SwsVector *vec= malloc(sizeof(SwsVector));
        int pos= 0;

        vec->length= a->length;
        vec->coeff= coeff;

        while(pos < a->length){
                coeff[pos]= a->coeff[pos];
                pos++;
        }

        return vec;
}
2510

    
2511
/**
 * Prints the vector through MSG_DBG2 as a crude plot, one line per coefficient:
 * the value ("%1.3f ") followed by a '|' that is indented by 0..60 spaces
 * according to where the value lies between min and max.
 * min and max start at 0, so the plotted range always includes 0.
 */
void printVec(SwsVector *a){
        int i;
        double max=0;
        double min=0;
        double range;

        for(i=0; i<a->length; i++)
        {
                if(a->coeff[i]>max) max= a->coeff[i];
                if(a->coeff[i]<min) min= a->coeff[i];
        }

        range= max - min;

        for(i=0; i<a->length; i++)
        {
                int x= 0;

                /* an all-zero vector has range 0; dividing by it would give NaN
                   and converting NaN to int is undefined, so keep the bar at 0 */
                if(range > 0.0)
                        x= (int)((a->coeff[i]-min)*60.0/range +0.5);

                MSG_DBG2("%1.3f ", a->coeff[i]);
                for(;x>0; x--) MSG_DBG2(" ");
                MSG_DBG2("|\n");
        }
}
2533

    
2534
/**
 * Releases a vector together with its coefficient array.
 * Passing NULL is allowed and does nothing.
 */
void freeVec(SwsVector *a){
        if(a == NULL)
                return;

        /* free(NULL) is a no-op, so the coefficient pointer needs no extra check */
        free(a->coeff);
        a->length= 0;
        a->coeff= NULL;

        free(a);
}
2541

    
2542
/**
 * Releases a scaler context and every buffer it owns: the luma/chroma line
 * buffers, the horizontal and vertical filter coefficient and position tables,
 * and the MMX / MMX2 filter tables. Each pointer is reset to NULL after being
 * freed. Passing NULL is allowed and does nothing.
 */
void freeSwsContext(SwsContext *c){
        int line;

        if(c == NULL)
                return;

        // Only the first vLumBufSize / vChrBufSize entries are walked: the context
        // setup code stores every line pointer twice (at i and at i+bufSize), so the
        // upper half of each pointer array only holds duplicates.
        if(c->lumPixBuf != NULL){
                for(line=0; line<c->vLumBufSize; line++){
                        free(c->lumPixBuf[line]);
                        c->lumPixBuf[line]= NULL;
                }
                free(c->lumPixBuf);
                c->lumPixBuf= NULL;
        }

        if(c->chrPixBuf != NULL){
                for(line=0; line<c->vChrBufSize; line++){
                        free(c->chrPixBuf[line]);
                        c->chrPixBuf[line]= NULL;
                }
                free(c->chrPixBuf);
                c->chrPixBuf= NULL;
        }

        // free(NULL) is a no-op, so none of the tables below needs a NULL check

        // filter coefficients
        free(c->vLumFilter);        c->vLumFilter= NULL;
        free(c->vChrFilter);        c->vChrFilter= NULL;
        free(c->hLumFilter);        c->hLumFilter= NULL;
        free(c->hChrFilter);        c->hChrFilter= NULL;

        // filter positions
        free(c->vLumFilterPos);     c->vLumFilterPos= NULL;
        free(c->vChrFilterPos);     c->vChrFilterPos= NULL;
        free(c->hLumFilterPos);     c->hLumFilterPos= NULL;
        free(c->hChrFilterPos);     c->hChrFilterPos= NULL;

        // vertical filter data packed for the mmx code
        free(c->lumMmxFilter);      c->lumMmxFilter= NULL;
        free(c->chrMmxFilter);      c->chrMmxFilter= NULL;

        // tables of the MMX2 horizontal scaler
        free(c->lumMmx2Filter);     c->lumMmx2Filter= NULL;
        free(c->chrMmx2Filter);     c->chrMmx2Filter= NULL;
        free(c->lumMmx2FilterPos);  c->lumMmx2FilterPos= NULL;
        free(c->chrMmx2FilterPos);  c->chrMmx2FilterPos= NULL;

        free(c);
}
2602

    
2603