Statistics
| Branch: | Revision:

ffmpeg / libavcodec / libpostproc / postprocess.c @ 792a5a7c

History | View | Annotate | Download (31.9 KB)

1
/*
2
    Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3

4
    AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5

6
    This program is free software; you can redistribute it and/or modify
7
    it under the terms of the GNU General Public License as published by
8
    the Free Software Foundation; either version 2 of the License, or
9
    (at your option) any later version.
10

11
    This program is distributed in the hope that it will be useful,
12
    but WITHOUT ANY WARRANTY; without even the implied warranty of
13
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
    GNU General Public License for more details.
15

16
    You should have received a copy of the GNU General Public License
17
    along with this program; if not, write to the Free Software
18
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19
*/
20

    
21
/**
22
 * @file postprocess.c
23
 * postprocessing.
24
 */
25
 
26
/*
27
                        C        MMX        MMX2        3DNow        AltiVec
28
isVertDC                Ec        Ec                        Ec
29
isVertMinMaxOk                Ec        Ec                        Ec
30
doVertLowPass                E                e        e        Ec
31
doVertDefFilter                Ec        Ec        e        e        Ec
32
isHorizDC                Ec        Ec
33
isHorizMinMaxOk                a        E
34
doHorizLowPass                E                e        e
35
doHorizDefFilter        Ec        Ec        e        e
36
do_a_deblock                Ec        E        Ec        E
37
deRing                        E                e        e*        Ecp
38
Vertical RKAlgo1        E                a        a
39
Horizontal RKAlgo1                        a        a
40
Vertical X1#                a                E        E
41
Horizontal X1#                a                E        E
42
LinIpolDeinterlace        e                E        E*
43
CubicIpolDeinterlace        a                e        e*
44
LinBlendDeinterlace        e                E        E*
45
MedianDeinterlace#        E        Ec        Ec
46
TempDeNoiser#                E                e        e
47

48
* I don't have a 3DNow CPU -> it's untested, but no one has said it doesn't work, so it seems to work
49
# more or less self-invented filters, so the exactness isn't too meaningful
50
E = Exact implementation
51
e = almost exact implementation (slightly different rounding, ...)
52
a = alternative / approximate impl
53
c = checked against the other implementations (-vo md5)
54
p = partially optimized, still some work to do
55
*/
56

    
57
/*
58
TODO:
59
reduce the time wasted on the mem transfer
60
unroll stuff if instructions depend too much on the prior one
61
move YScale thing to the end instead of fixing QP
62
write a faster and higher quality deblocking filter :)
63
make the mainloop more flexible (variable number of blocks at once
64
        (the if/else stuff per block is slowing things down)
65
compare the quality & speed of all filters
66
split this huge file
67
optimize c versions
68
try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
69
...
70
*/
71

    
72
//Changelog: use the CVS log
73

    
74
#include "config.h"
75
#include <inttypes.h>
76
#include <stdio.h>
77
#include <stdlib.h>
78
#include <string.h>
79
#ifdef HAVE_MALLOC_H
80
#include <malloc.h>
81
#endif
82
//#undef HAVE_MMX2
83
//#define HAVE_3DNOW
84
//#undef HAVE_MMX
85
//#undef ARCH_X86
86
//#define DEBUG_BRIGHTNESS
87
#ifdef USE_FASTMEMCPY
88
#include "fastmemcpy.h"
89
#endif
90
#include "postprocess.h"
91
#include "postprocess_internal.h"
92

    
93
#include "mangle.h" //FIXME should be supressed
94

    
95
#ifndef HAVE_MEMALIGN
96
#define memalign(a,b) malloc(b)
97
#endif
98

    
99
/*
 * Small arithmetic helpers.
 * These are plain macros: every argument may be evaluated more than once,
 * so only pass expressions without side effects.
 */
#define MIN(a,b) ((b) < (a) ? (b) : (a))
#define MAX(a,b) ((b) > (a) ? (b) : (a))
#define ABS(a) ((a) > 0 ? (a) : -(a))
/* NOTE: SIGN(0) yields -1, not 0 */
#define SIGN(a) ((a) > 0 ? 1 : -1)

/* Sizes used by the rest of the file (the users of the first three are not in this part of the file) */
#define GET_MODE_BUFFER_SIZE    500
#define OPTIONS_ARRAY_SIZE      10
#define TEMP_STRIDE             8
/* edge length of the square pixel blocks the filters operate on */
#define BLOCK_SIZE              8
//#define NUM_BLOCKS_AT_ONCE 16 //not used yet
109

    
110
/*
 * Compiler attribute shims.
 * With GCC newer than 3.0 the attributes are used; everywhere else
 * attribute_used expands to nothing and always_inline is plain inline.
 */
#if defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ > 0)))
#    define attribute_used __attribute__((used))
#    define always_inline __attribute__((always_inline)) inline
#else
#    define attribute_used
#    define always_inline inline
#endif

#ifdef ARCH_X86
/*
 * 8-byte aligned packed constants (wXX: four 16-bit lanes, bXX: eight bytes).
 * They are marked attribute_used so they are kept even though no C code in
 * this part of the file reads them -- presumably they are referenced by name
 * from the inline asm in the included templates (confirm there).
 */
static uint64_t __attribute__((aligned(8))) attribute_used w05= 0x0005000500050005LL;
static uint64_t __attribute__((aligned(8))) attribute_used w20= 0x0020002000200020LL;
static uint64_t __attribute__((aligned(8))) attribute_used b00= 0x0000000000000000LL;
static uint64_t __attribute__((aligned(8))) attribute_used b01= 0x0101010101010101LL;
static uint64_t __attribute__((aligned(8))) attribute_used b02= 0x0202020202020202LL;
static uint64_t __attribute__((aligned(8))) attribute_used b08= 0x0808080808080808LL;
static uint64_t __attribute__((aligned(8))) attribute_used b80= 0x8080808080808080LL;
#endif

/* 768-entry byte table; clip_tab points at its middle third (offset 256).
   The table is not filled in this part of the file. */
static uint8_t clip_table[3*256];
static uint8_t * const clip_tab= clip_table + 256;

static const int verbose= 0;

static const int attribute_used deringThreshold= 20;
134

    
135

    
136
static struct PPFilter filters[]=
137
{
138
        {"hb", "hdeblock",                 1, 1, 3, H_DEBLOCK},
139
        {"vb", "vdeblock",                 1, 2, 4, V_DEBLOCK},
140
/*        {"hr", "rkhdeblock",                 1, 1, 3, H_RK1_FILTER},
141
        {"vr", "rkvdeblock",                 1, 2, 4, V_RK1_FILTER},*/
142
        {"h1", "x1hdeblock",                 1, 1, 3, H_X1_FILTER},
143
        {"v1", "x1vdeblock",                 1, 2, 4, V_X1_FILTER},
144
        {"ha", "ahdeblock",                 1, 1, 3, H_A_DEBLOCK},
145
        {"va", "avdeblock",                 1, 2, 4, V_A_DEBLOCK},
146
        {"dr", "dering",                 1, 5, 6, DERING},
147
        {"al", "autolevels",                 0, 1, 2, LEVEL_FIX},
148
        {"lb", "linblenddeint",         1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
149
        {"li", "linipoldeint",                 1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
150
        {"ci", "cubicipoldeint",        1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
151
        {"md", "mediandeint",                 1, 1, 4, MEDIAN_DEINT_FILTER},
152
        {"fd", "ffmpegdeint",                 1, 1, 4, FFMPEG_DEINT_FILTER},
153
        {"l5", "lowpass5",                 1, 1, 4, LOWPASS5_DEINT_FILTER},
154
        {"tn", "tmpnoise",                 1, 7, 8, TEMP_NOISE_FILTER},
155
        {"fq", "forcequant",                 1, 0, 0, FORCE_QUANT},
156
        {NULL, NULL,0,0,0,0} //End Marker
157
};
158

    
159
/**
 * Alias table: entries come in pairs, an alias name followed by the filter
 * string it stands for. A single NULL terminates the list.
 */
static char *replaceTable[]=
{
        /* alias */     /* replacement */
        "default",      "hdeblock:a,vdeblock:a,dering:a",
        "de",           "hdeblock:a,vdeblock:a,dering:a",
        "fast",         "x1hdeblock:a,x1vdeblock:a,dering:a",
        "fa",           "x1hdeblock:a,x1vdeblock:a,dering:a",
        "ac",           "ha:a:128:7,va:a,dering:a",
        NULL //End Marker
};
168

    
169

    
170
#ifdef ARCH_X86
/*
 * Thin wrappers that each emit one x86 prefetch instruction for the
 * address held in p. Nothing is read or written through p from C.
 */

/** Emit "prefetchnta" for address p. */
static inline void prefetchnta(void *p)
{
        asm volatile("prefetchnta (%0)\n\t" : : "r" (p));
}

/** Emit "prefetcht0" for address p. */
static inline void prefetcht0(void *p)
{
        asm volatile("prefetcht0 (%0)\n\t" : : "r" (p));
}

/** Emit "prefetcht1" for address p. */
static inline void prefetcht1(void *p)
{
        asm volatile("prefetcht1 (%0)\n\t" : : "r" (p));
}

/** Emit "prefetcht2" for address p. */
static inline void prefetcht2(void *p)
{
        asm volatile("prefetcht2 (%0)\n\t" : : "r" (p));
}
#endif
199

    
200
// The horizontal functions exist only in C because the MMX code is faster with vertical filters and transposing
201

    
202
/**
203
 * Check if the given 8x8 Block is mostly "flat"
204
 */
205
static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c)
206
{
207
        int numEq= 0;
208
        int y;
209
        const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
210
        const int dcThreshold= dcOffset*2 + 1;
211

    
212
        for(y=0; y<BLOCK_SIZE; y++)
213
        {
214
                if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
215
                if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
216
                if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
217
                if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
218
                if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
219
                if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
220
                if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
221
                src+= stride;
222
        }
223
        return numEq > c->ppMode.flatnessThreshold;
224
}
225

    
226
/**
227
 * Check if the middle 8x8 Block in the given 8x16 block is flat
228
 */
229
static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c){
230
        int numEq= 0;
231
        int y;
232
        const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
233
        const int dcThreshold= dcOffset*2 + 1;
234

    
235
        src+= stride*4; // src points to begin of the 8x8 Block
236
        for(y=0; y<BLOCK_SIZE-1; y++)
237
        {
238
                if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
239
                if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
240
                if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
241
                if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
242
                if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
243
                if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
244
                if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
245
                if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
246
                src+= stride;
247
        }
248
        return numEq > c->ppMode.flatnessThreshold;
249
}
250

    
251
/**
 * Cheap range check on an 8x8 block, sampled along rows.
 * One pair of pixels is probed per row; the block passes only if every
 * probed difference d satisfies -2*QP <= d <= 2*QP.
 * @param src    top-left pixel of the block
 * @param stride distance in bytes between two rows
 * @param QP     quantizer the allowed range is derived from
 * @return 1 if all probes are within range, 0 otherwise
 */
static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP)
{
        /* columns probed on rows 0..3; the same pattern repeats on rows 4..7 */
        static const uint8_t colA[4]= {0, 2, 4, 6};
        static const uint8_t colB[4]= {5, 7, 1, 3};
        int y;

        for(y=0; y<8; y++){
                const int diff= src[colA[y&3]] - src[colB[y&3]];
                /* a negative (diff + 2*QP) wraps to a huge unsigned value and
                   fails as well, so one compare covers both bounds */
                if((unsigned)(diff + 2*QP) > (unsigned)(4*QP)) return 0;
                src += stride;
        }
        return 1;
}
273

    
274
static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP)
275
{
276
#if 1
277
#if 1
278
        int x;
279
        src+= stride*4;
280
        for(x=0; x<BLOCK_SIZE; x+=4)
281
        {
282
                if((unsigned)(src[  x + 0*stride] - src[  x + 5*stride] + 2*QP) > 4*QP) return 0;
283
                if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
284
                if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
285
                if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
286
        }
287
#else
288
        int x;
289
        src+= stride*3;
290
        for(x=0; x<BLOCK_SIZE; x++)
291
        {
292
                if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
293
        }
294
#endif
295
        return 1;
296
#else
297
        int x;
298
        src+= stride*4;
299
        for(x=0; x<BLOCK_SIZE; x++)
300
        {
301
                int min=255;
302
                int max=0;
303
                int y;
304
                for(y=0; y<8; y++){
305
                        int v= src[x + y*stride];
306
                        if(v>max) max=v;
307
                        if(v<min) min=v;
308
                }
309
                if(max-min > 2*QP) return 0;
310
        }
311
        return 1;
312
#endif
313
}
314

    
315
/**
 * Classify an 8x8 block for horizontal filtering.
 * @return 2 if isHorizDC_C() reports the block as not flat,
 *         1 if it is flat and isHorizMinMaxOk_C() accepts it,
 *         0 if it is flat but the range check fails
 */
static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c){
        if( !isHorizDC_C(src, stride, c) )
                return 2;

        return isHorizMinMaxOk_C(src, stride, c->QP) ? 1 : 0;
}
325

    
326
/**
 * Classify a block for vertical filtering.
 * @return 2 if isVertDC_C() reports the block as not flat,
 *         1 if it is flat and isVertMinMaxOk_C() accepts it,
 *         0 if it is flat but the range check fails
 */
static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c){
        if( !isVertDC_C(src, stride, c) )
                return 2;

        return isVertMinMaxOk_C(src, stride, c->QP) ? 1 : 0;
}
336

    
337
/**
 * Default horizontal deblocking filter (C version).
 * For each of the 8 rows the step between dst[3] and dst[4] is examined.
 * If its "energy" is below 8*QP, both pixels are moved towards each other
 * by a correction d that never exceeds half of their difference.
 * @param dst    first pixel of the 8 pixel wide rows to filter (modified in place)
 * @param stride distance in bytes between two rows
 * @param c      context supplying QP
 */
static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c)
{
        int row;
        for(row=0; row<BLOCK_SIZE; row++, dst+= stride)
        {
                const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
                int leftEnergy, rightEnergy, limit, d;

                if(ABS(middleEnergy) >= 8*c->QP)
                        continue; // step too large, leave this row untouched

                limit      = (dst[3] - dst[4])/2;
                leftEnergy = 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
                rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);

                d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
                d= MAX(d, 0);
                d= (5*d + 32) >> 6;

                // the correction takes the sign opposite to middleEnergy (negative for 0)
                if(middleEnergy >= 0) d= -d;

                // clamp d into the interval between 0 and limit
                if(limit > 0) d= MIN(MAX(d, 0), limit);
                else          d= MAX(MIN(d, 0), limit);

                dst[3]-= d;
                dst[4]+= d;
        }
}
373

    
374
/**
375
 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
376
 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
377
 */
378
static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c)
379
{
380
        int y;
381
        for(y=0; y<BLOCK_SIZE; y++)
382
        {
383
                const int first= ABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
384
                const int last= ABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
385

    
386
                int sums[10];
387
                sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
388
                sums[1] = sums[0] - first  + dst[3];
389
                sums[2] = sums[1] - first  + dst[4];
390
                sums[3] = sums[2] - first  + dst[5];
391
                sums[4] = sums[3] - first  + dst[6];
392
                sums[5] = sums[4] - dst[0] + dst[7];
393
                sums[6] = sums[5] - dst[1] + last;
394
                sums[7] = sums[6] - dst[2] + last;
395
                sums[8] = sums[7] - dst[3] + last;
396
                sums[9] = sums[8] - dst[4] + last;
397

    
398
                dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
399
                dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
400
                dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
401
                dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
402
                dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
403
                dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
404
                dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
405
                dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
406

    
407
                dst+= stride;
408
        }
409
}
410

    
411
/**
412
 * Experimental Filter 1 (Horizontal)
413
 * will not damage linear gradients
414
 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
415
 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
416
 * MMX2 version does correct clipping C version doesnt
417
 * not identical with the vertical one
418
 */
419
static inline void horizX1Filter(uint8_t *src, int stride, int QP)
420
{
421
        int y;
422
        static uint64_t *lut= NULL;
423
        if(lut==NULL)
424
        {
425
                int i;
426
                lut= (uint64_t*)memalign(8, 256*8);
427
                for(i=0; i<256; i++)
428
                {
429
                        int v= i < 128 ? 2*i : 2*(i-256);
430
/*
431
//Simulate 112242211 9-Tap filter
432
                        uint64_t a= (v/16) & 0xFF;
433
                        uint64_t b= (v/8) & 0xFF;
434
                        uint64_t c= (v/4) & 0xFF;
435
                        uint64_t d= (3*v/8) & 0xFF;
436
*/
437
//Simulate piecewise linear interpolation
438
                        uint64_t a= (v/16) & 0xFF;
439
                        uint64_t b= (v*3/16) & 0xFF;
440
                        uint64_t c= (v*5/16) & 0xFF;
441
                        uint64_t d= (7*v/16) & 0xFF;
442
                        uint64_t A= (0x100 - a)&0xFF;
443
                        uint64_t B= (0x100 - b)&0xFF;
444
                        uint64_t C= (0x100 - c)&0xFF;
445
                        uint64_t D= (0x100 - c)&0xFF;
446

    
447
                        lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
448
                                (D<<24) | (C<<16) | (B<<8) | (A);
449
                        //lut[i] = (v<<32) | (v<<24);
450
                }
451
        }
452

    
453
        for(y=0; y<BLOCK_SIZE; y++)
454
        {
455
                int a= src[1] - src[2];
456
                int b= src[3] - src[4];
457
                int c= src[5] - src[6];
458

    
459
                int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
460

    
461
                if(d < QP)
462
                {
463
                        int v = d * SIGN(-b);
464

    
465
                        src[1] +=v/8;
466
                        src[2] +=v/4;
467
                        src[3] +=3*v/8;
468
                        src[4] -=3*v/8;
469
                        src[5] -=v/4;
470
                        src[6] -=v/8;
471

    
472
                }
473
                src+=stride;
474
        }
475
}
476

    
477
/**
478
 * accurate deblock filter
479
 */
480
static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){
481
        int y;
482
        const int QP= c->QP;
483
        const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
484
        const int dcThreshold= dcOffset*2 + 1;
485
//START_TIMER
486
        src+= step*4; // src points to begin of the 8x8 Block
487
        for(y=0; y<8; y++){
488
                int numEq= 0;
489

    
490
                if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
491
                if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
492
                if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
493
                if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
494
                if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
495
                if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
496
                if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
497
                if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
498
                if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
499
                if(numEq > c->ppMode.flatnessThreshold){
500
                        int min, max, x;
501
                        
502
                        if(src[0] > src[step]){
503
                            max= src[0];
504
                            min= src[step];
505
                        }else{
506
                            max= src[step];
507
                            min= src[0];
508
                        }
509
                        for(x=2; x<8; x+=2){
510
                                if(src[x*step] > src[(x+1)*step]){
511
                                        if(src[x    *step] > max) max= src[ x   *step];
512
                                        if(src[(x+1)*step] < min) min= src[(x+1)*step];
513
                                }else{
514
                                        if(src[(x+1)*step] > max) max= src[(x+1)*step];
515
                                        if(src[ x   *step] < min) min= src[ x   *step];
516
                                }
517
                        }
518
                        if(max-min < 2*QP){
519
                                const int first= ABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
520
                                const int last= ABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
521
                                
522
                                int sums[10];
523
                                sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
524
                                sums[1] = sums[0] - first       + src[3*step];
525
                                sums[2] = sums[1] - first       + src[4*step];
526
                                sums[3] = sums[2] - first       + src[5*step];
527
                                sums[4] = sums[3] - first       + src[6*step];
528
                                sums[5] = sums[4] - src[0*step] + src[7*step];
529
                                sums[6] = sums[5] - src[1*step] + last;
530
                                sums[7] = sums[6] - src[2*step] + last;
531
                                sums[8] = sums[7] - src[3*step] + last;
532
                                sums[9] = sums[8] - src[4*step] + last;
533

    
534
                                src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
535
                                src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
536
                                src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
537
                                src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
538
                                src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
539
                                src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
540
                                src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
541
                                src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
542
                        }
543
                }else{
544
                        const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
545

    
546
                        if(ABS(middleEnergy) < 8*QP)
547
                        {
548
                                const int q=(src[3*step] - src[4*step])/2;
549
                                const int leftEnergy=  5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
550
                                const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
551

    
552
                                int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
553
                                d= MAX(d, 0);
554
        
555
                                d= (5*d + 32) >> 6;
556
                                d*= SIGN(-middleEnergy);
557
        
558
                                if(q>0)
559
                                {
560
                                        d= d<0 ? 0 : d;
561
                                        d= d>q ? q : d;
562
                                }
563
                                else
564
                                {
565
                                        d= d>0 ? 0 : d;
566
                                        d= d<q ? q : d;
567
                                }
568
        
569
                                src[3*step]-= d;
570
                                src[4*step]+= d;
571
                        }
572
                }
573

    
574
                src += stride;
575
        }
576
/*if(step==16){
577
    STOP_TIMER("step16")
578
}else{
579
    STOP_TIMER("stepX")
580
}*/
581
}
582

    
583
/*
 * Template instantiation.
 * Step 1 decides which variants to build (COMPILE_xxx), step 2 clears the
 * configure-time feature macros, step 3 includes postprocess_template.c once
 * per variant with the feature macros and RENAME() set up for that variant.
 * The order of the directives matters.
 */
//Note: there are C, MMX, MMX2 and 3DNOW versions; there is no 3DNOW+MMX2 one

//Step 1: select the variants
//Plain C versions
#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
#define COMPILE_C
#endif

#if defined (ARCH_POWERPC) && defined (HAVE_ALTIVEC)
#define COMPILE_ALTIVEC
#ifndef CONFIG_DARWIN
#warning "################################################################################"
#warning  "WARNING: No gcc available as of today (2004-05-25) seems to be able to compile properly some of the code under non-Darwin PPC OSes. Some functions result in wrong results, while others simply won't compile (gcc explodes after allocating 1GiB+)."
#warning "################################################################################"
#endif //CONFIG_DARWIN
#endif //ARCH_POWERPC && HAVE_ALTIVEC

#ifdef ARCH_X86

#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
#define COMPILE_MMX
#endif

#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
#define COMPILE_MMX2
#endif

#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
#define COMPILE_3DNOW
#endif

#endif //ARCH_X86

//Step 2: forget the configure-time settings, each variant sets its own below
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#undef HAVE_ALTIVEC
#undef ARCH_X86

//Step 3: one template instance per selected variant
//C versions
#ifdef COMPILE_C
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#undef ARCH_X86
#define RENAME(a) a ## _C
#include "postprocess_template.c"
#endif

//AltiVec versions
#if defined (ARCH_POWERPC) && defined (COMPILE_ALTIVEC)
#undef RENAME
#define HAVE_ALTIVEC
#define RENAME(a) a ## _altivec
#include "postprocess_altivec_template.c"
#include "postprocess_template.c"
#endif //ARCH_POWERPC && COMPILE_ALTIVEC

//MMX versions
#ifdef COMPILE_MMX
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _MMX
#include "postprocess_template.c"
#endif

//MMX2 versions
#ifdef COMPILE_MMX2
#undef RENAME
#define HAVE_MMX
#define HAVE_MMX2
#undef HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _MMX2
#include "postprocess_template.c"
#endif

//3DNOW versions
#ifdef COMPILE_3DNOW
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#define HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _3DNow
#include "postprocess_template.c"
#endif

// minor note: from here on the HAVE_xyz macros only reflect the last variant
// that was instantiated, not the configuration, so don't rely on them
674

    
675
static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
676
        QP_STORE_T QPs[], int QPStride, int isColor, pp_mode_t *vm, pp_context_t *vc)
677
{
678
        PPContext *c= (PPContext *)vc;
679
        PPMode *ppMode= (PPMode *)vm;
680
        c->ppMode= *ppMode; //FIXME
681

    
682
        // useing ifs here as they are faster than function pointers allthough the
683
        // difference wouldnt be messureable here but its much better because
684
        // someone might exchange the cpu whithout restarting mplayer ;)
685
#ifdef RUNTIME_CPUDETECT
686
#ifdef ARCH_X86
687
        // ordered per speed fasterst first
688
        if(c->cpuCaps & PP_CPU_CAPS_MMX2)
689
                postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
690
        else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
691
                postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
692
        else if(c->cpuCaps & PP_CPU_CAPS_MMX)
693
                postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
694
        else
695
                postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
696
#else
697
#ifdef ARCH_POWERPC
698
#ifdef HAVE_ALTIVEC
699
        else if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC)
700
                postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
701
        else
702
#endif
703
#endif
704
                postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
705
#endif
706
#else //RUNTIME_CPUDETECT
707
#ifdef HAVE_MMX2
708
                postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
709
#elif defined (HAVE_3DNOW)
710
                postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
711
#elif defined (HAVE_MMX)
712
                postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
713
#elif defined (HAVE_ALTIVEC)
714
                postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
715
#else
716
                postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
717
#endif
718
#endif //!RUNTIME_CPUDETECT
719
}
720

    
721
//static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
722
//        QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
723

    
724
/**
 * Help text for the -pp command line option.
 * Shows the filter-string syntax, a few example strings and one table row
 * per filter (short name, long name, options, description).
 */
char *pp_help=
        "<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
        "long form example:\n"
        "vdeblock:autoq/hdeblock:autoq/linblenddeint        default,-vdeblock\n"
        "short form example:\n"
        "vb:a/hb:a/lb                                        de,-vb\n"
        "more examples:\n"
        "tn:64:128:256\n"
        "Filters                        Options\n"
        "short        long name        short        long option        Description\n"
        "*        *                a        autoq                CPU power dependent enabler\n"
        "                        c        chrom                chrominance filtering enabled\n"
        "                        y        nochrom                chrominance filtering disabled\n"
        "hb        hdeblock        (2 threshold)                horizontal deblocking filter\n"
        "        1. difference factor: default=32, higher -> more deblocking\n"
        "        2. flatness threshold: default=39, lower -> more deblocking\n"
        "                        the h & v deblocking filters share these\n"
        "                        so you can't set different thresholds for h / v\n"
        "vb        vdeblock        (2 threshold)                vertical deblocking filter\n"
        "ha        hadeblock        (2 threshold)                horizontal deblocking filter\n"
        "va        vadeblock        (2 threshold)                vertical deblocking filter\n"
        "h1        x1hdeblock                                experimental h deblock filter 1\n"
        "v1        x1vdeblock                                experimental v deblock filter 1\n"
        "dr        dering                                        deringing filter\n"
        "al        autolevels                                automatic brightness / contrast\n"
        "                        f        fullyrange        stretch luminance to (0..255)\n"
        "lb        linblenddeint                                linear blend deinterlacer\n"
        "li        linipoldeint                                linear interpolating deinterlace\n"
        "ci        cubicipoldeint                                cubic interpolating deinterlacer\n"
        "md        mediandeint                                median deinterlacer\n"
        "fd        ffmpegdeint                                ffmpeg deinterlacer\n"
        "de        default                                        hb:a,vb:a,dr:a\n"
        "fa        fast                                        h1:a,v1:a,dr:a\n"
        "tn        tmpnoise        (3 threshold)                temporal noise reducer\n"
        "                        1. <= 2. <= 3.                larger -> stronger filtering\n"
        "fq        forceQuant        <quantizer>                force quantizer\n"
        ;
763

    
764
pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality)
765
{
766
        char temp[GET_MODE_BUFFER_SIZE];
767
        char *p= temp;
768
        char *filterDelimiters= ",/";
769
        char *optionDelimiters= ":";
770
        struct PPMode *ppMode;
771
        char *filterToken;
772

    
773
        ppMode= memalign(8, sizeof(PPMode));
774
        
775
        ppMode->lumMode= 0;
776
        ppMode->chromMode= 0;
777
        ppMode->maxTmpNoise[0]= 700;
778
        ppMode->maxTmpNoise[1]= 1500;
779
        ppMode->maxTmpNoise[2]= 3000;
780
        ppMode->maxAllowedY= 234;
781
        ppMode->minAllowedY= 16;
782
        ppMode->baseDcDiff= 256/8;
783
        ppMode->flatnessThreshold= 56-16-1;
784
        ppMode->maxClippedThreshold= 0.01;
785
        ppMode->error=0;
786

    
787
        strncpy(temp, name, GET_MODE_BUFFER_SIZE);
788

    
789
        if(verbose>1) printf("pp: %s\n", name);
790

    
791
        for(;;){
792
                char *filterName;
793
                int q= 1000000; //PP_QUALITY_MAX;
794
                int chrom=-1;
795
                char *option;
796
                char *options[OPTIONS_ARRAY_SIZE];
797
                int i;
798
                int filterNameOk=0;
799
                int numOfUnknownOptions=0;
800
                int enable=1; //does the user want us to enabled or disabled the filter
801

    
802
                filterToken= strtok(p, filterDelimiters);
803
                if(filterToken == NULL) break;
804
                p+= strlen(filterToken) + 1; // p points to next filterToken
805
                filterName= strtok(filterToken, optionDelimiters);
806
                if(verbose>1) printf("pp: %s::%s\n", filterToken, filterName);
807

    
808
                if(*filterName == '-')
809
                {
810
                        enable=0;
811
                        filterName++;
812
                }
813

    
814
                for(;;){ //for all options
815
                        option= strtok(NULL, optionDelimiters);
816
                        if(option == NULL) break;
817

    
818
                        if(verbose>1) printf("pp: option: %s\n", option);
819
                        if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
820
                        else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
821
                        else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
822
                        else
823
                        {
824
                                options[numOfUnknownOptions] = option;
825
                                numOfUnknownOptions++;
826
                        }
827
                        if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
828
                }
829
                options[numOfUnknownOptions] = NULL;
830

    
831
                /* replace stuff from the replace Table */
832
                for(i=0; replaceTable[2*i]!=NULL; i++)
833
                {
834
                        if(!strcmp(replaceTable[2*i], filterName))
835
                        {
836
                                int newlen= strlen(replaceTable[2*i + 1]);
837
                                int plen;
838
                                int spaceLeft;
839

    
840
                                if(p==NULL) p= temp, *p=0;         //last filter
841
                                else p--, *p=',';                //not last filter
842

    
843
                                plen= strlen(p);
844
                                spaceLeft= p - temp + plen;
845
                                if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE)
846
                                {
847
                                        ppMode->error++;
848
                                        break;
849
                                }
850
                                memmove(p + newlen, p, plen+1);
851
                                memcpy(p, replaceTable[2*i + 1], newlen);
852
                                filterNameOk=1;
853
                        }
854
                }
855

    
856
                for(i=0; filters[i].shortName!=NULL; i++)
857
                {
858
//                        printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
859
                        if(   !strcmp(filters[i].longName, filterName)
860
                           || !strcmp(filters[i].shortName, filterName))
861
                        {
862
                                ppMode->lumMode &= ~filters[i].mask;
863
                                ppMode->chromMode &= ~filters[i].mask;
864

    
865
                                filterNameOk=1;
866
                                if(!enable) break; // user wants to disable it
867

    
868
                                if(q >= filters[i].minLumQuality)
869
                                        ppMode->lumMode|= filters[i].mask;
870
                                if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
871
                                        if(q >= filters[i].minChromQuality)
872
                                                ppMode->chromMode|= filters[i].mask;
873

    
874
                                if(filters[i].mask == LEVEL_FIX)
875
                                {
876
                                        int o;
877
                                        ppMode->minAllowedY= 16;
878
                                        ppMode->maxAllowedY= 234;
879
                                        for(o=0; options[o]!=NULL; o++)
880
                                        {
881
                                                if(  !strcmp(options[o],"fullyrange")
882
                                                   ||!strcmp(options[o],"f"))
883
                                                {
884
                                                        ppMode->minAllowedY= 0;
885
                                                        ppMode->maxAllowedY= 255;
886
                                                        numOfUnknownOptions--;
887
                                                }
888
                                        }
889
                                }
890
                                else if(filters[i].mask == TEMP_NOISE_FILTER)
891
                                {
892
                                        int o;
893
                                        int numOfNoises=0;
894

    
895
                                        for(o=0; options[o]!=NULL; o++)
896
                                        {
897
                                                char *tail;
898
                                                ppMode->maxTmpNoise[numOfNoises]=
899
                                                        strtol(options[o], &tail, 0);
900
                                                if(tail!=options[o])
901
                                                {
902
                                                        numOfNoises++;
903
                                                        numOfUnknownOptions--;
904
                                                        if(numOfNoises >= 3) break;
905
                                                }
906
                                        }
907
                                }
908
                                else if(filters[i].mask == V_DEBLOCK   || filters[i].mask == H_DEBLOCK 
909
                                     || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK)
910
                                {
911
                                        int o;
912

    
913
                                        for(o=0; options[o]!=NULL && o<2; o++)
914
                                        {
915
                                                char *tail;
916
                                                int val= strtol(options[o], &tail, 0);
917
                                                if(tail==options[o]) break;
918

    
919
                                                numOfUnknownOptions--;
920
                                                if(o==0) ppMode->baseDcDiff= val;
921
                                                else ppMode->flatnessThreshold= val;
922
                                        }
923
                                }
924
                                else if(filters[i].mask == FORCE_QUANT)
925
                                {
926
                                        int o;
927
                                        ppMode->forcedQuant= 15;
928

    
929
                                        for(o=0; options[o]!=NULL && o<1; o++)
930
                                        {
931
                                                char *tail;
932
                                                int val= strtol(options[o], &tail, 0);
933
                                                if(tail==options[o]) break;
934

    
935
                                                numOfUnknownOptions--;
936
                                                ppMode->forcedQuant= val;
937
                                        }
938
                                }
939
                        }
940
                }
941
                if(!filterNameOk) ppMode->error++;
942
                ppMode->error += numOfUnknownOptions;
943
        }
944

    
945
        if(verbose>1) printf("pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
946
        if(ppMode->error)
947
        {
948
                fprintf(stderr, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
949
                free(ppMode);
950
                return NULL;
951
        }
952
        return ppMode;
953
}
954

    
955
/**
 * Releases a mode object.
 * @param mode mode to free; passing NULL does nothing
 */
void pp_free_mode(pp_mode_t *mode){
    /* free() already ignores NULL, no guard required */
    free(mode);
}
958

    
959
/**
 * Replaces the buffer at *p with a new zero-filled one.
 * The previous buffer (if any) is freed; its contents are not preserved.
 *
 * @param p         address of the buffer pointer; receives the new buffer,
 *                  or NULL if the allocation failed
 * @param alignment alignment passed to memalign()
 * @param size      size of the new buffer in bytes
 */
static void reallocAlign(void **p, int alignment, int size){
        free(*p);
        *p= memalign(alignment, size);
        /* do not clear through a NULL pointer when out of memory */
        if(*p) memset(*p, 0, size);
}
964

    
965
/**
 * (Re)allocates every work buffer of the context and records the strides
 * they were sized for. All buffers come back zero-filled, except yHistogram
 * whose 256 bins are seeded with a value derived from the picture area.
 *
 * @param c        context whose buffers are replaced
 * @param width    picture width in pixels
 * @param height   picture height in pixels
 * @param stride   line size used to size the pixel buffers
 * @param qpStride line size used to size the quantizer tables
 */
static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
        const int mbCols= (width+15)>>4;
        const int mbRows= (height+15)>>4;
        const int histogramSeed= width*height/64*15/256;
        int n;

        c->stride= stride;
        c->qpStride= qpStride;

        reallocAlign((void **)&c->tempDst, 8, stride*24);
        reallocAlign((void **)&c->tempSrc, 8, stride*24);
        reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
        reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
        for(n=0; n<256; n++){
                c->yHistogram[n]= histogramSeed;
        }

        for(n=0; n<3; n++){
                //Note: the +17*1024 is just there so I don't have to worry about r/w over the end
                reallocAlign((void **)&c->tempBlured[n], 8, stride*mbRows*16 + 17*1024);
                reallocAlign((void **)&c->tempBluredPast[n], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
        }

        reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
        reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbRows*sizeof(QP_STORE_T));
        reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbRows*sizeof(QP_STORE_T));
        reallocAlign((void **)&c->forcedQPTable, 8, mbCols*sizeof(QP_STORE_T));
}
992

    
993
static void global_init(void){
994
        int i;
995
        memset(clip_table, 0, 256);
996
        for(i=256; i<512; i++)
997
                clip_table[i]= i;
998
        memset(clip_table+512, 0, 256);
999
}
1000

    
1001
pp_context_t *pp_get_context(int width, int height, int cpuCaps){
1002
        PPContext *c= memalign(32, sizeof(PPContext));
1003
        int stride= (width+15)&(~15); //assumed / will realloc if needed
1004
        int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
1005
        
1006
        global_init();
1007

    
1008
        memset(c, 0, sizeof(PPContext));
1009
        c->cpuCaps= cpuCaps;
1010
        if(cpuCaps&PP_FORMAT){
1011
                c->hChromaSubSample= cpuCaps&0x3;
1012
                c->vChromaSubSample= (cpuCaps>>4)&0x3;
1013
        }else{
1014
                c->hChromaSubSample= 1;
1015
                c->vChromaSubSample= 1;
1016
        }
1017

    
1018
        reallocBuffers(c, width, height, stride, qpStride);
1019
        
1020
        c->frameNum=-1;
1021

    
1022
        return c;
1023
}
1024

    
1025
void pp_free_context(void *vc){
1026
        PPContext *c = (PPContext*)vc;
1027
        int i;
1028
        
1029
        for(i=0; i<3; i++) free(c->tempBlured[i]);
1030
        for(i=0; i<3; i++) free(c->tempBluredPast[i]);
1031
        
1032
        free(c->tempBlocks);
1033
        free(c->yHistogram);
1034
        free(c->tempDst);
1035
        free(c->tempSrc);
1036
        free(c->deintTemp);
1037
        free(c->stdQPTable);
1038
        free(c->nonBQPTable);
1039
        free(c->forcedQPTable);
1040
        
1041
        memset(c, 0, sizeof(PPContext));
1042

    
1043
        free(c);
1044
}
1045

    
1046
void  pp_postprocess(uint8_t * src[3], int srcStride[3],
1047
                 uint8_t * dst[3], int dstStride[3],
1048
                 int width, int height,
1049
                 QP_STORE_T *QP_store,  int QPStride,
1050
                 pp_mode_t *vm,  void *vc, int pict_type)
1051
{
1052
        int mbWidth = (width+15)>>4;
1053
        int mbHeight= (height+15)>>4;
1054
        PPMode *mode = (PPMode*)vm;
1055
        PPContext *c = (PPContext*)vc;
1056
        int minStride= MAX(srcStride[0], dstStride[0]);
1057

    
1058
        if(c->stride < minStride || c->qpStride < QPStride)
1059
                reallocBuffers(c, width, height, 
1060
                                MAX(minStride, c->stride), 
1061
                                MAX(c->qpStride, QPStride));
1062

    
1063
        if(QP_store==NULL || (mode->lumMode & FORCE_QUANT)) 
1064
        {
1065
                int i;
1066
                QP_store= c->forcedQPTable;
1067
                QPStride= 0;
1068
                if(mode->lumMode & FORCE_QUANT)
1069
                        for(i=0; i<mbWidth; i++) QP_store[i]= mode->forcedQuant;
1070
                else
1071
                        for(i=0; i<mbWidth; i++) QP_store[i]= 1;
1072
        }
1073
//printf("pict_type:%d\n", pict_type);
1074

    
1075
        if(pict_type & PP_PICT_TYPE_QP2){
1076
                int i;
1077
                const int count= mbHeight * QPStride;
1078
                for(i=0; i<(count>>2); i++){
1079
                        ((uint32_t*)c->stdQPTable)[i] = (((uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
1080
                }
1081
                for(i<<=2; i<count; i++){
1082
                        c->stdQPTable[i] = QP_store[i]>>1;
1083
                }
1084
                QP_store= c->stdQPTable;
1085
        }
1086

    
1087
if(0){
1088
int x,y;
1089
for(y=0; y<mbHeight; y++){
1090
        for(x=0; x<mbWidth; x++){
1091
                printf("%2d ", QP_store[x + y*QPStride]);
1092
        }
1093
        printf("\n");
1094
}
1095
        printf("\n");
1096
}
1097

    
1098
        if((pict_type&7)!=3)
1099
        {
1100
                int i;
1101
                const int count= mbHeight * QPStride;
1102
                for(i=0; i<(count>>2); i++){
1103
                        ((uint32_t*)c->nonBQPTable)[i] = ((uint32_t*)QP_store)[i] & 0x3F3F3F3F;
1104
                }
1105
                for(i<<=2; i<count; i++){
1106
                        c->nonBQPTable[i] = QP_store[i] & 0x3F;
1107
                }
1108
        }
1109

    
1110
        if(verbose>2)
1111
        {
1112
                printf("using npp filters 0x%X/0x%X\n", mode->lumMode, mode->chromMode);
1113
        }
1114

    
1115
        postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1116
                width, height, QP_store, QPStride, 0, mode, c);
1117

    
1118
        width  = (width )>>c->hChromaSubSample;
1119
        height = (height)>>c->vChromaSubSample;
1120

    
1121
        if(mode->chromMode)
1122
        {
1123
                postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1124
                        width, height, QP_store, QPStride, 1, mode, c);
1125
                postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1126
                        width, height, QP_store, QPStride, 2, mode, c);
1127
        }
1128
        else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2])
1129
        {
1130
                memcpy(dst[1], src[1], srcStride[1]*height);
1131
                memcpy(dst[2], src[2], srcStride[2]*height);
1132
        }
1133
        else
1134
        {
1135
                int y;
1136
                for(y=0; y<height; y++)
1137
                {
1138
                        memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1139
                        memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
1140
                }
1141
        }
1142
}
1143