Statistics
| Branch: | Revision:

ffmpeg / libavcodec / libpostproc / postprocess.c @ 5509bffa

History | View | Annotate | Download (44.5 KB)

1
/*
2
    Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3

4
    AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5

6
    This program is free software; you can redistribute it and/or modify
7
    it under the terms of the GNU General Public License as published by
8
    the Free Software Foundation; either version 2 of the License, or
9
    (at your option) any later version.
10

11
    This program is distributed in the hope that it will be useful,
12
    but WITHOUT ANY WARRANTY; without even the implied warranty of
13
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
    GNU General Public License for more details.
15

16
    You should have received a copy of the GNU General Public License
17
    along with this program; if not, write to the Free Software
18
    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
*/
20

    
21
/**
22
 * @file postprocess.c
23
 * postprocessing.
24
 */
25

    
26
/*
27
                        C       MMX     MMX2    3DNow   AltiVec
28
isVertDC                Ec      Ec                      Ec
29
isVertMinMaxOk          Ec      Ec                      Ec
30
doVertLowPass           E               e       e       Ec
31
doVertDefFilter         Ec      Ec      e       e       Ec
32
isHorizDC               Ec      Ec                      Ec
33
isHorizMinMaxOk         a       E                       Ec
34
doHorizLowPass          E               e       e       Ec
35
doHorizDefFilter        Ec      Ec      e       e       Ec
36
do_a_deblock            Ec      E       Ec      E
37
deRing                  E               e       e*      Ecp
38
Vertical RKAlgo1        E               a       a
39
Horizontal RKAlgo1                      a       a
40
Vertical X1#            a               E       E
41
Horizontal X1#          a               E       E
42
LinIpolDeinterlace      e               E       E*
43
CubicIpolDeinterlace    a               e       e*
44
LinBlendDeinterlace     e               E       E*
45
MedianDeinterlace#      E       Ec      Ec
46
TempDeNoiser#           E               e       e       Ec
47

48
* I don't have a 3DNow! CPU -> it's untested, but no one said it doesn't work, so it seems to work
49
# more or less self-invented filters, so the exactness isn't too meaningful
50
E = Exact implementation
51
e = almost exact implementation (slightly different rounding,...)
52
a = alternative / approximate impl
53
c = checked against the other implementations (-vo md5)
54
p = partially optimized, still some work to do
55
*/
56

    
57
/*
58
TODO:
59
reduce the time wasted on the mem transfer
60
unroll stuff if instructions depend too much on the prior one
61
move YScale thing to the end instead of fixing QP
62
write a faster and higher quality deblocking filter :)
63
make the mainloop more flexible (variable number of blocks at once
64
        (the if/else stuff per block is slowing things down)
65
compare the quality & speed of all filters
66
split this huge file
67
optimize c versions
68
try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
69
...
70
*/
71

    
72
//Changelog: use the CVS log
73

    
74
#include "config.h"
75
#include <inttypes.h>
76
#include <stdio.h>
77
#include <stdlib.h>
78
#include <string.h>
79
#ifdef HAVE_MALLOC_H
80
#include <malloc.h>
81
#endif
82
//#undef HAVE_MMX2
83
//#define HAVE_3DNOW
84
//#undef HAVE_MMX
85
//#undef ARCH_X86
86
//#define DEBUG_BRIGHTNESS
87
#ifdef USE_FASTMEMCPY
88
#include "fastmemcpy.h"
89
#endif
90
#include "postprocess.h"
91
#include "postprocess_internal.h"
92

    
93
#include "mangle.h" //FIXME should be supressed
94

    
95
#ifdef HAVE_ALTIVEC_H
96
#include <altivec.h>
97
#endif
98

    
99
#ifndef HAVE_MEMALIGN
100
#define memalign(a,b) malloc(b)
101
#endif
102

    
103
#define MIN(a,b) ((a) > (b) ? (b) : (a))
104
#define MAX(a,b) ((a) < (b) ? (b) : (a))
105
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
106
#define SIGN(a) ((a) > 0 ? 1 : -1)
107

    
108
#define GET_MODE_BUFFER_SIZE 500
109
#define OPTIONS_ARRAY_SIZE 10
110
#define BLOCK_SIZE 8
111
#define TEMP_STRIDE 8
112
//#define NUM_BLOCKS_AT_ONCE 16 //not used yet
113

    
114
#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
115
#    define attribute_used __attribute__((used))
116
#    define always_inline __attribute__((always_inline)) inline
117
#else
118
#    define attribute_used
119
#    define always_inline inline
120
#endif
121

    
122
#if defined(ARCH_X86) || defined(ARCH_X86_64)
123
static uint64_t __attribute__((aligned(8))) attribute_used w05= 0x0005000500050005LL;
124
static uint64_t __attribute__((aligned(8))) attribute_used w04= 0x0004000400040004LL;
125
static uint64_t __attribute__((aligned(8))) attribute_used w20= 0x0020002000200020LL;
126
static uint64_t __attribute__((aligned(8))) attribute_used b00= 0x0000000000000000LL;
127
static uint64_t __attribute__((aligned(8))) attribute_used b01= 0x0101010101010101LL;
128
static uint64_t __attribute__((aligned(8))) attribute_used b02= 0x0202020202020202LL;
129
static uint64_t __attribute__((aligned(8))) attribute_used b08= 0x0808080808080808LL;
130
static uint64_t __attribute__((aligned(8))) attribute_used b80= 0x8080808080808080LL;
131
#endif
132

    
133
static uint8_t clip_table[3*256];
134
static uint8_t * const clip_tab= clip_table + 256;
135

    
136
static const int verbose= 0;
137

    
138
static const int attribute_used deringThreshold= 20;
139

    
140

    
141
static struct PPFilter filters[]=
142
{
143
        {"hb", "hdeblock",              1, 1, 3, H_DEBLOCK},
144
        {"vb", "vdeblock",              1, 2, 4, V_DEBLOCK},
145
/*      {"hr", "rkhdeblock",            1, 1, 3, H_RK1_FILTER},
146
        {"vr", "rkvdeblock",            1, 2, 4, V_RK1_FILTER},*/
147
        {"h1", "x1hdeblock",            1, 1, 3, H_X1_FILTER},
148
        {"v1", "x1vdeblock",            1, 2, 4, V_X1_FILTER},
149
        {"ha", "ahdeblock",             1, 1, 3, H_A_DEBLOCK},
150
        {"va", "avdeblock",             1, 2, 4, V_A_DEBLOCK},
151
        {"dr", "dering",                1, 5, 6, DERING},
152
        {"al", "autolevels",            0, 1, 2, LEVEL_FIX},
153
        {"lb", "linblenddeint",         1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
154
        {"li", "linipoldeint",          1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
155
        {"ci", "cubicipoldeint",        1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
156
        {"md", "mediandeint",           1, 1, 4, MEDIAN_DEINT_FILTER},
157
        {"fd", "ffmpegdeint",           1, 1, 4, FFMPEG_DEINT_FILTER},
158
        {"l5", "lowpass5",              1, 1, 4, LOWPASS5_DEINT_FILTER},
159
        {"tn", "tmpnoise",              1, 7, 8, TEMP_NOISE_FILTER},
160
        {"fq", "forcequant",            1, 0, 0, FORCE_QUANT},
161
        {NULL, NULL,0,0,0,0} //End Marker
162
};
163

    
164
static char *replaceTable[]=
165
{
166
        "default",      "hdeblock:a,vdeblock:a,dering:a",
167
        "de",           "hdeblock:a,vdeblock:a,dering:a",
168
        "fast",         "x1hdeblock:a,x1vdeblock:a,dering:a",
169
        "fa",           "x1hdeblock:a,x1vdeblock:a,dering:a",
170
        "ac",           "ha:a:128:7,va:a,dering:a",
171
        NULL //End Marker
172
};
173

    
174

    
175
#if defined(ARCH_X86) || defined(ARCH_X86_64)
/*
 * One-instruction wrappers around the x86 prefetch family. Each helper emits
 * exactly the instruction it is named after, with the pointer p held in a
 * register and used as the memory operand. Nothing is read or written from C.
 */

/** Emit "prefetchnta" for the address p. */
static inline void prefetchnta(void *p)
{
        asm volatile("prefetchnta (%0)\n\t" : : "r" (p));
}

/** Emit "prefetcht0" for the address p. */
static inline void prefetcht0(void *p)
{
        asm volatile("prefetcht0 (%0)\n\t" : : "r" (p));
}

/** Emit "prefetcht1" for the address p. */
static inline void prefetcht1(void *p)
{
        asm volatile("prefetcht1 (%0)\n\t" : : "r" (p));
}

/** Emit "prefetcht2" for the address p. */
static inline void prefetcht2(void *p)
{
        asm volatile("prefetcht2 (%0)\n\t" : : "r" (p));
}
#endif
204

    
205
// The horizontal functions exist only in C because the MMX code is faster with vertical filters and transposing
206

    
207
/**
208
 * Check if the given 8x8 Block is mostly "flat"
209
 */
210
static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c)
211
{
212
        int numEq= 0;
213
        int y;
214
        const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
215
        const int dcThreshold= dcOffset*2 + 1;
216

    
217
        for(y=0; y<BLOCK_SIZE; y++)
218
        {
219
                if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
220
                if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
221
                if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
222
                if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
223
                if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
224
                if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
225
                if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
226
                src+= stride;
227
        }
228
        return numEq > c->ppMode.flatnessThreshold;
229
}
230

    
231
/**
232
 * Check if the middle 8x8 Block in the given 8x16 block is flat
233
 */
234
static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c){
235
        int numEq= 0;
236
        int y;
237
        const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
238
        const int dcThreshold= dcOffset*2 + 1;
239

    
240
        src+= stride*4; // src points to begin of the 8x8 Block
241
        for(y=0; y<BLOCK_SIZE-1; y++)
242
        {
243
                if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
244
                if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
245
                if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
246
                if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
247
                if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
248
                if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
249
                if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
250
                if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
251
                src+= stride;
252
        }
253
        return numEq > c->ppMode.flatnessThreshold;
254
}
255

    
256
/**
 * Cheap horizontal range check on 8 rows.
 *
 * Only one pixel pair per row is sampled. The sampled columns cycle every
 * four rows: (0,5), (2,7), (4,1), (6,3). A row fails when
 * (unsigned)(left - right + 2*QP) > 4*QP, i.e. when the two samples differ
 * by more than 2*QP in either direction.
 *
 * @param src    first pixel of the first row
 * @param stride distance in bytes between two rows
 * @param QP     quantiser used to scale the tolerance
 * @return 1 if every sampled pair is within tolerance, 0 otherwise
 */
static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP)
{
        int row;

        for(row=0; row<8; row++, src+= stride){
                const int left=  (2*row) & 7;      // 0,2,4,6,0,2,4,6
                const int right= (left + 5) & 7;   // 5,7,1,3,5,7,1,3

                if((unsigned)(src[left] - src[right] + 2*QP) > 4*QP)
                        return 0;
        }
        return 1;
}
278

    
279
static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP)
280
{
281
#if 1
282
#if 1
283
        int x;
284
        src+= stride*4;
285
        for(x=0; x<BLOCK_SIZE; x+=4)
286
        {
287
                if((unsigned)(src[  x + 0*stride] - src[  x + 5*stride] + 2*QP) > 4*QP) return 0;
288
                if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
289
                if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
290
                if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
291
        }
292
#else
293
        int x;
294
        src+= stride*3;
295
        for(x=0; x<BLOCK_SIZE; x++)
296
        {
297
                if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
298
        }
299
#endif
300
        return 1;
301
#else
302
        int x;
303
        src+= stride*4;
304
        for(x=0; x<BLOCK_SIZE; x++)
305
        {
306
                int min=255;
307
                int max=0;
308
                int y;
309
                for(y=0; y<8; y++){
310
                        int v= src[x + y*stride];
311
                        if(v>max) max=v;
312
                        if(v<min) min=v;
313
                }
314
                if(max-min > 2*QP) return 0;
315
        }
316
        return 1;
317
#endif
318
}
319

    
320
/**
 * Classify a block for horizontal filtering.
 *
 * @return 2 if isHorizDC_C() reports the block as not flat,
 *         1 if it is flat and isHorizMinMaxOk_C() accepts it,
 *         0 if it is flat but isHorizMinMaxOk_C() rejects it
 */
static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c){
        if( !isHorizDC_C(src, stride, c) )
                return 2;

        return isHorizMinMaxOk_C(src, stride, c->QP) ? 1 : 0;
}
330

    
331
/**
 * Classify a block for vertical filtering.
 *
 * @return 2 if isVertDC_C() reports the block as not flat,
 *         1 if it is flat and isVertMinMaxOk_C() accepts it,
 *         0 if it is flat but isVertMinMaxOk_C() rejects it
 */
static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c){
        if( !isVertDC_C(src, stride, c) )
                return 2;

        return isVertMinMaxOk_C(src, stride, c->QP) ? 1 : 0;
}
341

    
342
/**
 * Default horizontal deblocking filter (C version).
 *
 * For each of the BLOCK_SIZE rows the step between dst[3] and dst[4] is
 * examined. If the "middle energy" 5*(dst[4]-dst[3]) + 2*(dst[2]-dst[5]) is
 * below 8*QP in magnitude, a correction d is derived from how much the
 * middle energy exceeds the smaller of the left and right energies. d is
 * then limited to the range between 0 and (dst[3]-dst[4])/2 and is
 * subtracted from dst[3] and added to dst[4]. Only those two pixels are
 * ever modified.
 *
 * @param dst    first pixel of the first row (columns 0..7 are read)
 * @param stride distance in bytes between two rows
 * @param c      context; only c->QP is read
 */
static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c)
{
        int row;

        for(row=0; row<BLOCK_SIZE; row++, dst+= stride)
        {
                const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
                int leftEnergy, rightEnergy, limit, d;

                if(ABS(middleEnergy) >= 8*c->QP)
                        continue;

                limit=       (dst[3] - dst[4])/2;
                leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
                rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);

                d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
                if(d < 0) d= 0;
                d= (5*d + 32) >> 6;

                // the correction has the opposite sign of middleEnergy (0 counts as positive)
                if(middleEnergy >= 0) d= -d;

                // clamp d to the interval between 0 and limit
                if(limit > 0)
                {
                        if(d < 0)     d= 0;
                        if(d > limit) d= limit;
                }
                else
                {
                        if(d > 0)     d= 0;
                        if(d < limit) d= limit;
                }

                dst[3]-= d;
                dst[4]+= d;
        }
}
378

    
379
/**
380
 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
381
 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
382
 */
383
static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c)
384
{
385
        int y;
386
        for(y=0; y<BLOCK_SIZE; y++)
387
        {
388
                const int first= ABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
389
                const int last= ABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
390

    
391
                int sums[10];
392
                sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
393
                sums[1] = sums[0] - first  + dst[3];
394
                sums[2] = sums[1] - first  + dst[4];
395
                sums[3] = sums[2] - first  + dst[5];
396
                sums[4] = sums[3] - first  + dst[6];
397
                sums[5] = sums[4] - dst[0] + dst[7];
398
                sums[6] = sums[5] - dst[1] + last;
399
                sums[7] = sums[6] - dst[2] + last;
400
                sums[8] = sums[7] - dst[3] + last;
401
                sums[9] = sums[8] - dst[4] + last;
402

    
403
                dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
404
                dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
405
                dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
406
                dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
407
                dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
408
                dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
409
                dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
410
                dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
411

    
412
                dst+= stride;
413
        }
414
}
415

    
416
/**
417
 * Experimental Filter 1 (Horizontal)
418
 * will not damage linear gradients
419
 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
420
 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
421
 * MMX2 version does correct clipping C version doesnt
422
 * not identical with the vertical one
423
 */
424
static inline void horizX1Filter(uint8_t *src, int stride, int QP)
425
{
426
        int y;
427
        static uint64_t *lut= NULL;
428
        if(lut==NULL)
429
        {
430
                int i;
431
                lut= (uint64_t*)memalign(8, 256*8);
432
                for(i=0; i<256; i++)
433
                {
434
                        int v= i < 128 ? 2*i : 2*(i-256);
435
/*
436
//Simulate 112242211 9-Tap filter
437
                        uint64_t a= (v/16) & 0xFF;
438
                        uint64_t b= (v/8) & 0xFF;
439
                        uint64_t c= (v/4) & 0xFF;
440
                        uint64_t d= (3*v/8) & 0xFF;
441
*/
442
//Simulate piecewise linear interpolation
443
                        uint64_t a= (v/16) & 0xFF;
444
                        uint64_t b= (v*3/16) & 0xFF;
445
                        uint64_t c= (v*5/16) & 0xFF;
446
                        uint64_t d= (7*v/16) & 0xFF;
447
                        uint64_t A= (0x100 - a)&0xFF;
448
                        uint64_t B= (0x100 - b)&0xFF;
449
                        uint64_t C= (0x100 - c)&0xFF;
450
                        uint64_t D= (0x100 - c)&0xFF;
451

    
452
                        lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
453
                                (D<<24) | (C<<16) | (B<<8) | (A);
454
                        //lut[i] = (v<<32) | (v<<24);
455
                }
456
        }
457

    
458
        for(y=0; y<BLOCK_SIZE; y++)
459
        {
460
                int a= src[1] - src[2];
461
                int b= src[3] - src[4];
462
                int c= src[5] - src[6];
463

    
464
                int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
465

    
466
                if(d < QP)
467
                {
468
                        int v = d * SIGN(-b);
469

    
470
                        src[1] +=v/8;
471
                        src[2] +=v/4;
472
                        src[3] +=3*v/8;
473
                        src[4] -=3*v/8;
474
                        src[5] -=v/4;
475
                        src[6] -=v/8;
476

    
477
                }
478
                src+=stride;
479
        }
480
}
481

    
482
/**
483
 * accurate deblock filter
484
 */
485
static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){
486
        int y;
487
        const int QP= c->QP;
488
        const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
489
        const int dcThreshold= dcOffset*2 + 1;
490
//START_TIMER
491
        src+= step*4; // src points to begin of the 8x8 Block
492
        for(y=0; y<8; y++){
493
                int numEq= 0;
494

    
495
                if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
496
                if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
497
                if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
498
                if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
499
                if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
500
                if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
501
                if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
502
                if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
503
                if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
504
                if(numEq > c->ppMode.flatnessThreshold){
505
                        int min, max, x;
506

    
507
                        if(src[0] > src[step]){
508
                            max= src[0];
509
                            min= src[step];
510
                        }else{
511
                            max= src[step];
512
                            min= src[0];
513
                        }
514
                        for(x=2; x<8; x+=2){
515
                                if(src[x*step] > src[(x+1)*step]){
516
                                        if(src[x    *step] > max) max= src[ x   *step];
517
                                        if(src[(x+1)*step] < min) min= src[(x+1)*step];
518
                                }else{
519
                                        if(src[(x+1)*step] > max) max= src[(x+1)*step];
520
                                        if(src[ x   *step] < min) min= src[ x   *step];
521
                                }
522
                        }
523
                        if(max-min < 2*QP){
524
                                const int first= ABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
525
                                const int last= ABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
526

    
527
                                int sums[10];
528
                                sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
529
                                sums[1] = sums[0] - first       + src[3*step];
530
                                sums[2] = sums[1] - first       + src[4*step];
531
                                sums[3] = sums[2] - first       + src[5*step];
532
                                sums[4] = sums[3] - first       + src[6*step];
533
                                sums[5] = sums[4] - src[0*step] + src[7*step];
534
                                sums[6] = sums[5] - src[1*step] + last;
535
                                sums[7] = sums[6] - src[2*step] + last;
536
                                sums[8] = sums[7] - src[3*step] + last;
537
                                sums[9] = sums[8] - src[4*step] + last;
538

    
539
                                src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
540
                                src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
541
                                src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
542
                                src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
543
                                src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
544
                                src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
545
                                src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
546
                                src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
547
                        }
548
                }else{
549
                        const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
550

    
551
                        if(ABS(middleEnergy) < 8*QP)
552
                        {
553
                                const int q=(src[3*step] - src[4*step])/2;
554
                                const int leftEnergy=  5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
555
                                const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
556

    
557
                                int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
558
                                d= MAX(d, 0);
559

    
560
                                d= (5*d + 32) >> 6;
561
                                d*= SIGN(-middleEnergy);
562

    
563
                                if(q>0)
564
                                {
565
                                        d= d<0 ? 0 : d;
566
                                        d= d>q ? q : d;
567
                                }
568
                                else
569
                                {
570
                                        d= d>0 ? 0 : d;
571
                                        d= d<q ? q : d;
572
                                }
573

    
574
                                src[3*step]-= d;
575
                                src[4*step]+= d;
576
                        }
577
                }
578

    
579
                src += stride;
580
        }
581
/*if(step==16){
582
    STOP_TIMER("step16")
583
}else{
584
    STOP_TIMER("stepX")
585
}*/
586
}
587

    
588
/*
 * Template instantiation.
 *
 * postprocess_template.c is included once per CPU variant. Before each
 * include, the HAVE_* macros are set to describe that variant and RENAME()
 * is redefined so that the names produced through it get a variant suffix
 * (_C, _altivec, _MMX, _MMX2, _3DNow).
 *
 * Note: there are C, MMX, MMX2 and 3DNow versions; there is no combined
 * 3DNow+MMX2 one.
 */

/* Step 1: decide which variants get compiled. */

// plain C: needed without MMX, or as fallback for runtime detection
#if !defined(HAVE_MMX) || defined(RUNTIME_CPUDETECT)
#define COMPILE_C
#endif

#if defined(ARCH_POWERPC) && defined(HAVE_ALTIVEC)
#define COMPILE_ALTIVEC
#endif

#if defined(ARCH_X86) || defined(ARCH_X86_64)

// without runtime detection only the best configured x86 variant is built
#if (defined(HAVE_MMX) && !defined(HAVE_3DNOW) && !defined(HAVE_MMX2)) || defined(RUNTIME_CPUDETECT)
#define COMPILE_MMX
#endif

#if defined(HAVE_MMX2) || defined(RUNTIME_CPUDETECT)
#define COMPILE_MMX2
#endif

#if (defined(HAVE_3DNOW) && !defined(HAVE_MMX2)) || defined(RUNTIME_CPUDETECT)
#define COMPILE_3DNOW
#endif

#endif //ARCH_X86

/* Step 2: start from a clean slate; each variant below sets what it needs. */
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#undef HAVE_ALTIVEC

/* Step 3: one include per selected variant. */

//Plain C versions
#ifdef COMPILE_C
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define RENAME(a) a ## _C
#include "postprocess_template.c"
#endif

//AltiVec versions
#if defined(ARCH_POWERPC) && defined(COMPILE_ALTIVEC)
#undef RENAME
#define HAVE_ALTIVEC
#define RENAME(a) a ## _altivec
#include "postprocess_altivec_template.c"
#include "postprocess_template.c"
#endif

//MMX versions
#ifdef COMPILE_MMX
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define RENAME(a) a ## _MMX
#include "postprocess_template.c"
#endif

//MMX2 versions
#ifdef COMPILE_MMX2
#undef RENAME
#define HAVE_MMX
#define HAVE_MMX2
#undef HAVE_3DNOW
#define RENAME(a) a ## _MMX2
#include "postprocess_template.c"
#endif

//3DNOW versions
#ifdef COMPILE_3DNOW
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#define HAVE_3DNOW
#define RENAME(a) a ## _3DNow
#include "postprocess_template.c"
#endif
667

    
668
// minor note: the HAVE_xyz macros are messed up after this point, so don't use them
669

    
670
static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
671
        QP_STORE_T QPs[], int QPStride, int isColor, pp_mode_t *vm, pp_context_t *vc)
672
{
673
        PPContext *c= (PPContext *)vc;
674
        PPMode *ppMode= (PPMode *)vm;
675
        c->ppMode= *ppMode; //FIXME
676

    
677
        // useing ifs here as they are faster than function pointers allthough the
678
        // difference wouldnt be messureable here but its much better because
679
        // someone might exchange the cpu whithout restarting mplayer ;)
680
#ifdef RUNTIME_CPUDETECT
681
#if defined(ARCH_X86) || defined(ARCH_X86_64)
682
        // ordered per speed fasterst first
683
        if(c->cpuCaps & PP_CPU_CAPS_MMX2)
684
                postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
685
        else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
686
                postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
687
        else if(c->cpuCaps & PP_CPU_CAPS_MMX)
688
                postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
689
        else
690
                postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
691
#else
692
#ifdef ARCH_POWERPC
693
#ifdef HAVE_ALTIVEC
694
        if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC)
695
                postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
696
        else
697
#endif
698
#endif
699
                postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
700
#endif
701
#else //RUNTIME_CPUDETECT
702
#ifdef HAVE_MMX2
703
                postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
704
#elif defined (HAVE_3DNOW)
705
                postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
706
#elif defined (HAVE_MMX)
707
                postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
708
#elif defined (HAVE_ALTIVEC)
709
                postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
710
#else
711
                postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
712
#endif
713
#endif //!RUNTIME_CPUDETECT
714
}
715

    
716
//static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
717
//        QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
718

    
719
/* -pp Command line Help
720
*/
721
/*
 * Help text for the postprocessing mode string: one row per filter with its
 * short name, long name and options, followed by the syntax of the string
 * and a few examples. The adjacent string literals below are concatenated
 * by the compiler into a single newline-separated text block.
 */
char *pp_help=
722
"Available postprocessing filters:\n"
723
"Filters                        Options\n"
724
"short  long name       short   long option     Description\n"
725
"*      *               a       autoq           CPU power dependent enabler\n"
726
"                       c       chrom           chrominance filtering enabled\n"
727
"                       y       nochrom         chrominance filtering disabled\n"
728
"                       n       noluma          luma filtering disabled\n"
729
"hb     hdeblock        (2 threshold)           horizontal deblocking filter\n"
730
"       1. difference factor: default=32, higher -> more deblocking\n"
731
"       2. flatness threshold: default=39, lower -> more deblocking\n"
732
"                       the h & v deblocking filters share these\n"
733
"                       so you can't set different thresholds for h / v\n"
734
"vb     vdeblock        (2 threshold)           vertical deblocking filter\n"
735
"ha     hadeblock       (2 threshold)           horizontal deblocking filter\n"
736
"va     vadeblock       (2 threshold)           vertical deblocking filter\n"
737
"h1     x1hdeblock                              experimental h deblock filter 1\n"
738
"v1     x1vdeblock                              experimental v deblock filter 1\n"
739
"dr     dering                                  deringing filter\n"
740
"al     autolevels                              automatic brightness / contrast\n"
741
"                       f        fullyrange     stretch luminance to (0..255)\n"
742
"lb     linblenddeint                           linear blend deinterlacer\n"
743
"li     linipoldeint                            linear interpolating deinterlace\n"
744
"ci     cubicipoldeint                          cubic interpolating deinterlacer\n"
745
"md     mediandeint                             median deinterlacer\n"
746
"fd     ffmpegdeint                             ffmpeg deinterlacer\n"
747
"l5     lowpass5                                FIR lowpass deinterlacer\n"
748
"de     default                                 hb:a,vb:a,dr:a\n"
749
"fa     fast                                    h1:a,v1:a,dr:a\n"
750
"ac                                             ha:a:128:7,va:a,dr:a\n"
751
"tn     tmpnoise        (3 threshold)           temporal noise reducer\n"
752
"                     1. <= 2. <= 3.            larger -> stronger filtering\n"
753
"fq     forceQuant      <quantizer>             force quantizer\n"
754
"Usage:\n"
755
"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
756
"long form example:\n"
757
"vdeblock:autoq/hdeblock:autoq/linblenddeint    default,-vdeblock\n"
758
"short form example:\n"
759
"vb:a/hb:a/lb                                   de,-vb\n"
760
"more examples:\n"
761
"tn:64:128:256\n"
762
;
763

    
764
pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality)
765
{
766
        char temp[GET_MODE_BUFFER_SIZE];
767
        char *p= temp;
768
        char *filterDelimiters= ",/";
769
        char *optionDelimiters= ":";
770
        struct PPMode *ppMode;
771
        char *filterToken;
772

    
773
        ppMode= memalign(8, sizeof(PPMode));
774

    
775
        ppMode->lumMode= 0;
776
        ppMode->chromMode= 0;
777
        ppMode->maxTmpNoise[0]= 700;
778
        ppMode->maxTmpNoise[1]= 1500;
779
        ppMode->maxTmpNoise[2]= 3000;
780
        ppMode->maxAllowedY= 234;
781
        ppMode->minAllowedY= 16;
782
        ppMode->baseDcDiff= 256/8;
783
        ppMode->flatnessThreshold= 56-16-1;
784
        ppMode->maxClippedThreshold= 0.01;
785
        ppMode->error=0;
786

    
787
        strncpy(temp, name, GET_MODE_BUFFER_SIZE);
788

    
789
        if(verbose>1) printf("pp: %s\n", name);
790

    
791
        for(;;){
792
                char *filterName;
793
                int q= 1000000; //PP_QUALITY_MAX;
794
                int chrom=-1;
795
                int luma=-1;
796
                char *option;
797
                char *options[OPTIONS_ARRAY_SIZE];
798
                int i;
799
                int filterNameOk=0;
800
                int numOfUnknownOptions=0;
801
                int enable=1; //does the user want us to enabled or disabled the filter
802

    
803
                filterToken= strtok(p, filterDelimiters);
804
                if(filterToken == NULL) break;
805
                p+= strlen(filterToken) + 1; // p points to next filterToken
806
                filterName= strtok(filterToken, optionDelimiters);
807
                if(verbose>1) printf("pp: %s::%s\n", filterToken, filterName);
808

    
809
                if(*filterName == '-')
810
                {
811
                        enable=0;
812
                        filterName++;
813
                }
814

    
815
                for(;;){ //for all options
816
                        option= strtok(NULL, optionDelimiters);
817
                        if(option == NULL) break;
818

    
819
                        if(verbose>1) printf("pp: option: %s\n", option);
820
                        if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
821
                        else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
822
                        else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
823
                        else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
824
                        else
825
                        {
826
                                options[numOfUnknownOptions] = option;
827
                                numOfUnknownOptions++;
828
                        }
829
                        if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
830
                }
831
                options[numOfUnknownOptions] = NULL;
832

    
833
                /* replace stuff from the replace Table */
834
                for(i=0; replaceTable[2*i]!=NULL; i++)
835
                {
836
                        if(!strcmp(replaceTable[2*i], filterName))
837
                        {
838
                                int newlen= strlen(replaceTable[2*i + 1]);
839
                                int plen;
840
                                int spaceLeft;
841

    
842
                                if(p==NULL) p= temp, *p=0;      //last filter
843
                                else p--, *p=',';               //not last filter
844

    
845
                                plen= strlen(p);
846
                                spaceLeft= p - temp + plen;
847
                                if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE)
848
                                {
849
                                        ppMode->error++;
850
                                        break;
851
                                }
852
                                memmove(p + newlen, p, plen+1);
853
                                memcpy(p, replaceTable[2*i + 1], newlen);
854
                                filterNameOk=1;
855
                        }
856
                }
857

    
858
                for(i=0; filters[i].shortName!=NULL; i++)
859
                {
860
//                        printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
861
                        if(   !strcmp(filters[i].longName, filterName)
862
                           || !strcmp(filters[i].shortName, filterName))
863
                        {
864
                                ppMode->lumMode &= ~filters[i].mask;
865
                                ppMode->chromMode &= ~filters[i].mask;
866

    
867
                                filterNameOk=1;
868
                                if(!enable) break; // user wants to disable it
869

    
870
                                if(q >= filters[i].minLumQuality && luma)
871
                                        ppMode->lumMode|= filters[i].mask;
872
                                if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
873
                                        if(q >= filters[i].minChromQuality)
874
                                                ppMode->chromMode|= filters[i].mask;
875

    
876
                                if(filters[i].mask == LEVEL_FIX)
877
                                {
878
                                        int o;
879
                                        ppMode->minAllowedY= 16;
880
                                        ppMode->maxAllowedY= 234;
881
                                        for(o=0; options[o]!=NULL; o++)
882
                                        {
883
                                                if(  !strcmp(options[o],"fullyrange")
884
                                                   ||!strcmp(options[o],"f"))
885
                                                {
886
                                                        ppMode->minAllowedY= 0;
887
                                                        ppMode->maxAllowedY= 255;
888
                                                        numOfUnknownOptions--;
889
                                                }
890
                                        }
891
                                }
892
                                else if(filters[i].mask == TEMP_NOISE_FILTER)
893
                                {
894
                                        int o;
895
                                        int numOfNoises=0;
896

    
897
                                        for(o=0; options[o]!=NULL; o++)
898
                                        {
899
                                                char *tail;
900
                                                ppMode->maxTmpNoise[numOfNoises]=
901
                                                        strtol(options[o], &tail, 0);
902
                                                if(tail!=options[o])
903
                                                {
904
                                                        numOfNoises++;
905
                                                        numOfUnknownOptions--;
906
                                                        if(numOfNoises >= 3) break;
907
                                                }
908
                                        }
909
                                }
910
                                else if(filters[i].mask == V_DEBLOCK   || filters[i].mask == H_DEBLOCK
911
                                     || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK)
912
                                {
913
                                        int o;
914

    
915
                                        for(o=0; options[o]!=NULL && o<2; o++)
916
                                        {
917
                                                char *tail;
918
                                                int val= strtol(options[o], &tail, 0);
919
                                                if(tail==options[o]) break;
920

    
921
                                                numOfUnknownOptions--;
922
                                                if(o==0) ppMode->baseDcDiff= val;
923
                                                else ppMode->flatnessThreshold= val;
924
                                        }
925
                                }
926
                                else if(filters[i].mask == FORCE_QUANT)
927
                                {
928
                                        int o;
929
                                        ppMode->forcedQuant= 15;
930

    
931
                                        for(o=0; options[o]!=NULL && o<1; o++)
932
                                        {
933
                                                char *tail;
934
                                                int val= strtol(options[o], &tail, 0);
935
                                                if(tail==options[o]) break;
936

    
937
                                                numOfUnknownOptions--;
938
                                                ppMode->forcedQuant= val;
939
                                        }
940
                                }
941
                        }
942
                }
943
                if(!filterNameOk) ppMode->error++;
944
                ppMode->error += numOfUnknownOptions;
945
        }
946

    
947
        if(verbose>1) printf("pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
948
        if(ppMode->error)
949
        {
950
                fprintf(stderr, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
951
                free(ppMode);
952
                return NULL;
953
        }
954
        return ppMode;
955
}
956

    
957
/**
 * Releases a mode returned by pp_get_mode_by_name_and_quality().
 * A NULL mode is accepted and ignored.
 */
void pp_free_mode(pp_mode_t *mode)
{
    /* free() is a no-op for NULL, so no separate check is required */
    free(mode);
}
960

    
961
/**
 * Replaces *p with a fresh zero-filled allocation of the given alignment.
 *
 * The previous buffer is released and its contents are NOT carried over.
 *
 * @param p         address of the pointer to replace; *p may be NULL
 * @param alignment alignment passed on to memalign()
 * @param size      number of bytes to allocate and clear
 *
 * If size is negative or the allocation fails, *p is left NULL instead of
 * being written through.
 */
static void reallocAlign(void **p, int alignment, int size){
        free(*p);       // free(NULL) is a no-op
        *p= NULL;       // never leave a dangling pointer behind on failure

        if(size < 0) return;

        *p= memalign(alignment, size);
        if(*p != NULL) memset(*p, 0, size);
}
966

    
967
/**
 * (Re)allocates every work buffer of a context for the given geometry.
 *
 * All buffers are replaced through reallocAlign(), so they come back
 * zero-filled and previous contents are lost. The new stride and qpStride
 * are recorded in the context.
 *
 * @param c        context whose buffers are replaced
 * @param width    picture width in pixels
 * @param height   picture height in pixels
 * @param stride   line size the temporary picture buffers are sized for
 * @param qpStride line size the QP tables are sized for
 */
static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride)
{
        /* picture size in 16x16 blocks, rounded up */
        const int mbWidth = (width+15)>>4;
        const int mbHeight= (height+15)>>4;
        /* start value written to every histogram bin */
        const int histogramSeed= width*height/64*15/256;
        const int qpTableSize= qpStride*mbHeight*sizeof(QP_STORE_T);
        int bin;
        int plane;

        c->stride= stride;
        c->qpStride= qpStride;

        reallocAlign((void **)&c->tempDst, 8, stride*24);
        reallocAlign((void **)&c->tempSrc, 8, stride*24);
        reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);

        reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
        for(bin=0; bin<256; bin++)
                c->yHistogram[bin]= histogramSeed;

        for(plane=0; plane<3; plane++)
        {
                /* the extra 17*1024 bytes are slack so that reads/writes
                   past the end of the buffer need no special handling */
                reallocAlign((void **)&c->tempBlured[plane], 8,
                             stride*mbHeight*16 + 17*1024);
                reallocAlign((void **)&c->tempBluredPast[plane], 8,
                             256*((height+7)&(~7))/2 + 17*1024);//FIXME size
        }

        reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
        reallocAlign((void **)&c->nonBQPTable, 8, qpTableSize);
        reallocAlign((void **)&c->stdQPTable, 8, qpTableSize);
        reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
}
994

    
995
static void global_init(void){
996
        int i;
997
        memset(clip_table, 0, 256);
998
        for(i=256; i<512; i++)
999
                clip_table[i]= i;
1000
        memset(clip_table+512, 0, 256);
1001
}
1002

    
1003
pp_context_t *pp_get_context(int width, int height, int cpuCaps){
1004
        PPContext *c= memalign(32, sizeof(PPContext));
1005
        int stride= (width+15)&(~15);    //assumed / will realloc if needed
1006
        int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
1007

    
1008
        global_init();
1009

    
1010
        memset(c, 0, sizeof(PPContext));
1011
        c->cpuCaps= cpuCaps;
1012
        if(cpuCaps&PP_FORMAT){
1013
                c->hChromaSubSample= cpuCaps&0x3;
1014
                c->vChromaSubSample= (cpuCaps>>4)&0x3;
1015
        }else{
1016
                c->hChromaSubSample= 1;
1017
                c->vChromaSubSample= 1;
1018
        }
1019

    
1020
        reallocBuffers(c, width, height, stride, qpStride);
1021

    
1022
        c->frameNum=-1;
1023

    
1024
        return c;
1025
}
1026

    
1027
void pp_free_context(void *vc){
1028
        PPContext *c = (PPContext*)vc;
1029
        int i;
1030

    
1031
        for(i=0; i<3; i++) free(c->tempBlured[i]);
1032
        for(i=0; i<3; i++) free(c->tempBluredPast[i]);
1033

    
1034
        free(c->tempBlocks);
1035
        free(c->yHistogram);
1036
        free(c->tempDst);
1037
        free(c->tempSrc);
1038
        free(c->deintTemp);
1039
        free(c->stdQPTable);
1040
        free(c->nonBQPTable);
1041
        free(c->forcedQPTable);
1042

    
1043
        memset(c, 0, sizeof(PPContext));
1044

    
1045
        free(c);
1046
}
1047

    
1048
void  pp_postprocess(uint8_t * src[3], int srcStride[3],
1049
                 uint8_t * dst[3], int dstStride[3],
1050
                 int width, int height,
1051
                 QP_STORE_T *QP_store,  int QPStride,
1052
                 pp_mode_t *vm,  void *vc, int pict_type)
1053
{
1054
        int mbWidth = (width+15)>>4;
1055
        int mbHeight= (height+15)>>4;
1056
        PPMode *mode = (PPMode*)vm;
1057
        PPContext *c = (PPContext*)vc;
1058
        int minStride= MAX(ABS(srcStride[0]), ABS(dstStride[0]));
1059
        int absQPStride = ABS(QPStride);
1060

    
1061
        // c->stride and c->QPStride are always positive
1062
        if(c->stride < minStride || c->qpStride < absQPStride)
1063
                reallocBuffers(c, width, height,
1064
                                MAX(minStride, c->stride),
1065
                                MAX(c->qpStride, absQPStride));
1066

    
1067
        if(QP_store==NULL || (mode->lumMode & FORCE_QUANT))
1068
        {
1069
                int i;
1070
                QP_store= c->forcedQPTable;
1071
                absQPStride = QPStride = 0;
1072
                if(mode->lumMode & FORCE_QUANT)
1073
                        for(i=0; i<mbWidth; i++) QP_store[i]= mode->forcedQuant;
1074
                else
1075
                        for(i=0; i<mbWidth; i++) QP_store[i]= 1;
1076
        }
1077
//printf("pict_type:%d\n", pict_type);
1078

    
1079
        if(pict_type & PP_PICT_TYPE_QP2){
1080
                int i;
1081
                const int count= mbHeight * absQPStride;
1082
                for(i=0; i<(count>>2); i++){
1083
                        ((uint32_t*)c->stdQPTable)[i] = (((uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
1084
                }
1085
                for(i<<=2; i<count; i++){
1086
                        c->stdQPTable[i] = QP_store[i]>>1;
1087
                }
1088
                QP_store= c->stdQPTable;
1089
                QPStride= absQPStride;
1090
        }
1091

    
1092
if(0){
1093
int x,y;
1094
for(y=0; y<mbHeight; y++){
1095
        for(x=0; x<mbWidth; x++){
1096
                printf("%2d ", QP_store[x + y*QPStride]);
1097
        }
1098
        printf("\n");
1099
}
1100
        printf("\n");
1101
}
1102

    
1103
        if((pict_type&7)!=3)
1104
        {
1105
                if (QPStride >= 0) {
1106
                        int i;
1107
                        const int count= mbHeight * QPStride;
1108
                        for(i=0; i<(count>>2); i++){
1109
                                ((uint32_t*)c->nonBQPTable)[i] = ((uint32_t*)QP_store)[i] & 0x3F3F3F3F;
1110
                        }
1111
                        for(i<<=2; i<count; i++){
1112
                                c->nonBQPTable[i] = QP_store[i] & 0x3F;
1113
                        }
1114
                } else {
1115
                        int i,j;
1116
                        for(i=0; i<mbHeight; i++) {
1117
                                    for(j=0; j<absQPStride; j++) {
1118
                                        c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
1119
                                }
1120
                        }
1121
                }
1122
        }
1123

    
1124
        if(verbose>2)
1125
        {
1126
                printf("using npp filters 0x%X/0x%X\n", mode->lumMode, mode->chromMode);
1127
        }
1128

    
1129
        postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1130
                width, height, QP_store, QPStride, 0, mode, c);
1131

    
1132
        width  = (width )>>c->hChromaSubSample;
1133
        height = (height)>>c->vChromaSubSample;
1134

    
1135
        if(mode->chromMode)
1136
        {
1137
                postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1138
                        width, height, QP_store, QPStride, 1, mode, c);
1139
                postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1140
                        width, height, QP_store, QPStride, 2, mode, c);
1141
        }
1142
        else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2])
1143
        {
1144
                linecpy(dst[1], src[1], height, srcStride[1]);
1145
                linecpy(dst[2], src[2], height, srcStride[2]);
1146
        }
1147
        else
1148
        {
1149
                int y;
1150
                for(y=0; y<height; y++)
1151
                {
1152
                        memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1153
                        memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
1154
                }
1155
        }
1156
}
1157