Statistics
| Branch: | Revision:

ffmpeg / postproc / postprocess_template.c @ 70c5ae87

History | View | Annotate | Download (93.7 KB)

1
/*
2
    Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
3

4
    This program is free software; you can redistribute it and/or modify
5
    it under the terms of the GNU General Public License as published by
6
    the Free Software Foundation; either version 2 of the License, or
7
    (at your option) any later version.
8

9
    This program is distributed in the hope that it will be useful,
10
    but WITHOUT ANY WARRANTY; without even the implied warranty of
11
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
    GNU General Public License for more details.
13

14
    You should have received a copy of the GNU General Public License
15
    along with this program; if not, write to the Free Software
16
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17
*/
18

    
19
/*
20
                        C        MMX        MMX2        3DNow
21
isVertDC                Ec        Ec
22
isVertMinMaxOk                Ec        Ec
23
doVertLowPass                E                e        e
24
doVertDefFilter                Ec        Ec        Ec
25
isHorizDC                Ec        Ec
26
isHorizMinMaxOk                a        E
27
doHorizLowPass                E                e        e
28
doHorizDefFilter        Ec        Ec        Ec
29
deRing                                        e
30
Vertical RKAlgo1        E                a        a
31
Horizontal RKAlgo1                        a        a
32
Vertical X1                a                E        E
33
Horizontal X1                a                E        E
34
LinIpolDeinterlace        e                E        E*
35
CubicIpolDeinterlace        a                e        e*
36
LinBlendDeinterlace        e                E        E*
37
MedianDeinterlace                 Ec        Ec
38

39

40
* I don't have a 3DNow! CPU -> it's untested
41
E = Exact implementation
42
e = almost exact implementation (slightly different rounding,...)
43
a = alternative / approximate impl
44
c = checked against the other implementations (-vo md5)
45
*/
46

    
47
/*
48
TODO:
49
verify that everything works as it should (how?)
50
reduce the time wasted on the mem transfer
51
implement dering
52
implement everything in C at least (done at the moment but ...)
53
unroll stuff if instructions depend too much on the prior one
54
we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
55
move YScale thing to the end instead of fixing QP
56
write a faster and higher quality deblocking filter :)
57
do something about the speed of the horizontal filters
58
make the mainloop more flexible (variable number of blocks at once
59
        (the if/else stuff per block is slowing things down)
60
compare the quality & speed of all filters
61
split this huge file
62
fix warnings (unused vars, ...)
63
noise reduction filters
64
border remover
65
...
66

67
Notes:
68
*/
69

    
70
//Changelog: use the CVS log
71

    
72
#include <inttypes.h>
73
#include <stdio.h>
74
#include <stdlib.h>
75
#include <string.h>
76
#include "../config.h"
77
//#undef HAVE_MMX2
78
//#define HAVE_3DNOW
79
//#undef HAVE_MMX
80
#include "postprocess.h"
81

    
82
// Small arithmetic helper macros.
// NOTE: each of them may evaluate its argument(s) more than once, so do not
// pass expressions with side effects (e.g. i++).
// MIN: the smaller of a and b
#define MIN(a,b) ((a) > (b) ? (b) : (a))
83
// MAX: the larger of a and b
#define MAX(a,b) ((a) < (b) ? (b) : (a))
84
// ABS: absolute value of a
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
85
// SIGN: 1 if a is positive, -1 otherwise (zero also yields -1)
#define SIGN(a) ((a) > 0 ? 1 : -1)
86

    
87
// PAVGB(a,b) expands to an inline-asm instruction string for a packed byte
// average: "pavgb" when HAVE_MMX2 is defined, otherwise "pavgusb" when
// HAVE_3DNOW is defined. It is left undefined when neither is available.
#ifdef HAVE_MMX2
88
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
89
#elif defined (HAVE_3DNOW)
90
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
91
#endif
92

    
93
// Size constants; their users are not part of this section of the file.
#define GET_MODE_BUFFER_SIZE 500
94
#define OPTIONS_ARRAY_SIZE 10
95

    
96

    
97
// 64-bit constants and scratch variables. Several of them are referenced by
// name from inside the inline-asm strings further down (e.g. "movq b7E, %%mm7"),
// so the C compiler does not see those uses.
//
// Naming scheme (as visible from the values):
//   wXX        : the 16-bit value 0xXX repeated in all four words
//   bmDDDDDDDD : byte mask, one digit per byte (most significant byte first),
//                1 -> 0xFF, 0 -> 0x00
//   bXX        : the byte 0xXX repeated in all eight bytes
static uint64_t packedYOffset=        0x0000000000000000LL;
98
static uint64_t packedYScale=        0x0100010001000100LL;
99
static uint64_t w05=                0x0005000500050005LL;
100
static uint64_t w20=                0x0020002000200020LL;
101
static uint64_t w1400=                0x1400140014001400LL;
102
static uint64_t bm00000001=        0x00000000000000FFLL;
103
static uint64_t bm00010000=        0x000000FF00000000LL;
104
static uint64_t bm00001000=        0x00000000FF000000LL;
105
static uint64_t bm10000000=        0xFF00000000000000LL;
106
static uint64_t bm10000001=        0xFF000000000000FFLL;
107
static uint64_t bm11000011=        0xFFFF00000000FFFFLL;
108
static uint64_t bm00000011=        0x000000000000FFFFLL;
109
static uint64_t bm11111110=        0xFFFFFFFFFFFFFF00LL;
110
static uint64_t bm11000000=        0xFFFF000000000000LL;
111
static uint64_t bm00011000=        0x000000FFFF000000LL;
112
static uint64_t bm00110011=        0x0000FFFF0000FFFFLL;
113
static uint64_t bm11001100=        0xFFFF0000FFFF0000LL;
114
static uint64_t b00=                 0x0000000000000000LL;
115
static uint64_t b01=                 0x0101010101010101LL;
116
static uint64_t b02=                 0x0202020202020202LL;
117
static uint64_t b0F=                 0x0F0F0F0F0F0F0F0FLL;
118
static uint64_t b04=                 0x0404040404040404LL;
119
static uint64_t b08=                 0x0808080808080808LL;
120
static uint64_t bFF=                 0xFFFFFFFFFFFFFFFFLL;
121
static uint64_t b20=                 0x2020202020202020LL;
122
static uint64_t b80=                 0x8080808080808080LL;
123
static uint64_t b7E=                 0x7E7E7E7E7E7E7E7ELL;
124
static uint64_t b7C=                 0x7C7C7C7C7C7C7C7CLL;
125
static uint64_t b3F=                 0x3F3F3F3F3F3F3F3FLL;
126
// Scratch storage, zero-initialized.
static uint64_t temp0=0;
127
static uint64_t temp1=0;
128
static uint64_t temp2=0;
129
static uint64_t temp3=0;
130
static uint64_t temp4=0;
131
static uint64_t temp5=0;
132
// Read by the MMX code as "QP,..., QP" (see the asm comments below); it is
// not written anywhere in this section of the file.
static uint64_t pQPb=0;
133
// NOTE(review): presumably a second packed QP value; its use is not visible
// in this section of the file.
static uint64_t pQPb2=0;
134
static uint8_t tempBlocks[8*16*2]; //used for the horizontal code
135

    
136
// NOTE(review): presumably the horizontal counterpart of vFlatnessThreshold;
// its use is not visible in this section of the file.
int hFlatnessThreshold= 56 - 16;
137
// isVertDC() reports a block as flat when its count of nearly-equal
// vertical pixel pairs exceeds this value.
int vFlatnessThreshold= 56 - 16;
138

    
139
// amount of "black" you are willing to lose to get a brightness-corrected picture
140
double maxClippedThreshold= 0.01;
141

    
142
// NOTE(review): presumably the luma range targeted by the brightness
// correction; confirm against the code that uses these two values.
int maxAllowedY=234;
143
int minAllowedY=16;
144

    
145
// Table of the known filters. Each entry holds a short name, a long name,
// three integers and a filter flag constant; the meaning of the individual
// fields is defined by struct PPFilter, which is declared outside this file
// (presumably in postprocess.h). The table ends with an all-NULL/zero entry.
static struct PPFilter filters[]=
146
{
147
        {"hb", "hdeblock",                 1, 1, 3, H_DEBLOCK},
148
        {"vb", "vdeblock",                 1, 2, 4, V_DEBLOCK},
149
        // NOTE(review): "vr"/"rkvdeblock" is named like a vertical filter and
        // uses the same integers as the other vertical entries, yet it maps to
        // H_RK1_FILTER. Check whether a vertical RK1 flag was intended.
        {"vr", "rkvdeblock",                 1, 2, 4, H_RK1_FILTER},
150
        {"h1", "x1hdeblock",                 1, 1, 3, H_X1_FILTER},
151
        {"v1", "x1vdeblock",                 1, 2, 4, V_X1_FILTER},
152
        {"dr", "dering",                 1, 5, 6, DERING},
153
        {"al", "autolevels",                 0, 1, 2, LEVEL_FIX},
154
        {"lb", "linblenddeint",         0, 1, 6, LINEAR_BLEND_DEINT_FILTER},
155
        {"li", "linipoldeint",                 0, 1, 6, LINEAR_IPOL_DEINT_FILTER},
156
        {"ci", "cubicipoldeint",        0, 1, 6, CUBIC_IPOL_DEINT_FILTER},
157
        {"md", "mediandeint",                 0, 1, 6, MEDIAN_DEINT_FILTER},
158
        {NULL, NULL,0,0,0,0} //End Marker
159
};
160

    
161
// Flat list of string pairs: a shorthand name followed by the filter list it
// stands for ("default"/"de" and "fast"/"fa" are two spellings of the same
// expansion). The list is terminated by a single NULL.
// NOTE(review): presumably used to expand shorthands in the mode string;
// the code that consumes this table is not part of this section.
static char *replaceTable[]=
162
{
163
        "default",         "hdeblock:a,vdeblock:a,dering:a,autolevels",
164
        "de",                 "hdeblock:a,vdeblock:a,dering:a,autolevels",
165
        "fast",         "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels",
166
        "fa",                 "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels",
167
        NULL //End Marker
168
};
169

    
170
static inline void unusedVariableWarningFixer()
171
{
172
if(
173
 packedYOffset + packedYScale + w05 + w20 + w1400 + bm00000001 + bm00010000
174
 + bm00001000 + bm10000000 + bm10000001 + bm11000011 + bm00000011 + bm11111110
175
 + bm11000000 + bm00011000 + bm00110011 + bm11001100 + b00 + b01 + b02 + b0F
176
 + bFF + b20 + b80 + b7E + b7C + b3F + temp0 + temp1 + temp2 + temp3 + temp4
177
 + temp5 + pQPb== 0) b00=0;
178
}
179

    
180
#ifdef TIMING
181
/**
 * Read the CPU time-stamp counter with the x86 "rdtsc" instruction.
 *
 * The instruction delivers its 64-bit result as two 32-bit halves in the
 * edx:eax register pair. The halves are fetched through separate "=a" and
 * "=d" outputs and combined in C: the former single "=A" output only
 * describes the edx:eax pair on 32-bit x86 and loses the upper half when
 * compiled for x86-64.
 *
 * @return the current counter value
 */
static inline long long rdtsc()
{
        uint32_t lo, hi;
        __asm__ __volatile__(        "rdtsc\n\t"
                : "=a" (lo), "=d" (hi)
        );
        return (long long)(((uint64_t)hi << 32) | lo);
}
190
#endif
191

    
192
#ifdef HAVE_MMX2
193
/** Execute the "prefetchnta" instruction on the address p. */
static inline void prefetchnta(void *p)
{
        __asm__ __volatile__("prefetchnta (%0)\n\t" : : "r" (p));
}
199

    
200
/** Execute the "prefetcht0" instruction on the address p. */
static inline void prefetcht0(void *p)
{
        __asm__ __volatile__("prefetcht0 (%0)\n\t" : : "r" (p));
}
206

    
207
/** Execute the "prefetcht1" instruction on the address p. */
static inline void prefetcht1(void *p)
{
        __asm__ __volatile__("prefetcht1 (%0)\n\t" : : "r" (p));
}
213

    
214
/** Execute the "prefetcht2" instruction on the address p. */
static inline void prefetcht2(void *p)
{
        __asm__ __volatile__("prefetcht2 (%0)\n\t" : : "r" (p));
}
220
#endif
221

    
222
//FIXME? |255-0| = 1 (shouldnt be a problem ...)
223
/**
224
 * Check if the middle 8x8 Block in the given 8x16 block is flat
225
 */
226
static inline int isVertDC(uint8_t src[], int stride){
227
        int numEq= 0;
228
#ifndef HAVE_MMX
229
        int y;
230
#endif
231
        src+= stride*4; // src points to begin of the 8x8 Block
232
#ifdef HAVE_MMX
233
asm volatile(
234
                "leal (%1, %2), %%eax                                \n\t"
235
                "leal (%%eax, %2, 4), %%ebx                        \n\t"
236
//        0        1        2        3        4        5        6        7        8        9
237
//        %1        eax        eax+%2        eax+2%2        %1+4%2        ebx        ebx+%2        ebx+2%2        %1+8%2        ebx+4%2
238
                "movq b7E, %%mm7                                        \n\t" // mm7 = 0x7F
239
                "movq b7C, %%mm6                                        \n\t" // mm6 = 0x7D
240
                "movq (%1), %%mm0                                \n\t"
241
                "movq (%%eax), %%mm1                                \n\t"
242
                "psubb %%mm1, %%mm0                                \n\t" // mm0 = differnece
243
                "paddb %%mm7, %%mm0                                \n\t"
244
                "pcmpgtb %%mm6, %%mm0                                \n\t"
245

    
246
                "movq (%%eax,%2), %%mm2                                \n\t"
247
                "psubb %%mm2, %%mm1                                \n\t"
248
                "paddb %%mm7, %%mm1                                \n\t"
249
                "pcmpgtb %%mm6, %%mm1                                \n\t"
250
                "paddb %%mm1, %%mm0                                \n\t"
251

    
252
                "movq (%%eax, %2, 2), %%mm1                        \n\t"
253
                "psubb %%mm1, %%mm2                                \n\t"
254
                "paddb %%mm7, %%mm2                                \n\t"
255
                "pcmpgtb %%mm6, %%mm2                                \n\t"
256
                "paddb %%mm2, %%mm0                                \n\t"
257

    
258
                "movq (%1, %2, 4), %%mm2                        \n\t"
259
                "psubb %%mm2, %%mm1                                \n\t"
260
                "paddb %%mm7, %%mm1                                \n\t"
261
                "pcmpgtb %%mm6, %%mm1                                \n\t"
262
                "paddb %%mm1, %%mm0                                \n\t"
263

    
264
                "movq (%%ebx), %%mm1                                \n\t"
265
                "psubb %%mm1, %%mm2                                \n\t"
266
                "paddb %%mm7, %%mm2                                \n\t"
267
                "pcmpgtb %%mm6, %%mm2                                \n\t"
268
                "paddb %%mm2, %%mm0                                \n\t"
269

    
270
                "movq (%%ebx, %2), %%mm2                        \n\t"
271
                "psubb %%mm2, %%mm1                                \n\t"
272
                "paddb %%mm7, %%mm1                                \n\t"
273
                "pcmpgtb %%mm6, %%mm1                                \n\t"
274
                "paddb %%mm1, %%mm0                                \n\t"
275

    
276
                "movq (%%ebx, %2, 2), %%mm1                        \n\t"
277
                "psubb %%mm1, %%mm2                                \n\t"
278
                "paddb %%mm7, %%mm2                                \n\t"
279
                "pcmpgtb %%mm6, %%mm2                                \n\t"
280
                "paddb %%mm2, %%mm0                                \n\t"
281

    
282
                "                                                \n\t"
283
                "movq %%mm0, %%mm1                                \n\t"
284
                "psrlw $8, %%mm0                                \n\t"
285
                "paddb %%mm1, %%mm0                                \n\t"
286
#ifdef HAVE_MMX2
287
                "pshufw $0xF9, %%mm0, %%mm1                        \n\t"
288
                "paddb %%mm1, %%mm0                                \n\t"
289
                "pshufw $0xFE, %%mm0, %%mm1                        \n\t"
290
#else
291
                "movq %%mm0, %%mm1                                \n\t"
292
                "psrlq $16, %%mm0                                \n\t"
293
                "paddb %%mm1, %%mm0                                \n\t"
294
                "movq %%mm0, %%mm1                                \n\t"
295
                "psrlq $32, %%mm0                                \n\t"
296
#endif
297
                "paddb %%mm1, %%mm0                                \n\t"
298
                "movd %%mm0, %0                                        \n\t"
299
                : "=r" (numEq)
300
                : "r" (src), "r" (stride)
301
                : "%eax", "%ebx"
302
                );
303

    
304
        numEq= (256 - numEq) &0xFF;
305

    
306
#else
307
        for(y=0; y<BLOCK_SIZE-1; y++)
308
        {
309
                if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
310
                if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
311
                if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
312
                if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
313
                if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
314
                if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
315
                if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
316
                if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
317
                src+= stride;
318
        }
319
#endif
320
/*        if(abs(numEq - asmEq) > 0)
321
        {
322
                printf("\nasm:%d  c:%d\n", asmEq, numEq);
323
                for(int y=0; y<8; y++)
324
                {
325
                        for(int x=0; x<8; x++)
326
                        {
327
                                printf("%d ", temp[x + y*stride]);
328
                        }
329
                        printf("\n");
330
                }
331
        }
332
*/
333
//        for(int i=0; i<numEq/8; i++) src[i]=255;
334
        return (numEq > vFlatnessThreshold) ? 1 : 0;
335
}
336

    
337
static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
338
{
339
#ifdef HAVE_MMX
340
        int isOk;
341
        src+= stride*3;
342
        asm volatile(
343
//                "int $3 \n\t"
344
                "movq (%1, %2), %%mm0                                \n\t"
345
                "movq (%1, %2, 8), %%mm1                        \n\t"
346
                "movq %%mm0, %%mm2                                \n\t"
347
                "psubusb %%mm1, %%mm0                                \n\t"
348
                "psubusb %%mm2, %%mm1                                \n\t"
349
                "por %%mm1, %%mm0                                \n\t" // ABS Diff
350

    
351
                "movq pQPb, %%mm7                                \n\t" // QP,..., QP
352
                "paddusb %%mm7, %%mm7                                \n\t" // 2QP ... 2QP
353
                "psubusb %%mm7, %%mm0                                \n\t" // Diff <= 2QP -> 0
354
                "pcmpeqd b00, %%mm0                                \n\t"
355
                "psrlq $16, %%mm0                                \n\t"
356
                "pcmpeqd bFF, %%mm0                                \n\t"
357
//                "movd %%mm0, (%1, %2, 4)\n\t"
358
                "movd %%mm0, %0                                        \n\t"
359
                : "=r" (isOk)
360
                : "r" (src), "r" (stride)
361
                );
362
        return isOk;
363
#else
364

    
365
        int isOk2= 1;
366
        int x;
367
        src+= stride*3;
368
        for(x=0; x<BLOCK_SIZE; x++)
369
        {
370
                if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
371
        }
372
/*        if(isOk && !isOk2 || !isOk && isOk2)
373
        {
374
                printf("\nasm:%d  c:%d QP:%d\n", isOk, isOk2, QP);
375
                for(int y=0; y<9; y++)
376
                {
377
                        for(int x=0; x<8; x++)
378
                        {
379
                                printf("%d ", src[x + y*stride]);
380
                        }
381
                        printf("\n");
382
                }
383
        } */
384

    
385
        return isOk2;
386
#endif
387

    
388
}
389

    
390
/**
391
 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
392
 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
393
 */
394
static inline void doVertLowPass(uint8_t *src, int stride, int QP)
395
{
396
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
397
        src+= stride*3;
398
        asm volatile(        //"movv %0 %1 %2\n\t"
399
                "pushl %0 \n\t"
400
                "movq pQPb, %%mm0                                \n\t"  // QP,..., QP
401

    
402
                "movq (%0), %%mm6                                \n\t"
403
                "movq (%0, %1), %%mm5                                \n\t"
404
                "movq %%mm5, %%mm1                                \n\t"
405
                "movq %%mm6, %%mm2                                \n\t"
406
                "psubusb %%mm6, %%mm5                                \n\t"
407
                "psubusb %%mm1, %%mm2                                \n\t"
408
                "por %%mm5, %%mm2                                \n\t" // ABS Diff of lines
409
                "psubusb %%mm0, %%mm2                                \n\t" // diff <= QP -> 0
410
                "pcmpeqb b00, %%mm2                                \n\t" // diff <= QP -> FF
411

    
412
                "pand %%mm2, %%mm6                                \n\t"
413
                "pandn %%mm1, %%mm2                                \n\t"
414
                "por %%mm2, %%mm6                                \n\t"// First Line to Filter
415

    
416
                "movq (%0, %1, 8), %%mm5                        \n\t"
417
                "leal (%0, %1, 4), %%eax                        \n\t"
418
                "leal (%0, %1, 8), %%ebx                        \n\t"
419
                "subl %1, %%ebx                                        \n\t"
420
                "addl %1, %0                                        \n\t" // %0 points to line 1 not 0
421
                "movq (%0, %1, 8), %%mm7                        \n\t"
422
                "movq %%mm5, %%mm1                                \n\t"
423
                "movq %%mm7, %%mm2                                \n\t"
424
                "psubusb %%mm7, %%mm5                                \n\t"
425
                "psubusb %%mm1, %%mm2                                \n\t"
426
                "por %%mm5, %%mm2                                \n\t" // ABS Diff of lines
427
                "psubusb %%mm0, %%mm2                                \n\t" // diff <= QP -> 0
428
                "pcmpeqb b00, %%mm2                                \n\t" // diff <= QP -> FF
429

    
430
                "pand %%mm2, %%mm7                                \n\t"
431
                "pandn %%mm1, %%mm2                                \n\t"
432
                "por %%mm2, %%mm7                                \n\t" // First Line to Filter
433

    
434

    
435
                //         1        2        3        4        5        6        7        8
436
                //        %0        %0+%1        %0+2%1        eax        %0+4%1        eax+2%1        ebx        eax+4%1
437
                // 6 4 2 2 1 1
438
                // 6 4 4 2
439
                // 6 8 2
440

    
441
                "movq (%0, %1), %%mm0                                \n\t" //  1
442
                "movq %%mm0, %%mm1                                \n\t" //  1
443
                PAVGB(%%mm6, %%mm0)                                      //1 1        /2
444
                PAVGB(%%mm6, %%mm0)                                      //3 1        /4
445

    
446
                "movq (%0, %1, 4), %%mm2                        \n\t" //     1
447
                "movq %%mm2, %%mm5                                \n\t" //     1
448
                PAVGB((%%eax), %%mm2)                                      //    11        /2
449
                PAVGB((%0, %1, 2), %%mm2)                              //   211        /4
450
                "movq %%mm2, %%mm3                                \n\t" //   211        /4
451
                "movq (%0), %%mm4                                \n\t" // 1
452
                PAVGB(%%mm4, %%mm3)                                      // 4 211        /8
453
                PAVGB(%%mm0, %%mm3)                                      //642211        /16
454
                "movq %%mm3, (%0)                                \n\t" // X
455
                // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
456
                "movq %%mm1, %%mm0                                \n\t" //  1
457
                PAVGB(%%mm6, %%mm0)                                      //1 1        /2
458
                "movq %%mm4, %%mm3                                \n\t" // 1
459
                PAVGB((%0,%1,2), %%mm3)                                      // 1 1        /2
460
                PAVGB((%%eax,%1,2), %%mm5)                              //     11        /2
461
                PAVGB((%%eax), %%mm5)                                      //    211 /4
462
                PAVGB(%%mm5, %%mm3)                                      // 2 2211 /8
463
                PAVGB(%%mm0, %%mm3)                                      //4242211 /16
464
                "movq %%mm3, (%0,%1)                                \n\t" //  X
465
                // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
466
                PAVGB(%%mm4, %%mm6)                                      //11        /2
467
                "movq (%%ebx), %%mm0                                \n\t" //       1
468
                PAVGB((%%eax, %1, 2), %%mm0)                              //      11/2
469
                "movq %%mm0, %%mm3                                \n\t" //      11/2
470
                PAVGB(%%mm1, %%mm0)                                      //  2   11/4
471
                PAVGB(%%mm6, %%mm0)                                      //222   11/8
472
                PAVGB(%%mm2, %%mm0)                                      //22242211/16
473
                "movq (%0, %1, 2), %%mm2                        \n\t" //   1
474
                "movq %%mm0, (%0, %1, 2)                        \n\t" //   X
475
                // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
476
                "movq (%%eax, %1, 4), %%mm0                        \n\t" //        1
477
                PAVGB((%%ebx), %%mm0)                                      //       11        /2
478
                PAVGB(%%mm0, %%mm6)                                      //11     11        /4
479
                PAVGB(%%mm1, %%mm4)                                      // 11                /2
480
                PAVGB(%%mm2, %%mm1)                                      //  11                /2
481
                PAVGB(%%mm1, %%mm6)                                      //1122   11        /8
482
                PAVGB(%%mm5, %%mm6)                                      //112242211        /16
483
                "movq (%%eax), %%mm5                                \n\t" //    1
484
                "movq %%mm6, (%%eax)                                \n\t" //    X
485
                // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
486
                "movq (%%eax, %1, 4), %%mm6                        \n\t" //        1
487
                PAVGB(%%mm7, %%mm6)                                      //        11        /2
488
                PAVGB(%%mm4, %%mm6)                                      // 11     11        /4
489
                PAVGB(%%mm3, %%mm6)                                      // 11   2211        /8
490
                PAVGB(%%mm5, %%mm2)                                      //   11                /2
491
                "movq (%0, %1, 4), %%mm4                        \n\t" //     1
492
                PAVGB(%%mm4, %%mm2)                                      //   112                /4
493
                PAVGB(%%mm2, %%mm6)                                      // 112242211        /16
494
                "movq %%mm6, (%0, %1, 4)                        \n\t" //     X
495
                // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
496
                PAVGB(%%mm7, %%mm1)                                      //  11     2        /4
497
                PAVGB(%%mm4, %%mm5)                                      //    11                /2
498
                PAVGB(%%mm5, %%mm0)                                      //    11 11        /4
499
                "movq (%%eax, %1, 2), %%mm6                        \n\t" //      1
500
                PAVGB(%%mm6, %%mm1)                                      //  11  4  2        /8
501
                PAVGB(%%mm0, %%mm1)                                      //  11224222        /16
502
                "movq %%mm1, (%%eax, %1, 2)                        \n\t" //      X
503
                // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
504
                PAVGB((%%ebx), %%mm2)                                      //   112 4        /8
505
                "movq (%%eax, %1, 4), %%mm0                        \n\t" //        1
506
                PAVGB(%%mm0, %%mm6)                                      //      1 1        /2
507
                PAVGB(%%mm7, %%mm6)                                      //      1 12        /4
508
                PAVGB(%%mm2, %%mm6)                                      //   1122424        /4
509
                "movq %%mm6, (%%ebx)                                \n\t" //       X
510
                // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
511
                PAVGB(%%mm7, %%mm5)                                      //    11   2        /4
512
                PAVGB(%%mm7, %%mm5)                                      //    11   6        /8
513

    
514
                PAVGB(%%mm3, %%mm0)                                      //      112        /4
515
                PAVGB(%%mm0, %%mm5)                                      //    112246        /16
516
                "movq %%mm5, (%%eax, %1, 4)                        \n\t" //        X
517
                "popl %0\n\t"
518

    
519
                :
520
                : "r" (src), "r" (stride)
521
                : "%eax", "%ebx"
522
        );
523
#else
524
        const int l1= stride;
525
        const int l2= stride + l1;
526
        const int l3= stride + l2;
527
        const int l4= stride + l3;
528
        const int l5= stride + l4;
529
        const int l6= stride + l5;
530
        const int l7= stride + l6;
531
        const int l8= stride + l7;
532
        const int l9= stride + l8;
533
        int x;
534
        src+= stride*3;
535
        for(x=0; x<BLOCK_SIZE; x++)
536
        {
537
                const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
538
                const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
539

    
540
                int sums[9];
541
                sums[0] = first + src[l1];
542
                sums[1] = src[l1] + src[l2];
543
                sums[2] = src[l2] + src[l3];
544
                sums[3] = src[l3] + src[l4];
545
                sums[4] = src[l4] + src[l5];
546
                sums[5] = src[l5] + src[l6];
547
                sums[6] = src[l6] + src[l7];
548
                sums[7] = src[l7] + src[l8];
549
                sums[8] = src[l8] + last;
550

    
551
                src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
552
                src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
553
                src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
554
                src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
555
                src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
556
                src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
557
                src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
558
                src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
559

    
560
                src++;
561
        }
562

    
563
#endif
564
}
565

    
566
/**
567
 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
568
 * values are correctly clipped (MMX2)
569
 * values are wraparound (C)
570
 * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient
571
        0 8 16 24
572
        x = 8
573
        x/2 = 4
574
        x/8 = 1
575
        1 12 12 23
576
 */
577
static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
578
{
579
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
580
        src+= stride*3;
581
// FIXME rounding
582
        asm volatile(
583
                "pxor %%mm7, %%mm7                                \n\t" // 0
584
                "movq b80, %%mm6                                \n\t" // MIN_SIGNED_BYTE
585
                "leal (%0, %1), %%eax                                \n\t"
586
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
587
//        0        1        2        3        4        5        6        7        8        9
588
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
589
                "movq pQPb, %%mm0                                \n\t" // QP,..., QP
590
                "movq %%mm0, %%mm1                                \n\t" // QP,..., QP
591
                "paddusb b02, %%mm0                                \n\t"
592
                "psrlw $2, %%mm0                                \n\t"
593
                "pand b3F, %%mm0                                \n\t" // QP/4,..., QP/4
594
                "paddusb %%mm1, %%mm0                                \n\t" // QP*1.25 ...
595
                "movq (%0, %1, 4), %%mm2                        \n\t" // line 4
596
                "movq (%%ebx), %%mm3                                \n\t" // line 5
597
                "movq %%mm2, %%mm4                                \n\t" // line 4
598
                "pcmpeqb %%mm5, %%mm5                                \n\t" // -1
599
                "pxor %%mm2, %%mm5                                \n\t" // -line 4 - 1
600
                PAVGB(%%mm3, %%mm5)
601
                "paddb %%mm6, %%mm5                                \n\t" // (l5-l4)/2
602
                "psubusb %%mm3, %%mm4                                \n\t"
603
                "psubusb %%mm2, %%mm3                                \n\t"
604
                "por %%mm3, %%mm4                                \n\t" // |l4 - l5|
605
                "psubusb %%mm0, %%mm4                                \n\t"
606
                "pcmpeqb %%mm7, %%mm4                                \n\t"
607
                "pand %%mm4, %%mm5                                \n\t" // d/2
608

    
609
//                "paddb %%mm6, %%mm2                                \n\t" // line 4 + 0x80
610
                "paddb %%mm5, %%mm2                                \n\t"
611
//                "psubb %%mm6, %%mm2                                \n\t"
612
                "movq %%mm2, (%0,%1, 4)                                \n\t"
613

    
614
                "movq (%%ebx), %%mm2                                \n\t"
615
//                "paddb %%mm6, %%mm2                                \n\t" // line 5 + 0x80
616
                "psubb %%mm5, %%mm2                                \n\t"
617
//                "psubb %%mm6, %%mm2                                \n\t"
618
                "movq %%mm2, (%%ebx)                                \n\t"
619

    
620
                "paddb %%mm6, %%mm5                                \n\t"
621
                "psrlw $2, %%mm5                                \n\t"
622
                "pand b3F, %%mm5                                \n\t"
623
                "psubb b20, %%mm5                                \n\t" // (l5-l4)/8
624

    
625
                "movq (%%eax, %1, 2), %%mm2                        \n\t"
626
                "paddb %%mm6, %%mm2                                \n\t" // line 3 + 0x80
627
                "paddsb %%mm5, %%mm2                                \n\t"
628
                "psubb %%mm6, %%mm2                                \n\t"
629
                "movq %%mm2, (%%eax, %1, 2)                        \n\t"
630

    
631
                "movq (%%ebx, %1), %%mm2                        \n\t"
632
                "paddb %%mm6, %%mm2                                \n\t" // line 6 + 0x80
633
                "psubsb %%mm5, %%mm2                                \n\t"
634
                "psubb %%mm6, %%mm2                                \n\t"
635
                "movq %%mm2, (%%ebx, %1)                        \n\t"
636

    
637
                :
638
                : "r" (src), "r" (stride)
639
                : "%eax", "%ebx"
640
        );
641
#else
642
         const int l1= stride;
643
        const int l2= stride + l1;
644
        const int l3= stride + l2;
645
        const int l4= stride + l3;
646
        const int l5= stride + l4;
647
        const int l6= stride + l5;
648
//        const int l7= stride + l6;
649
//        const int l8= stride + l7;
650
//        const int l9= stride + l8;
651
        int x;
652
        src+= stride*3;
653
        for(x=0; x<BLOCK_SIZE; x++)
654
        {
655
                if(ABS(src[l4]-src[l5]) < QP + QP/4)
656
                {
657
                        int v = (src[l5] - src[l4]);
658

    
659
                        src[l3] +=v/8;
660
                        src[l4] +=v/2;
661
                        src[l5] -=v/2;
662
                        src[l6] -=v/8;
663

    
664
                }
665
                src++;
666
        }
667

    
668
#endif
669
}
670

    
671
/**
672
 * Experimental Filter 1
673
 * will not damage linear gradients
674
 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
675
 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
676
 * MMX2 version does correct clipping C version doesnt
677
 */
678
static inline void vertX1Filter(uint8_t *src, int stride, int QP)
679
{
680
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
681
        src+= stride*3;
682

    
683
        asm volatile(
684
                "pxor %%mm7, %%mm7                                \n\t" // 0
685
//                "movq b80, %%mm6                                \n\t" // MIN_SIGNED_BYTE
686
                "leal (%0, %1), %%eax                                \n\t"
687
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
688
//        0        1        2        3        4        5        6        7        8        9
689
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
690
                "movq (%%eax, %1, 2), %%mm0                        \n\t" // line 3
691
                "movq (%0, %1, 4), %%mm1                        \n\t" // line 4
692
                "movq %%mm1, %%mm2                                \n\t" // line 4
693
                "psubusb %%mm0, %%mm1                                \n\t"
694
                "psubusb %%mm2, %%mm0                                \n\t"
695
                "por %%mm1, %%mm0                                \n\t" // |l2 - l3|
696
                "movq (%%ebx), %%mm3                                \n\t" // line 5
697
                "movq (%%ebx, %1), %%mm4                                \n\t" // line 6
698
                "movq %%mm3, %%mm5                                \n\t" // line 5
699
                "psubusb %%mm4, %%mm3                                \n\t"
700
                "psubusb %%mm5, %%mm4                                \n\t"
701
                "por %%mm4, %%mm3                                \n\t" // |l5 - l6|
702
                PAVGB(%%mm3, %%mm0)                                      // (|l2 - l3| + |l5 - l6|)/2
703
                "movq %%mm2, %%mm1                                \n\t" // line 4
704
                "psubusb %%mm5, %%mm2                                \n\t"
705
                "movq %%mm2, %%mm4                                \n\t"
706
                "pcmpeqb %%mm7, %%mm2                                \n\t" // (l4 - l5) <= 0 ? -1 : 0
707
                "psubusb %%mm1, %%mm5                                \n\t"
708
                "por %%mm5, %%mm4                                \n\t" // |l4 - l5|
709
                "psubusb %%mm0, %%mm4                \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
710
                "movq %%mm4, %%mm3                                \n\t" // d
711
                "psubusb pQPb, %%mm4                                \n\t"
712
                "pcmpeqb %%mm7, %%mm4                                \n\t" // d <= QP ? -1 : 0
713
                "psubusb b01, %%mm3                                \n\t"
714
                "pand %%mm4, %%mm3                                \n\t" // d <= QP ? d : 0
715

    
716
                PAVGB(%%mm7, %%mm3)                                      // d/2
717
                "movq %%mm3, %%mm1                                \n\t" // d/2
718
                PAVGB(%%mm7, %%mm3)                                      // d/4
719
                PAVGB(%%mm1, %%mm3)                                      // 3*d/8
720

    
721
                "movq (%0, %1, 4), %%mm0                        \n\t" // line 4
722
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
723
                "psubusb %%mm3, %%mm0                                \n\t"
724
                "pxor %%mm2, %%mm0                                \n\t"
725
                "movq %%mm0, (%0, %1, 4)                        \n\t" // line 4
726

    
727
                "movq (%%ebx), %%mm0                                \n\t" // line 5
728
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
729
                "paddusb %%mm3, %%mm0                                \n\t"
730
                "pxor %%mm2, %%mm0                                \n\t"
731
                "movq %%mm0, (%%ebx)                                \n\t" // line 5
732

    
733
                PAVGB(%%mm7, %%mm1)                                      // d/4
734

    
735
                "movq (%%eax, %1, 2), %%mm0                        \n\t" // line 3
736
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
737
                "psubusb %%mm1, %%mm0                                \n\t"
738
                "pxor %%mm2, %%mm0                                \n\t"
739
                "movq %%mm0, (%%eax, %1, 2)                        \n\t" // line 3
740

    
741
                "movq (%%ebx, %1), %%mm0                        \n\t" // line 6
742
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
743
                "paddusb %%mm1, %%mm0                                \n\t"
744
                "pxor %%mm2, %%mm0                                \n\t"
745
                "movq %%mm0, (%%ebx, %1)                        \n\t" // line 6
746

    
747
                PAVGB(%%mm7, %%mm1)                                      // d/8
748

    
749
                "movq (%%eax, %1), %%mm0                        \n\t" // line 2
750
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
751
                "psubusb %%mm1, %%mm0                                \n\t"
752
                "pxor %%mm2, %%mm0                                \n\t"
753
                "movq %%mm0, (%%eax, %1)                        \n\t" // line 2
754

    
755
                "movq (%%ebx, %1, 2), %%mm0                        \n\t" // line 7
756
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
757
                "paddusb %%mm1, %%mm0                                \n\t"
758
                "pxor %%mm2, %%mm0                                \n\t"
759
                "movq %%mm0, (%%ebx, %1, 2)                        \n\t" // line 7
760

    
761
                :
762
                : "r" (src), "r" (stride)
763
                : "%eax", "%ebx"
764
        );
765
#else
766

    
767
         const int l1= stride;
768
        const int l2= stride + l1;
769
        const int l3= stride + l2;
770
        const int l4= stride + l3;
771
        const int l5= stride + l4;
772
        const int l6= stride + l5;
773
        const int l7= stride + l6;
774
//        const int l8= stride + l7;
775
//        const int l9= stride + l8;
776
        int x;
777

    
778
        src+= stride*3;
779
        for(x=0; x<BLOCK_SIZE; x++)
780
        {
781
                int a= src[l3] - src[l4];
782
                int b= src[l4] - src[l5];
783
                int c= src[l5] - src[l6];
784

    
785
                int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
786

    
787
                if(d < QP)
788
                {
789
                        int v = d * SIGN(-b);
790

    
791
                        src[l2] +=v/8;
792
                        src[l3] +=v/4;
793
                        src[l4] +=3*v/8;
794
                        src[l5] -=3*v/8;
795
                        src[l6] -=v/4;
796
                        src[l7] -=v/8;
797

    
798
                }
799
                src++;
800
        }
801
        /*
802
         const int l1= stride;
803
        const int l2= stride + l1;
804
        const int l3= stride + l2;
805
        const int l4= stride + l3;
806
        const int l5= stride + l4;
807
        const int l6= stride + l5;
808
        const int l7= stride + l6;
809
        const int l8= stride + l7;
810
        const int l9= stride + l8;
811
        for(int x=0; x<BLOCK_SIZE; x++)
812
        {
813
                int v2= src[l2];
814
                int v3= src[l3];
815
                int v4= src[l4];
816
                int v5= src[l5];
817
                int v6= src[l6];
818
                int v7= src[l7];
819

820
                if(ABS(v4-v5)<QP &&  ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
821
                {
822
                        src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6         )/16;
823
                        src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7  )/16;
824
                        src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
825
                        src[l6] = (       1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
826
                }
827
                src++;
828
        }
829
*/
830
#endif
831
}
832

    
833
/**
834
 * Experimental Filter 1 (Horizontal)
835
 * will not damage linear gradients
836
 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
837
 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
838
 * MMX2 version does correct clipping C version doesnt
839
 * not identical with the vertical one
840
 */
841
static inline void horizX1Filter(uint8_t *src, int stride, int QP)
842
{
843
        int y;
844
        static uint64_t *lut= NULL;
845
        if(lut==NULL)
846
        {
847
                int i;
848
                lut= (uint64_t*)memalign(8, 256*8);
849
                for(i=0; i<256; i++)
850
                {
851
                        int v= i < 128 ? 2*i : 2*(i-256);
852
/*
853
//Simulate 112242211 9-Tap filter
854
                        uint64_t a= (v/16) & 0xFF;
855
                        uint64_t b= (v/8) & 0xFF;
856
                        uint64_t c= (v/4) & 0xFF;
857
                        uint64_t d= (3*v/8) & 0xFF;
858
*/
859
//Simulate piecewise linear interpolation
860
                        uint64_t a= (v/16) & 0xFF;
861
                        uint64_t b= (v*3/16) & 0xFF;
862
                        uint64_t c= (v*5/16) & 0xFF;
863
                        uint64_t d= (7*v/16) & 0xFF;
864
                        uint64_t A= (0x100 - a)&0xFF;
865
                        uint64_t B= (0x100 - b)&0xFF;
866
                        uint64_t C= (0x100 - c)&0xFF;
867
                        uint64_t D= (0x100 - c)&0xFF;
868

    
869
                        lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
870
                                (D<<24) | (C<<16) | (B<<8) | (A);
871
                        //lut[i] = (v<<32) | (v<<24);
872
                }
873
        }
874

    
875
#if 0
876
        asm volatile(
877
                "pxor %%mm7, %%mm7                                \n\t" // 0
878
//                "movq b80, %%mm6                                \n\t" // MIN_SIGNED_BYTE
879
                "leal (%0, %1), %%eax                                \n\t"
880
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
881

882
                "movq b80, %%mm6                                \n\t"
883
                "movd pQPb, %%mm5                                \n\t" // QP
884
                "movq %%mm5, %%mm4                                \n\t"
885
                "paddusb %%mm5, %%mm5                                \n\t" // 2QP
886
                "paddusb %%mm5, %%mm4                                \n\t" // 3QP
887
                "pxor %%mm5, %%mm5                                \n\t" // 0
888
                "psubb %%mm4, %%mm5                                \n\t" // -3QP
889
                "por bm11111110, %%mm5                                \n\t" // ...,FF,FF,-3QP
890
                "psllq $24, %%mm5                                \n\t"
891

892
//        0        1        2        3        4        5        6        7        8        9
893
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
894

895
#define HX1old(a) \
896
                "movd " #a ", %%mm0                                \n\t"\
897
                "movd 4" #a ", %%mm1                                \n\t"\
898
                "punpckldq %%mm1, %%mm0                                \n\t"\
899
                "movq %%mm0, %%mm1                                \n\t"\
900
                "movq %%mm0, %%mm2                                \n\t"\
901
                "psrlq $8, %%mm1                                \n\t"\
902
                "psubusb %%mm1, %%mm2                                \n\t"\
903
                "psubusb %%mm0, %%mm1                                \n\t"\
904
                "por %%mm2, %%mm1                                \n\t" /* p?x = |px - p(x+1)| */\
905
                "pcmpeqb %%mm7, %%mm2                                \n\t" /* p?x = sgn[px - p(x+1)] */\
906
                "pshufw $0x00, %%mm1, %%mm3                        \n\t" /* p?5 = |p1 - p2| */\
907
                PAVGB(%%mm1, %%mm3)                                      /* p?5 = (|p2-p1| + |p6-p5|)/2 */\
908
                "psrlq $16, %%mm3                                \n\t" /* p?3 = (|p2-p1| + |p6-p5|)/2 */\
909
                "psubusb %%mm3, %%mm1                        \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
910
                "paddb %%mm5, %%mm1                                \n\t"\
911
                "psubusb %%mm5, %%mm1                                \n\t"\
912
                PAVGB(%%mm7, %%mm1)\
913
                "pxor %%mm2, %%mm1                                \n\t"\
914
                "psubb %%mm2, %%mm1                                \n\t"\
915
                "psrlq $24, %%mm1                                \n\t"\
916
                "movd %%mm1, %%ecx                                \n\t"\
917
                "paddb %%mm6, %%mm0                                \n\t"\
918
                "paddsb (%3, %%ecx, 8), %%mm0                        \n\t"\
919
                "paddb %%mm6, %%mm0                                \n\t"\
920
                "movq %%mm0, " #a "                                \n\t"\
921

922
/*
923
HX1old((%0))
924
HX1old((%%eax))
925
HX1old((%%eax, %1))
926
HX1old((%%eax, %1, 2))
927
HX1old((%0, %1, 4))
928
HX1old((%%ebx))
929
HX1old((%%ebx, %1))
930
HX1old((%%ebx, %1, 2))
931
*/
932

933
//FIXME add some comments, its unreadable ...
934
#define HX1b(a, c, b, d) \
935
                "movd " #a ", %%mm0                                \n\t"\
936
                "movd 4" #a ", %%mm1                                \n\t"\
937
                "punpckldq %%mm1, %%mm0                                \n\t"\
938
                "movd " #b ", %%mm4                                \n\t"\
939
                "movq %%mm0, %%mm1                                \n\t"\
940
                "movq %%mm0, %%mm2                                \n\t"\
941
                "psrlq $8, %%mm1                                \n\t"\
942
                "movd 4" #b ", %%mm3                                \n\t"\
943
                "psubusb %%mm1, %%mm2                                \n\t"\
944
                "psubusb %%mm0, %%mm1                                \n\t"\
945
                "por %%mm2, %%mm1                                \n\t" /* p?x = |px - p(x+1)| */\
946
                "pcmpeqb %%mm7, %%mm2                                \n\t" /* p?x = sgn[px - p(x+1)] */\
947
                "punpckldq %%mm3, %%mm4                                \n\t"\
948
                "movq %%mm1, %%mm3                                \n\t"\
949
                "psllq $32, %%mm3                                \n\t" /* p?5 = |p1 - p2| */\
950
                PAVGB(%%mm1, %%mm3)                                      /* p?5 = (|p2-p1| + |p6-p5|)/2 */\
951
                "paddb %%mm6, %%mm0                                \n\t"\
952
                "psrlq $16, %%mm3                                \n\t" /* p?3 = (|p2-p1| + |p6-p5|)/2 */\
953
                "psubusb %%mm3, %%mm1                        \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
954
                "movq %%mm4, %%mm3                                \n\t"\
955
                "paddb %%mm5, %%mm1                                \n\t"\
956
                "psubusb %%mm5, %%mm1                                \n\t"\
957
                "psrlq $8, %%mm3                                \n\t"\
958
                PAVGB(%%mm7, %%mm1)\
959
                "pxor %%mm2, %%mm1                                \n\t"\
960
                "psubb %%mm2, %%mm1                                \n\t"\
961
                "movq %%mm4, %%mm2                                \n\t"\
962
                "psrlq $24, %%mm1                                \n\t"\
963
                "psubusb %%mm3, %%mm2                                \n\t"\
964
                "movd %%mm1, %%ecx                                \n\t"\
965
                "psubusb %%mm4, %%mm3                                \n\t"\
966
                "paddsb (%2, %%ecx, 8), %%mm0                        \n\t"\
967
                "por %%mm2, %%mm3                                \n\t" /* p?x = |px - p(x+1)| */\
968
                "paddb %%mm6, %%mm0                                \n\t"\
969
                "pcmpeqb %%mm7, %%mm2                                \n\t" /* p?x = sgn[px - p(x+1)] */\
970
                "movq %%mm3, %%mm1                                \n\t"\
971
                "psllq $32, %%mm1                                \n\t" /* p?5 = |p1 - p2| */\
972
                "movq %%mm0, " #a "                                \n\t"\
973
                PAVGB(%%mm3, %%mm1)                                      /* p?5 = (|p2-p1| + |p6-p5|)/2 */\
974
                "paddb %%mm6, %%mm4                                \n\t"\
975
                "psrlq $16, %%mm1                                \n\t" /* p?3 = (|p2-p1| + |p6-p5|)/2 */\
976
                "psubusb %%mm1, %%mm3                        \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
977
                "paddb %%mm5, %%mm3                                \n\t"\
978
                "psubusb %%mm5, %%mm3                                \n\t"\
979
                PAVGB(%%mm7, %%mm3)\
980
                "pxor %%mm2, %%mm3                                \n\t"\
981
                "psubb %%mm2, %%mm3                                \n\t"\
982
                "psrlq $24, %%mm3                                \n\t"\
983
                "movd " #c ", %%mm0                                \n\t"\
984
                "movd 4" #c ", %%mm1                                \n\t"\
985
                "punpckldq %%mm1, %%mm0                                \n\t"\
986
                "paddb %%mm6, %%mm0                                \n\t"\
987
                "paddsb (%2, %%ecx, 8), %%mm0                        \n\t"\
988
                "paddb %%mm6, %%mm0                                \n\t"\
989
                "movq %%mm0, " #c "                                \n\t"\
990
                "movd %%mm3, %%ecx                                \n\t"\
991
                "movd " #d ", %%mm0                                \n\t"\
992
                "paddsb (%2, %%ecx, 8), %%mm4                        \n\t"\
993
                "movd 4" #d ", %%mm1                                \n\t"\
994
                "paddb %%mm6, %%mm4                                \n\t"\
995
                "punpckldq %%mm1, %%mm0                                \n\t"\
996
                "movq %%mm4, " #b "                                \n\t"\
997
                "paddb %%mm6, %%mm0                                \n\t"\
998
                "paddsb (%2, %%ecx, 8), %%mm0                        \n\t"\
999
                "paddb %%mm6, %%mm0                                \n\t"\
1000
                "movq %%mm0, " #d "                                \n\t"\
1001

1002
HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2))
1003
HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))
1004

1005

1006
                :
1007
                : "r" (src), "r" (stride), "r" (lut)
1008
                : "%eax", "%ebx", "%ecx"
1009
        );
1010
#else
1011

    
1012
//FIXME (has little in common with the mmx2 version)
1013
        for(y=0; y<BLOCK_SIZE; y++)
1014
        {
1015
                int a= src[1] - src[2];
1016
                int b= src[3] - src[4];
1017
                int c= src[5] - src[6];
1018

    
1019
                int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
1020

    
1021
                if(d < QP)
1022
                {
1023
                        int v = d * SIGN(-b);
1024

    
1025
                        src[1] +=v/8;
1026
                        src[2] +=v/4;
1027
                        src[3] +=3*v/8;
1028
                        src[4] -=3*v/8;
1029
                        src[5] -=v/4;
1030
                        src[6] -=v/8;
1031

    
1032
                }
1033
                src+=stride;
1034
        }
1035
#endif
1036
}
1037

    
1038

    
1039
static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
1040
{
1041
#ifdef HAVE_MMX
1042
        src+= stride*4;
1043
        //FIXME try pmul for *5 stuff
1044
//        src[0]=0;
1045
        asm volatile(
1046
                "pxor %%mm7, %%mm7                                \n\t"
1047
                "leal (%0, %1), %%eax                                \n\t"
1048
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
1049
//        0        1        2        3        4        5        6        7
1050
//        %0        %0+%1        %0+2%1        eax+2%1        %0+4%1        eax+4%1        ebx+%1        ebx+2%1
1051
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1
1052

    
1053
                "movq (%0), %%mm0                                \n\t"
1054
                "movq %%mm0, %%mm1                                \n\t"
1055
                "punpcklbw %%mm7, %%mm0                                \n\t" // low part of line 0
1056
                "punpckhbw %%mm7, %%mm1                                \n\t" // high part of line 0
1057

    
1058
                "movq (%%eax), %%mm2                                \n\t"
1059
                "movq %%mm2, %%mm3                                \n\t"
1060
                "punpcklbw %%mm7, %%mm2                                \n\t" // low part of line 1
1061
                "punpckhbw %%mm7, %%mm3                                \n\t" // high part of line 1
1062

    
1063
                "movq (%%eax, %1), %%mm4                        \n\t"
1064
                "movq %%mm4, %%mm5                                \n\t"
1065
                "punpcklbw %%mm7, %%mm4                                \n\t" // low part of line 2
1066
                "punpckhbw %%mm7, %%mm5                                \n\t" // high part of line 2
1067

    
1068
                "paddw %%mm0, %%mm0                                \n\t" // 2L0
1069
                "paddw %%mm1, %%mm1                                \n\t" // 2H0
1070
                "psubw %%mm4, %%mm2                                \n\t" // L1 - L2
1071
                "psubw %%mm5, %%mm3                                \n\t" // H1 - H2
1072
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - L1 + L2
1073
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - H1 + H2
1074

    
1075
                "psllw $2, %%mm2                                \n\t" // 4L1 - 4L2
1076
                "psllw $2, %%mm3                                \n\t" // 4H1 - 4H2
1077
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - 5L1 + 5L2
1078
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - 5H1 + 5H2
1079

    
1080
                "movq (%%eax, %1, 2), %%mm2                        \n\t"
1081
                "movq %%mm2, %%mm3                                \n\t"
1082
                "punpcklbw %%mm7, %%mm2                                \n\t" // L3
1083
                "punpckhbw %%mm7, %%mm3                                \n\t" // H3
1084

    
1085
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - 5L1 + 5L2 - L3
1086
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - 5H1 + 5H2 - H3
1087
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1088
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1089
                "movq %%mm0, temp0                                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1090
                "movq %%mm1, temp1                                \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1091

    
1092
                "movq (%0, %1, 4), %%mm0                        \n\t"
1093
                "movq %%mm0, %%mm1                                \n\t"
1094
                "punpcklbw %%mm7, %%mm0                                \n\t" // L4
1095
                "punpckhbw %%mm7, %%mm1                                \n\t" // H4
1096

    
1097
                "psubw %%mm0, %%mm2                                \n\t" // L3 - L4
1098
                "psubw %%mm1, %%mm3                                \n\t" // H3 - H4
1099
                "movq %%mm2, temp2                                \n\t" // L3 - L4
1100
                "movq %%mm3, temp3                                \n\t" // H3 - H4
1101
                "paddw %%mm4, %%mm4                                \n\t" // 2L2
1102
                "paddw %%mm5, %%mm5                                \n\t" // 2H2
1103
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - L3 + L4
1104
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - H3 + H4
1105

    
1106
                "psllw $2, %%mm2                                \n\t" // 4L3 - 4L4
1107
                "psllw $2, %%mm3                                \n\t" // 4H3 - 4H4
1108
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - 5L3 + 5L4
1109
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - 5H3 + 5H4
1110
//50 opcodes so far
1111
                "movq (%%ebx), %%mm2                                \n\t"
1112
                "movq %%mm2, %%mm3                                \n\t"
1113
                "punpcklbw %%mm7, %%mm2                                \n\t" // L5
1114
                "punpckhbw %%mm7, %%mm3                                \n\t" // H5
1115
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - 5L3 + 5L4 - L5
1116
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - 5H3 + 5H4 - H5
1117
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - 5L3 + 5L4 - 2L5
1118
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - 5H3 + 5H4 - 2H5
1119

    
1120
                "movq (%%ebx, %1), %%mm6                        \n\t"
1121
                "punpcklbw %%mm7, %%mm6                                \n\t" // L6
1122
                "psubw %%mm6, %%mm2                                \n\t" // L5 - L6
1123
                "movq (%%ebx, %1), %%mm6                        \n\t"
1124
                "punpckhbw %%mm7, %%mm6                                \n\t" // H6
1125
                "psubw %%mm6, %%mm3                                \n\t" // H5 - H6
1126

    
1127
                "paddw %%mm0, %%mm0                                \n\t" // 2L4
1128
                "paddw %%mm1, %%mm1                                \n\t" // 2H4
1129
                "psubw %%mm2, %%mm0                                \n\t" // 2L4 - L5 + L6
1130
                "psubw %%mm3, %%mm1                                \n\t" // 2H4 - H5 + H6
1131

    
1132
                "psllw $2, %%mm2                                \n\t" // 4L5 - 4L6
1133
                "psllw $2, %%mm3                                \n\t" // 4H5 - 4H6
1134
                "psubw %%mm2, %%mm0                                \n\t" // 2L4 - 5L5 + 5L6
1135
                "psubw %%mm3, %%mm1                                \n\t" // 2H4 - 5H5 + 5H6
1136

    
1137
                "movq (%%ebx, %1, 2), %%mm2                        \n\t"
1138
                "movq %%mm2, %%mm3                                \n\t"
1139
                "punpcklbw %%mm7, %%mm2                                \n\t" // L7
1140
                "punpckhbw %%mm7, %%mm3                                \n\t" // H7
1141

    
1142
                "paddw %%mm2, %%mm2                                \n\t" // 2L7
1143
                "paddw %%mm3, %%mm3                                \n\t" // 2H7
1144
                "psubw %%mm2, %%mm0                                \n\t" // 2L4 - 5L5 + 5L6 - 2L7
1145
                "psubw %%mm3, %%mm1                                \n\t" // 2H4 - 5H5 + 5H6 - 2H7
1146

    
1147
                "movq temp0, %%mm2                                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1148
                "movq temp1, %%mm3                                \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1149
//FIXME pxor, psubw, pmax for abs
1150
                "movq %%mm7, %%mm6                                \n\t" // 0
1151
                "pcmpgtw %%mm0, %%mm6                                \n\t"
1152
                "pxor %%mm6, %%mm0                                \n\t"
1153
                "psubw %%mm6, %%mm0                                \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1154
                "movq %%mm7, %%mm6                                \n\t" // 0
1155
                "pcmpgtw %%mm1, %%mm6                                \n\t"
1156
                "pxor %%mm6, %%mm1                                \n\t"
1157
                "psubw %%mm6, %%mm1                                \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1158

    
1159
                "movq %%mm7, %%mm6                                \n\t" // 0
1160
                "pcmpgtw %%mm2, %%mm6                                \n\t"
1161
                "pxor %%mm6, %%mm2                                \n\t"
1162
                "psubw %%mm6, %%mm2                                \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1163
                "movq %%mm7, %%mm6                                \n\t" // 0
1164
                "pcmpgtw %%mm3, %%mm6                                \n\t"
1165
                "pxor %%mm6, %%mm3                                \n\t"
1166
                "psubw %%mm6, %%mm3                                \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1167

    
1168
#ifdef HAVE_MMX2
1169
                "pminsw %%mm2, %%mm0                                \n\t"
1170
                "pminsw %%mm3, %%mm1                                \n\t"
1171
#else
1172
                "movq %%mm0, %%mm6                                \n\t"
1173
                "psubusw %%mm2, %%mm6                                \n\t"
1174
                "psubw %%mm6, %%mm0                                \n\t"
1175
                "movq %%mm1, %%mm6                                \n\t"
1176
                "psubusw %%mm3, %%mm6                                \n\t"
1177
                "psubw %%mm6, %%mm1                                \n\t"
1178
#endif
1179

    
1180
                "movq %%mm7, %%mm6                                \n\t" // 0
1181
                "pcmpgtw %%mm4, %%mm6                                \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1182
                "pxor %%mm6, %%mm4                                \n\t"
1183
                "psubw %%mm6, %%mm4                                \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1184
                "pcmpgtw %%mm5, %%mm7                                \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1185
                "pxor %%mm7, %%mm5                                \n\t"
1186
                "psubw %%mm7, %%mm5                                \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1187
// 100 opcodes
1188
                "movd %2, %%mm2                                        \n\t" // QP
1189
                "punpcklwd %%mm2, %%mm2                                \n\t"
1190
                "punpcklwd %%mm2, %%mm2                                \n\t"
1191
                "psllw $3, %%mm2                                \n\t" // 8QP
1192
                "movq %%mm2, %%mm3                                \n\t" // 8QP
1193
                "pcmpgtw %%mm4, %%mm2                                \n\t"
1194
                "pcmpgtw %%mm5, %%mm3                                \n\t"
1195
                "pand %%mm2, %%mm4                                \n\t"
1196
                "pand %%mm3, %%mm5                                \n\t"
1197

    
1198

    
1199
                "psubusw %%mm0, %%mm4                                \n\t" // hd
1200
                "psubusw %%mm1, %%mm5                                \n\t" // ld
1201

    
1202

    
1203
                "movq w05, %%mm2                                \n\t" // 5
1204
                "pmullw %%mm2, %%mm4                                \n\t"
1205
                "pmullw %%mm2, %%mm5                                \n\t"
1206
                "movq w20, %%mm2                                \n\t" // 32
1207
                "paddw %%mm2, %%mm4                                \n\t"
1208
                "paddw %%mm2, %%mm5                                \n\t"
1209
                "psrlw $6, %%mm4                                \n\t"
1210
                "psrlw $6, %%mm5                                \n\t"
1211

    
1212
/*
1213
                "movq w06, %%mm2                                \n\t" // 6
1214
                "paddw %%mm2, %%mm4                                \n\t"
1215
                "paddw %%mm2, %%mm5                                \n\t"
1216
                "movq w1400, %%mm2                                \n\t" // 1400h = 5120 = 5/64*2^16
1217
//FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
1218
                "pmulhw %%mm2, %%mm4                                \n\t" // hd/13
1219
                "pmulhw %%mm2, %%mm5                                \n\t" // ld/13
1220
*/
1221

    
1222
                "movq temp2, %%mm0                                \n\t" // L3 - L4
1223
                "movq temp3, %%mm1                                \n\t" // H3 - H4
1224

    
1225
                "pxor %%mm2, %%mm2                                \n\t"
1226
                "pxor %%mm3, %%mm3                                \n\t"
1227

    
1228
                "pcmpgtw %%mm0, %%mm2                                \n\t" // sign (L3-L4)
1229
                "pcmpgtw %%mm1, %%mm3                                \n\t" // sign (H3-H4)
1230
                "pxor %%mm2, %%mm0                                \n\t"
1231
                "pxor %%mm3, %%mm1                                \n\t"
1232
                "psubw %%mm2, %%mm0                                \n\t" // |L3-L4|
1233
                "psubw %%mm3, %%mm1                                \n\t" // |H3-H4|
1234
                "psrlw $1, %%mm0                                \n\t" // |L3 - L4|/2
1235
                "psrlw $1, %%mm1                                \n\t" // |H3 - H4|/2
1236

    
1237
                "pxor %%mm6, %%mm2                                \n\t"
1238
                "pxor %%mm7, %%mm3                                \n\t"
1239
                "pand %%mm2, %%mm4                                \n\t"
1240
                "pand %%mm3, %%mm5                                \n\t"
1241

    
1242
#ifdef HAVE_MMX2
1243
                "pminsw %%mm0, %%mm4                                \n\t"
1244
                "pminsw %%mm1, %%mm5                                \n\t"
1245
#else
1246
                "movq %%mm4, %%mm2                                \n\t"
1247
                "psubusw %%mm0, %%mm2                                \n\t"
1248
                "psubw %%mm2, %%mm4                                \n\t"
1249
                "movq %%mm5, %%mm2                                \n\t"
1250
                "psubusw %%mm1, %%mm2                                \n\t"
1251
                "psubw %%mm2, %%mm5                                \n\t"
1252
#endif
1253
                "pxor %%mm6, %%mm4                                \n\t"
1254
                "pxor %%mm7, %%mm5                                \n\t"
1255
                "psubw %%mm6, %%mm4                                \n\t"
1256
                "psubw %%mm7, %%mm5                                \n\t"
1257
                "packsswb %%mm5, %%mm4                                \n\t"
1258
                "movq (%%eax, %1, 2), %%mm0                        \n\t"
1259
                "paddb   %%mm4, %%mm0                                \n\t"
1260
                "movq %%mm0, (%%eax, %1, 2)                         \n\t"
1261
                "movq (%0, %1, 4), %%mm0                        \n\t"
1262
                "psubb %%mm4, %%mm0                                \n\t"
1263
                "movq %%mm0, (%0, %1, 4)                         \n\t"
1264

    
1265
                :
1266
                : "r" (src), "r" (stride), "r" (QP)
1267
                : "%eax", "%ebx"
1268
        );
1269
#else
1270
        const int l1= stride;
1271
        const int l2= stride + l1;
1272
        const int l3= stride + l2;
1273
        const int l4= stride + l3;
1274
        const int l5= stride + l4;
1275
        const int l6= stride + l5;
1276
        const int l7= stride + l6;
1277
        const int l8= stride + l7;
1278
//        const int l9= stride + l8;
1279
        int x;
1280
        src+= stride*3;
1281
        for(x=0; x<BLOCK_SIZE; x++)
1282
        {
1283
                const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1284
                if(ABS(middleEnergy) < 8*QP)
1285
                {
1286
                        const int q=(src[l4] - src[l5])/2;
1287
                        const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1288
                        const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1289

    
1290
                        int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1291
                        d= MAX(d, 0);
1292

    
1293
                        d= (5*d + 32) >> 6;
1294
                        d*= SIGN(-middleEnergy);
1295

    
1296
                        if(q>0)
1297
                        {
1298
                                d= d<0 ? 0 : d;
1299
                                d= d>q ? q : d;
1300
                        }
1301
                        else
1302
                        {
1303
                                d= d>0 ? 0 : d;
1304
                                d= d<q ? q : d;
1305
                        }
1306

    
1307
                        src[l4]-= d;
1308
                        src[l5]+= d;
1309
                }
1310
                src++;
1311
        }
1312
#endif
1313
}
1314

    
1315
//FIXME?  |255-0| = 1
1316
/**
1317
 * Check if the given 8x8 Block is mostly "flat"
1318
 */
1319
static inline int isHorizDC(uint8_t src[], int stride)
1320
{
1321
//        src++;
1322
        int numEq= 0;
1323
#if 0
1324
asm volatile (
1325
//                "int $3 \n\t"
1326
                "leal (%1, %2), %%ecx                                \n\t"
1327
                "leal (%%ecx, %2, 4), %%ebx                        \n\t"
1328
//        0        1        2        3        4        5        6        7        8        9
1329
//        %1        ecx        ecx+%2        ecx+2%2        %1+4%2        ebx        ebx+%2        ebx+2%2        %1+8%2        ebx+4%2
1330
                "movq b7E, %%mm7                                \n\t" // mm7 = 0x7F
1331
                "movq b7C, %%mm6                                \n\t" // mm6 = 0x7D
1332
                "pxor %%mm0, %%mm0                                \n\t"
1333
                "movl %1, %%eax                                        \n\t"
1334
                "andl $0x1F, %%eax                                \n\t"
1335
                "cmpl $24, %%eax                                \n\t"
1336
                "leal tempBlock, %%eax                                \n\t"
1337
                "jb 1f                                                \n\t"
1338

1339
#define HDC_CHECK_AND_CPY(src, dst) \
1340
                "movd " #src ", %%mm2                                \n\t"\
1341
                "punpckldq 4" #src ", %%mm2                                \n\t" /* (%1) */\
1342
                "movq %%mm2, %%mm1                                \n\t"\
1343
                "psrlq $8, %%mm2                                \n\t"\
1344
                "psubb %%mm1, %%mm2                                \n\t"\
1345
                "paddb %%mm7, %%mm2                                \n\t"\
1346
                "pcmpgtb %%mm6, %%mm2                                \n\t"\
1347
                "paddb %%mm2, %%mm0                                \n\t"\
1348
                "movq %%mm1," #dst "(%%eax)                        \n\t"
1349

1350
                HDC_CHECK_AND_CPY((%1),0)
1351
                HDC_CHECK_AND_CPY((%%ecx),8)
1352
                HDC_CHECK_AND_CPY((%%ecx, %2),16)
1353
                HDC_CHECK_AND_CPY((%%ecx, %2, 2),24)
1354
                HDC_CHECK_AND_CPY((%1, %2, 4),32)
1355
                HDC_CHECK_AND_CPY((%%ebx),40)
1356
                HDC_CHECK_AND_CPY((%%ebx, %2),48)
1357
                HDC_CHECK_AND_CPY((%%ebx, %2, 2),56)
1358
                "jmp 2f                                                \n\t"
1359
                "1:                                                \n\t"
1360
// src does not cross a 32 byte cache line so dont waste time with alignment
1361
#define HDC_CHECK_AND_CPY2(src, dst) \
1362
                "movq " #src ", %%mm2                                \n\t"\
1363
                "movq " #src ", %%mm1                                \n\t"\
1364
                "psrlq $8, %%mm2                                \n\t"\
1365
                "psubb %%mm1, %%mm2                                \n\t"\
1366
                "paddb %%mm7, %%mm2                                \n\t"\
1367
                "pcmpgtb %%mm6, %%mm2                                \n\t"\
1368
                "paddb %%mm2, %%mm0                                \n\t"\
1369
                "movq %%mm1," #dst "(%%eax)                        \n\t"
1370

1371
                HDC_CHECK_AND_CPY2((%1),0)
1372
                HDC_CHECK_AND_CPY2((%%ecx),8)
1373
                HDC_CHECK_AND_CPY2((%%ecx, %2),16)
1374
                HDC_CHECK_AND_CPY2((%%ecx, %2, 2),24)
1375
                HDC_CHECK_AND_CPY2((%1, %2, 4),32)
1376
                HDC_CHECK_AND_CPY2((%%ebx),40)
1377
                HDC_CHECK_AND_CPY2((%%ebx, %2),48)
1378
                HDC_CHECK_AND_CPY2((%%ebx, %2, 2),56)
1379
                "2:                                                \n\t"
1380
                "psllq $8, %%mm0                                \n\t" // remove dummy value
1381
                "movq %%mm0, %%mm1                                \n\t"
1382
                "psrlw $8, %%mm0                                \n\t"
1383
                "paddb %%mm1, %%mm0                                \n\t"
1384
                "movq %%mm0, %%mm1                                \n\t"
1385
                "psrlq $16, %%mm0                                \n\t"
1386
                "paddb %%mm1, %%mm0                                \n\t"
1387
                "movq %%mm0, %%mm1                                \n\t"
1388
                "psrlq $32, %%mm0                                \n\t"
1389
                "paddb %%mm1, %%mm0                                \n\t"
1390
                "movd %%mm0, %0                                        \n\t"
1391
                : "=r" (numEq)
1392
                : "r" (src), "r" (stride)
1393
                : "%eax", "%ebx", "%ecx"
1394
                );
1395
//        printf("%d\n", numEq);
1396
        numEq= (256 - numEq) &0xFF;
1397
#else
1398
        int y;
1399
        for(y=0; y<BLOCK_SIZE; y++)
1400
        {
1401
                if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
1402
                if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
1403
                if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
1404
                if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
1405
                if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
1406
                if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
1407
                if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
1408
                src+= stride;
1409
        }
1410
#endif
1411
/*        if(abs(numEq - asmEq) > 0)
1412
        {
1413
//                printf("\nasm:%d  c:%d\n", asmEq, numEq);
1414
                for(int y=0; y<8; y++)
1415
                {
1416
                        for(int x=0; x<8; x++)
1417
                        {
1418
                                printf("%d ", src[x + y*stride]);
1419
                        }
1420
                        printf("\n");
1421
                }
1422
        }
1423
*/
1424
//        printf("%d\n", numEq);
1425
        return numEq > hFlatnessThreshold;
1426
}
1427

    
1428
/**
 * Range check used before horizontal filtering.
 * Only the row src points to is inspected: the check passes when its first
 * and eighth pixel differ by no more than 2*QP. stride is accepted but not
 * used.
 * @return 1 if the check passes, 0 otherwise
 */
static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
{
        const int edgeDiff= src[0] - src[7];

        return abs(edgeDiff) <= 2*QP;
}
1434

    
1435
static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP)
1436
{
1437
#if 0
1438
        asm volatile(
1439
                "leal (%0, %1), %%ecx                                \n\t"
1440
                "leal (%%ecx, %1, 4), %%ebx                        \n\t"
1441
//        0        1        2        3        4        5        6        7        8        9
1442
//        %0        ecx        ecx+%1        ecx+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
1443
                "pxor %%mm7, %%mm7                                \n\t"
1444
                "movq bm00001000, %%mm6                                \n\t"
1445
                "movd %2, %%mm5                                        \n\t" // QP
1446
                "movq %%mm5, %%mm4                                \n\t"
1447
                "paddusb %%mm5, %%mm5                                \n\t" // 2QP
1448
                "paddusb %%mm5, %%mm4                                \n\t" // 3QP
1449
                "psllq $24, %%mm4                                \n\t"
1450
                "pxor %%mm5, %%mm5                                \n\t" // 0
1451
                "psubb %%mm4, %%mm5                                \n\t" // -QP
1452
                "leal tempBlock, %%eax                                \n\t"
1453

1454
//FIXME? "unroll by 2" and mix
1455
#ifdef HAVE_MMX2
1456
#define HDF(src, dst)        \
1457
                "movq " #src "(%%eax), %%mm0                        \n\t"\
1458
                "movq " #src "(%%eax), %%mm1                        \n\t"\
1459
                "movq " #src "(%%eax), %%mm2                        \n\t"\
1460
                "psrlq $8, %%mm1                                \n\t"\
1461
                "psubusb %%mm1, %%mm2                                \n\t"\
1462
                "psubusb %%mm0, %%mm1                                \n\t"\
1463
                "por %%mm2, %%mm1                                \n\t" /* p?x = |px - p(x+1)| */\
1464
                "pcmpeqb %%mm7, %%mm2                                \n\t" /* p?x = sgn[px - p(x+1)] */\
1465
                "pshufw $0x00, %%mm1, %%mm3                        \n\t" /* p?5 = |p1 - p2| */\
1466
                "pminub %%mm1, %%mm3                                \n\t" /* p?5 = min(|p2-p1|, |p6-p5|)*/\
1467
                "psrlq $16, %%mm3                                \n\t" /* p?3 = min(|p2-p1|, |p6-p5|)*/\
1468
                "psubusb %%mm3, %%mm1                        \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\
1469
                "paddb %%mm5, %%mm1                                \n\t"\
1470
                "psubusb %%mm5, %%mm1                                \n\t"\
1471
                "psrlw $2, %%mm1                                \n\t"\
1472
                "pxor %%mm2, %%mm1                                \n\t"\
1473
                "psubb %%mm2, %%mm1                                \n\t"\
1474
                "pand %%mm6, %%mm1                                \n\t"\
1475
                "psubb %%mm1, %%mm0                                \n\t"\
1476
                "psllq $8, %%mm1                                \n\t"\
1477
                "paddb %%mm1, %%mm0                                \n\t"\
1478
                "movd %%mm0, " #dst"                                \n\t"\
1479
                "psrlq $32, %%mm0                                \n\t"\
1480
                "movd %%mm0, 4" #dst"                                \n\t"
1481
#else
1482
#define HDF(src, dst)\
1483
                "movq " #src "(%%eax), %%mm0                        \n\t"\
1484
                "movq %%mm0, %%mm1                                \n\t"\
1485
                "movq %%mm0, %%mm2                                \n\t"\
1486
                "psrlq $8, %%mm1                                \n\t"\
1487
                "psubusb %%mm1, %%mm2                                \n\t"\
1488
                "psubusb %%mm0, %%mm1                                \n\t"\
1489
                "por %%mm2, %%mm1                                \n\t" /* p?x = |px - p(x+1)| */\
1490
                "pcmpeqb %%mm7, %%mm2                                \n\t" /* p?x = sgn[px - p(x+1)] */\
1491
                "movq %%mm1, %%mm3                                \n\t"\
1492
                "psllq $32, %%mm3                                \n\t"\
1493
                "movq %%mm3, %%mm4                                \n\t"\
1494
                "psubusb %%mm1, %%mm4                                \n\t"\
1495
                "psubb %%mm4, %%mm3                                \n\t"\
1496
                "psrlq $16, %%mm3                                \n\t" /* p?3 = min(|p2-p1|, |p6-p5|)*/\
1497
                "psubusb %%mm3, %%mm1                        \n\t" /* |p3-p4|-min(|p1-p2|,|p5,?6|) */\
1498
                "paddb %%mm5, %%mm1                                \n\t"\
1499
                "psubusb %%mm5, %%mm1                                \n\t"\
1500
                "psrlw $2, %%mm1                                \n\t"\
1501
                "pxor %%mm2, %%mm1                                \n\t"\
1502
                "psubb %%mm2, %%mm1                                \n\t"\
1503
                "pand %%mm6, %%mm1                                \n\t"\
1504
                "psubb %%mm1, %%mm0                                \n\t"\
1505
                "psllq $8, %%mm1                                \n\t"\
1506
                "paddb %%mm1, %%mm0                                \n\t"\
1507
                "movd %%mm0, " #dst "                                \n\t"\
1508
                "psrlq $32, %%mm0                                \n\t"\
1509
                "movd %%mm0, 4" #dst "                                \n\t"
1510
#endif
1511
                HDF(0,(%0))
1512
                HDF(8,(%%ecx))
1513
                HDF(16,(%%ecx, %1))
1514
                HDF(24,(%%ecx, %1, 2))
1515
                HDF(32,(%0, %1, 4))
1516
                HDF(40,(%%ebx))
1517
                HDF(48,(%%ebx, %1))
1518
                HDF(56,(%%ebx, %1, 2))
1519
                :
1520
                : "r" (dst), "r" (stride), "r" (QP)
1521
                : "%eax", "%ebx", "%ecx"
1522
        );
1523
#else
1524
        int y;
1525
        for(y=0; y<BLOCK_SIZE; y++)
1526
        {
1527
                const int middleEnergy= 5*(dst[4] - dst[5]) + 2*(dst[2] - dst[5]);
1528

    
1529
                if(ABS(middleEnergy) < 8*QP)
1530
                {
1531
                        const int q=(dst[3] - dst[4])/2;
1532
                        const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
1533
                        const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
1534

    
1535
                        int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1536
                        d= MAX(d, 0);
1537

    
1538
                        d= (5*d + 32) >> 6;
1539
                        d*= SIGN(-middleEnergy);
1540

    
1541
                        if(q>0)
1542
                        {
1543
                                d= d<0 ? 0 : d;
1544
                                d= d>q ? q : d;
1545
                        }
1546
                        else
1547
                        {
1548
                                d= d>0 ? 0 : d;
1549
                                d= d<q ? q : d;
1550
                        }
1551

    
1552
                        dst[3]-= d;
1553
                        dst[4]+= d;
1554
                }
1555
                dst+= stride;
1556
        }
1557
#endif
1558
}
1559

    
1560
/**
1561
 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
1562
 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
1563
 * using the 7-Tap Filter   (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version)
1564
 */
1565
static inline void doHorizLowPass(uint8_t dst[], int stride, int QP)
1566
{
1567

    
1568
#if 0
1569
        asm volatile(
1570
                "leal (%0, %1), %%ecx                                \n\t"
1571
                "leal (%%ecx, %1, 4), %%ebx                        \n\t"
1572
//        0        1        2        3        4        5        6        7        8        9
1573
//        %0        ecx        ecx+%1        ecx+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
1574
                "pxor %%mm7, %%mm7                                        \n\t"
1575
                "leal tempBlock, %%eax                                        \n\t"
1576
/*
1577
#define HLP1        "movq (%0), %%mm0                                        \n\t"\
1578
                "movq %%mm0, %%mm1                                        \n\t"\
1579
                "psllq $8, %%mm0                                        \n\t"\
1580
                PAVGB(%%mm1, %%mm0)\
1581
                "psrlw $8, %%mm0                                        \n\t"\
1582
                "pxor %%mm1, %%mm1                                        \n\t"\
1583
                "packuswb %%mm1, %%mm0                                        \n\t"\
1584
                "movq %%mm0, %%mm1                                        \n\t"\
1585
                "movq %%mm0, %%mm2                                        \n\t"\
1586
                "psllq $32, %%mm0                                        \n\t"\
1587
                "paddb %%mm0, %%mm1                                        \n\t"\
1588
                "psllq $16, %%mm2                                        \n\t"\
1589
                PAVGB(%%mm2, %%mm0)\
1590
                "movq %%mm0, %%mm3                                        \n\t"\
1591
                "pand bm11001100, %%mm0                                        \n\t"\
1592
                "paddusb %%mm0, %%mm3                                        \n\t"\
1593
                "psrlq $8, %%mm3                                        \n\t"\
1594
                PAVGB(%%mm1, %%mm4)\
1595
                PAVGB(%%mm3, %%mm2)\
1596
                "psrlq $16, %%mm2                                        \n\t"\
1597
                "punpcklbw %%mm2, %%mm2                                        \n\t"\
1598
                "movq %%mm2, (%0)                                        \n\t"\
1599

1600
#define HLP2        "movq (%0), %%mm0                                        \n\t"\
1601
                "movq %%mm0, %%mm1                                        \n\t"\
1602
                "psllq $8, %%mm0                                        \n\t"\
1603
                PAVGB(%%mm1, %%mm0)\
1604
                "psrlw $8, %%mm0                                        \n\t"\
1605
                "pxor %%mm1, %%mm1                                        \n\t"\
1606
                "packuswb %%mm1, %%mm0                                        \n\t"\
1607
                "movq %%mm0, %%mm2                                        \n\t"\
1608
                "psllq $32, %%mm0                                        \n\t"\
1609
                "psllq $16, %%mm2                                        \n\t"\
1610
                PAVGB(%%mm2, %%mm0)\
1611
                "movq %%mm0, %%mm3                                        \n\t"\
1612
                "pand bm11001100, %%mm0                                        \n\t"\
1613
                "paddusb %%mm0, %%mm3                                        \n\t"\
1614
                "psrlq $8, %%mm3                                        \n\t"\
1615
                PAVGB(%%mm3, %%mm2)\
1616
                "psrlq $16, %%mm2                                        \n\t"\
1617
                "punpcklbw %%mm2, %%mm2                                        \n\t"\
1618
                "movq %%mm2, (%0)                                        \n\t"\
1619
*/
1620
// approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16
1621
/*
1622
Implemented        Exact 7-Tap
1623
 9421                A321
1624
 36421                64321
1625
 334321                =
1626
 1234321        =
1627
  1234321        =
1628
   123433        =
1629
    12463          12346
1630
     1249           123A
1631

1632
*/
1633

1634
#ifdef HAVE_MMX2
1635
#define HLP3(i)        "movq " #i "(%%eax), %%mm0                                \n\t"\
1636
                "movq %%mm0, %%mm1                                        \n\t"\
1637
                "movq %%mm0, %%mm2                                        \n\t"\
1638
                "movq %%mm0, %%mm3                                        \n\t"\
1639
                "movq %%mm0, %%mm4                                        \n\t"\
1640
                "psllq $8, %%mm1                                        \n\t"\
1641
                "psrlq $8, %%mm2                                        \n\t"\
1642
                "pand bm00000001, %%mm3                                        \n\t"\
1643
                "pand bm10000000, %%mm4                                        \n\t"\
1644
                "por %%mm3, %%mm1                                        \n\t"\
1645
                "por %%mm4, %%mm2                                        \n\t"\
1646
                PAVGB(%%mm2, %%mm1)\
1647
                PAVGB(%%mm1, %%mm0)\
1648
\
1649
                "pshufw $0xF9, %%mm0, %%mm3                                \n\t"\
1650
                "pshufw $0x90, %%mm0, %%mm4                                \n\t"\
1651
                PAVGB(%%mm3, %%mm4)\
1652
                PAVGB(%%mm4, %%mm0)\
1653
                "movd %%mm0, (%0)                                        \n\t"\
1654
                "psrlq $32, %%mm0                                        \n\t"\
1655
                "movd %%mm0, 4(%0)                                        \n\t"
1656
#else
1657
#define HLP3(i)        "movq " #i "(%%eax), %%mm0                                \n\t"\
1658
                "movq %%mm0, %%mm1                                        \n\t"\
1659
                "movq %%mm0, %%mm2                                        \n\t"\
1660
                "movd -4(%0), %%mm3                                        \n\t" /*0001000*/\
1661
                "movd 8(%0), %%mm4                                        \n\t" /*0001000*/\
1662
                "psllq $8, %%mm1                                        \n\t"\
1663
                "psrlq $8, %%mm2                                        \n\t"\
1664
                "psrlq $24, %%mm3                                        \n\t"\
1665
                "psllq $56, %%mm4                                        \n\t"\
1666
                "por %%mm3, %%mm1                                        \n\t"\
1667
                "por %%mm4, %%mm2                                        \n\t"\
1668
                PAVGB(%%mm2, %%mm1)\
1669
                PAVGB(%%mm1, %%mm0)\
1670
\
1671
                "movq %%mm0, %%mm3                                        \n\t"\
1672
                "movq %%mm0, %%mm4                                        \n\t"\
1673
                "movq %%mm0, %%mm5                                        \n\t"\
1674
                "psrlq $16, %%mm3                                        \n\t"\
1675
                "psllq $16, %%mm4                                        \n\t"\
1676
                "pand bm11000000, %%mm5                                        \n\t"\
1677
                "por %%mm5, %%mm3                                        \n\t"\
1678
                "movq %%mm0, %%mm5                                        \n\t"\
1679
                "pand bm00000011, %%mm5                                        \n\t"\
1680
                "por %%mm5, %%mm4                                        \n\t"\
1681
                PAVGB(%%mm3, %%mm4)\
1682
                PAVGB(%%mm4, %%mm0)\
1683
                "movd %%mm0, (%0)                                        \n\t"\
1684
                "psrlq $32, %%mm0                                        \n\t"\
1685
                "movd %%mm0, 4(%0)                                        \n\t"
1686
#endif
1687

    
1688
/* uses the 7-Tap Filter: 1112111 */
1689
#define NEW_HLP(src, dst)\
1690
                "movq " #src "(%%eax), %%mm1                                \n\t"\
1691
                "movq " #src "(%%eax), %%mm2                                \n\t"\
1692
                "psllq $8, %%mm1                                        \n\t"\
1693
                "psrlq $8, %%mm2                                        \n\t"\
1694
                "movd -4" #dst ", %%mm3                                        \n\t" /*0001000*/\
1695
                "movd 8" #dst ", %%mm4                                        \n\t" /*0001000*/\
1696
                "psrlq $24, %%mm3                                        \n\t"\
1697
                "psllq $56, %%mm4                                        \n\t"\
1698
                "por %%mm3, %%mm1                                        \n\t"\
1699
                "por %%mm4, %%mm2                                        \n\t"\
1700
                "movq %%mm1, %%mm5                                        \n\t"\
1701
                PAVGB(%%mm2, %%mm1)\
1702
                "movq " #src "(%%eax), %%mm0                                \n\t"\
1703
                PAVGB(%%mm1, %%mm0)\
1704
                "psllq $8, %%mm5                                        \n\t"\
1705
                "psrlq $8, %%mm2                                        \n\t"\
1706
                "por %%mm3, %%mm5                                        \n\t"\
1707
                "por %%mm4, %%mm2                                        \n\t"\
1708
                "movq %%mm5, %%mm1                                        \n\t"\
1709
                PAVGB(%%mm2, %%mm5)\
1710
                "psllq $8, %%mm1                                        \n\t"\
1711
                "psrlq $8, %%mm2                                        \n\t"\
1712
                "por %%mm3, %%mm1                                        \n\t"\
1713
                "por %%mm4, %%mm2                                        \n\t"\
1714
                PAVGB(%%mm2, %%mm1)\
1715
                PAVGB(%%mm1, %%mm5)\
1716
                PAVGB(%%mm5, %%mm0)\
1717
                "movd %%mm0, " #dst "                                        \n\t"\
1718
                "psrlq $32, %%mm0                                        \n\t"\
1719
                "movd %%mm0, 4" #dst "                                        \n\t"
1720

    
1721
/* uses the 9-Tap Filter: 112242211 */
1722
#define NEW_HLP2(i)\
1723
                "movq " #i "(%%eax), %%mm0                                \n\t" /*0001000*/\
1724
                "movq %%mm0, %%mm1                                        \n\t" /*0001000*/\
1725
                "movq %%mm0, %%mm2                                        \n\t" /*0001000*/\
1726
                "movd -4(%0), %%mm3                                        \n\t" /*0001000*/\
1727
                "movd 8(%0), %%mm4                                        \n\t" /*0001000*/\
1728
                "psllq $8, %%mm1                                        \n\t"\
1729
                "psrlq $8, %%mm2                                        \n\t"\
1730
                "psrlq $24, %%mm3                                        \n\t"\
1731
                "psllq $56, %%mm4                                        \n\t"\
1732
                "por %%mm3, %%mm1                                        \n\t" /*0010000*/\
1733
                "por %%mm4, %%mm2                                        \n\t" /*0000100*/\
1734
                "movq %%mm1, %%mm5                                        \n\t" /*0010000*/\
1735
                PAVGB(%%mm2, %%mm1)                                              /*0010100*/\
1736
                PAVGB(%%mm1, %%mm0)                                              /*0012100*/\
1737
                "psllq $8, %%mm5                                        \n\t"\
1738
                "psrlq $8, %%mm2                                        \n\t"\
1739
                "por %%mm3, %%mm5                                        \n\t" /*0100000*/\
1740
                "por %%mm4, %%mm2                                        \n\t" /*0000010*/\
1741
                "movq %%mm5, %%mm1                                        \n\t" /*0100000*/\
1742
                PAVGB(%%mm2, %%mm5)                                              /*0100010*/\
1743
                "psllq $8, %%mm1                                        \n\t"\
1744
                "psrlq $8, %%mm2                                        \n\t"\
1745
                "por %%mm3, %%mm1                                        \n\t" /*1000000*/\
1746
                "por %%mm4, %%mm2                                        \n\t" /*0000001*/\
1747
                "movq %%mm1, %%mm6                                        \n\t" /*1000000*/\
1748
                PAVGB(%%mm2, %%mm1)                                              /*1000001*/\
1749
                "psllq $8, %%mm6                                        \n\t"\
1750
                "psrlq $8, %%mm2                                        \n\t"\
1751
                "por %%mm3, %%mm6                                        \n\t"/*100000000*/\
1752
                "por %%mm4, %%mm2                                        \n\t"/*000000001*/\
1753
                PAVGB(%%mm2, %%mm6)                                             /*100000001*/\
1754
                PAVGB(%%mm6, %%mm1)                                             /*110000011*/\
1755
                PAVGB(%%mm1, %%mm5)                                             /*112000211*/\
1756
                PAVGB(%%mm5, %%mm0)                                             /*112242211*/\
1757
                "movd %%mm0, (%0)                                        \n\t"\
1758
                "psrlq $32, %%mm0                                        \n\t"\
1759
                "movd %%mm0, 4(%0)                                        \n\t"
1760

    
1761
#define HLP(src, dst) NEW_HLP(src, dst)
1762

    
1763
                HLP(0, (%0))
1764
                HLP(8, (%%ecx))
1765
                HLP(16, (%%ecx, %1))
1766
                HLP(24, (%%ecx, %1, 2))
1767
                HLP(32, (%0, %1, 4))
1768
                HLP(40, (%%ebx))
1769
                HLP(48, (%%ebx, %1))
1770
                HLP(56, (%%ebx, %1, 2))
1771

    
1772
                :
1773
                : "r" (dst), "r" (stride)
1774
                : "%eax", "%ebx", "%ecx"
1775
        );
1776

    
1777
#else
1778
        int y;
1779
        for(y=0; y<BLOCK_SIZE; y++)
1780
        {
1781
                const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
1782
                const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
1783

    
1784
                int sums[9];
1785
                sums[0] = first + dst[0];
1786
                sums[1] = dst[0] + dst[1];
1787
                sums[2] = dst[1] + dst[2];
1788
                sums[3] = dst[2] + dst[3];
1789
                sums[4] = dst[3] + dst[4];
1790
                sums[5] = dst[4] + dst[5];
1791
                sums[6] = dst[5] + dst[6];
1792
                sums[7] = dst[6] + dst[7];
1793
                sums[8] = dst[7] + last;
1794

    
1795
                dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
1796
                dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
1797
                dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
1798
                dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
1799
                dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
1800
                dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
1801
                dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4;
1802
                dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
1803

    
1804
                dst+= stride;
1805
        }
1806
#endif
1807
}
1808

    
1809
/**
 * Deringing filter for one block.
 *
 * Only an MMX2 implementation exists; without HAVE_MMX2 this function does
 * nothing (the C version is still missing, see the FIXME at the end).
 *
 * What the MMX2 path does, line numbers relative to src:
 *  - pQPb2 is set to the saturated double of pQPb
 *  - the smallest and largest byte of lines 1..8 (8 bytes per line) are
 *    searched and their PAVGB average "a" is replicated into all 8 bytes
 *    and stored in temp0
 *  - every line from 0 to 9 is smoothed horizontally with
 *    (left + 2*center + right)/4; at the same time it is counted, per pixel,
 *    how many of left/center/right are not above a
 *  - DERING_CORE combines 3 consecutive smoothed lines vertically (again
 *    outer lines averaged first, then averaged with the middle line) and
 *    stores the result into the middle line, but only for pixels where
 *    |filtered - original| does not exceed pQPb2 and where the flags of the
 *    whole 3x3 neighbourhood agree (all above a or none above a)
 *
 * Lines 1..8 are written; lines 0..9 are read, including the 4 bytes left and
 * right of the 8 byte wide block (only columns -1 and 8 are actually used).
 *
 * NOTE(review): b00, b08, pQPb, pQPb2 and temp0 are globals defined outside
 * of this function; the description above assumes b00 is all zero, b08 holds
 * 0x08 in every byte and PAVGB is a bytewise average - confirm there.
 *
 * @param src    top-left pixel of the block
 * @param stride distance in bytes between vertically adjacent pixels
 * @param QP     handed to the asm as operand %2 but never referenced by the
 *               template; the threshold is read from pQPb instead
 *               (presumably derived from QP by the caller - TODO confirm)
 */
static inline void dering(uint8_t src[], int stride, int QP)
{
#ifdef HAVE_MMX2
        asm volatile(
                "movq pQPb, %%mm0                               \n\t"
                "paddusb %%mm0, %%mm0                           \n\t" // 2*QP, saturated
                "movq %%mm0, pQPb2                              \n\t"

                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
//        0        1        2        3        4        5        6        7        8        9
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1

                "pcmpeqb %%mm6, %%mm6                           \n\t" // running min, starts at 0xFF
                "pxor %%mm7, %%mm7                              \n\t" // running max, starts at 0

#define FIND_MIN_MAX(addr)\
                "movq " #addr ", %%mm0                          \n\t"\
                "pminub %%mm0, %%mm6                            \n\t"\
                "pmaxub %%mm0, %%mm7                            \n\t"

FIND_MIN_MAX((%%eax))
FIND_MIN_MAX((%%eax, %1))
FIND_MIN_MAX((%%eax, %1, 2))
FIND_MIN_MAX((%0, %1, 4))
FIND_MIN_MAX((%%ebx))
FIND_MIN_MAX((%%ebx, %1))
FIND_MIN_MAX((%%ebx, %1, 2))
FIND_MIN_MAX((%0, %1, 8))

                // reduce the 8 column minima to a single minimum in the lowest byte
                "movq %%mm6, %%mm4                              \n\t"
                "psrlq $8, %%mm6                                \n\t"
                "pminub %%mm4, %%mm6                            \n\t" // min of pixels
#ifdef HAVE_MMX2
                "pshufw $0xF9, %%mm6, %%mm4                     \n\t"
                "pminub %%mm4, %%mm6                            \n\t" // min of pixels
                "pshufw $0xFE, %%mm6, %%mm4                     \n\t"
#else
                "movq %%mm6, %%mm4                              \n\t"
                "psrlq $16, %%mm6                               \n\t"
                "pminub %%mm4, %%mm6                            \n\t"
                "movq %%mm6, %%mm4                              \n\t"
                "psrlq $32, %%mm6                               \n\t"
#endif
                "pminub %%mm4, %%mm6                            \n\t"


                // same reduction for the maximum
                "movq %%mm7, %%mm4                              \n\t"
                "psrlq $8, %%mm7                                \n\t"
                "pmaxub %%mm4, %%mm7                            \n\t" // max of pixels
#ifdef HAVE_MMX2
                "pshufw $0xF9, %%mm7, %%mm4                     \n\t"
                "pmaxub %%mm4, %%mm7                            \n\t" // max of pixels
                "pshufw $0xFE, %%mm7, %%mm4                     \n\t"
#else
                "movq %%mm7, %%mm4                              \n\t"
                "psrlq $16, %%mm7                               \n\t"
                "pmaxub %%mm4, %%mm7                            \n\t"
                "movq %%mm7, %%mm4                              \n\t"
                "psrlq $32, %%mm7                               \n\t"
#endif
                "pmaxub %%mm4, %%mm7                            \n\t"
                PAVGB(%%mm6, %%mm7)                                   // a=(max + min)/2
                "punpcklbw %%mm7, %%mm7                         \n\t" // replicate the lowest byte ...
                "punpcklbw %%mm7, %%mm7                         \n\t"
                "punpcklbw %%mm7, %%mm7                         \n\t" // ... into all 8 bytes
                "movq %%mm7, temp0                              \n\t"

                // line 0: mm1 = horizontally smoothed line, mm0 = sum of the 3 flags
                "movq (%0), %%mm0                               \n\t" // L10
                "movq %%mm0, %%mm1                              \n\t" // L10
                "movq %%mm0, %%mm2                              \n\t" // L10
                "psllq $8, %%mm1                                \n\t"
                "psrlq $8, %%mm2                                \n\t"
                "movd -4(%0), %%mm3                             \n\t"
                "movd 8(%0), %%mm4                              \n\t"
                "psrlq $24, %%mm3                               \n\t"
                "psllq $56, %%mm4                               \n\t"
                "por %%mm3, %%mm1                               \n\t" // L00
                "por %%mm4, %%mm2                               \n\t" // L20
                "movq %%mm1, %%mm3                              \n\t" // L00
                PAVGB(%%mm2, %%mm1)                                   // (L20 + L00)/2
                PAVGB(%%mm0, %%mm1)                                   // (L20 + L00 + 2L10)/4
                "psubusb %%mm7, %%mm0                           \n\t"
                "psubusb %%mm7, %%mm2                           \n\t"
                "psubusb %%mm7, %%mm3                           \n\t"
                "pcmpeqb b00, %%mm0                             \n\t" // L10 > a ? 0 : -1
                "pcmpeqb b00, %%mm2                             \n\t" // L20 > a ? 0 : -1
                "pcmpeqb b00, %%mm3                             \n\t" // L00 > a ? 0 : -1
                "paddb %%mm2, %%mm0                             \n\t"
                "paddb %%mm3, %%mm0                             \n\t"

                // line 1: mm3 = horizontally smoothed line, mm2 = sum of the 3 flags
                "movq (%%eax), %%mm2                            \n\t" // L11
                "movq %%mm2, %%mm3                              \n\t" // L11
                "movq %%mm2, %%mm4                              \n\t" // L11
                "psllq $8, %%mm3                                \n\t"
                "psrlq $8, %%mm4                                \n\t"
                "movd -4(%%eax), %%mm5                          \n\t"
                "movd 8(%%eax), %%mm6                           \n\t"
                "psrlq $24, %%mm5                               \n\t"
                "psllq $56, %%mm6                               \n\t"
                "por %%mm5, %%mm3                               \n\t" // L01
                "por %%mm6, %%mm4                               \n\t" // L21
                "movq %%mm3, %%mm5                              \n\t" // L01
                PAVGB(%%mm4, %%mm3)                                   // (L21 + L01)/2
                PAVGB(%%mm2, %%mm3)                                   // (L21 + L01 + 2L11)/4
                "psubusb %%mm7, %%mm2                           \n\t"
                "psubusb %%mm7, %%mm4                           \n\t"
                "psubusb %%mm7, %%mm5                           \n\t"
                "pcmpeqb b00, %%mm2                             \n\t" // L11 > a ? 0 : -1
                "pcmpeqb b00, %%mm4                             \n\t" // L21 > a ? 0 : -1
                "pcmpeqb b00, %%mm5                             \n\t" // L01 > a ? 0 : -1
                "paddb %%mm4, %%mm2                             \n\t"
                "paddb %%mm5, %%mm2                             \n\t"
// 0, 2, 3, 1
/*
 * One step of the filter: loads the next line (src), smooths and classifies
 * it, then filters and conditionally rewrites the line before it (dst).
 *   dst          line which is written (the middle one of the 3 lines)
 *   src          line below dst, loaded in this step
 *   ppsx,psx,sx  flag sums of the line above dst, of dst and of src
 *   pplx,plx,lx  horizontally smoothed line above dst, dst and src
 *   t0,t1        temporaries
 * sx and lx are outputs; ppsx and pplx are destroyed, which is why the
 * registers are rotated between the invocations below.
 */
#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
                "movq " #src ", " #sx "                         \n\t" /* src[0] */\
                "movq " #sx ", " #lx "                          \n\t" /* src[0] */\
                "movq " #sx ", " #t0 "                          \n\t" /* src[0] */\
                "psllq $8, " #lx "                              \n\t"\
                "psrlq $8, " #t0 "                              \n\t"\
                "movd -4" #src ", " #t1 "                       \n\t"\
                "psrlq $24, " #t1 "                             \n\t"\
                "por " #t1 ", " #lx "                           \n\t" /* src[-1] */\
                "movd 8" #src ", " #t1 "                        \n\t"\
                "psllq $56, " #t1 "                             \n\t"\
                "por " #t1 ", " #t0 "                           \n\t" /* src[+1] */\
                "movq " #lx ", " #t1 "                          \n\t" /* src[-1] */\
                PAVGB(t0, lx)                                         /* (src[-1] + src[+1])/2 */\
                PAVGB(sx, lx)                                         /* (src[-1] + 2src[0] + src[+1])/4 */\
                "psubusb temp0, " #t1 "                         \n\t"\
                "psubusb temp0, " #t0 "                         \n\t"\
                "psubusb temp0, " #sx "                         \n\t"\
                "pcmpeqb b00, " #t1 "                           \n\t" /* src[-1] > a ? 0 : -1*/\
                "pcmpeqb b00, " #t0 "                           \n\t" /* src[+1] > a ? 0 : -1*/\
                "pcmpeqb b00, " #sx "                           \n\t" /* src[0]  > a ? 0 : -1*/\
                "paddb " #t1 ", " #t0 "                         \n\t"\
                "paddb " #t0 ", " #sx "                         \n\t"\
\
                PAVGB(lx, pplx)                                       /* (line above + line below)/2 */\
                PAVGB(plx, pplx)                                      /* filtered */\
                "movq " #dst ", " #t0 "                         \n\t" /* dst */\
                "movq " #pplx ", " #t1 "                        \n\t" /* keep a copy of filtered */\
                "psubusb " #t0 ", " #pplx "                     \n\t"\
                "psubusb " #t1 ", " #t0 "                       \n\t"\
                "por " #t0 ", " #pplx "                         \n\t" /* |filtered - dst| */\
                "psubusb pQPb2, " #pplx "                       \n\t"\
                "pcmpeqb b00, " #pplx "                         \n\t" /* |filtered - dst| > pQPb2 ? 0 : -1 */\
                "paddb " #sx ", " #ppsx "                       \n\t"\
                "paddb " #psx ", " #ppsx "                      \n\t" /* sum of the 9 flags: 0 .. -9 */\
        "#paddb b02, " #ppsx "                          \n\t"\
                "pand b08, " #ppsx "                            \n\t"\
                "pcmpeqb b00, " #ppsx "                         \n\t" /* -1 if the masked sum is 0 */\
                "pand " #pplx ", " #ppsx "                      \n\t" /* both conditions */\
                "pand " #ppsx ", " #t1 "                        \n\t" /* filtered where selected */\
                "pandn " #dst ", " #ppsx "                      \n\t" /* dst elsewhere */\
                "por " #t1 ", " #ppsx "                         \n\t"\
                "movq " #ppsx ", " #dst "                       \n\t"
/*
low 7 bits of the possible flag sums 0, -1, ..., -9; the bit with value 8
is clear only in the first (0) and in the last (-9) pattern

0000000
1111111

1111110
1111101
1111100
1111011
1111010
1111001

1111000
1110111

*/
//DERING_CORE(dst,src                  ,ppsx ,psx  ,sx   ,pplx ,plx  ,lx   ,t0   ,t1)
DERING_CORE((%%eax),(%%eax, %1)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
DERING_CORE((%0, %1, 4),(%%ebx)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%%ebx),(%%ebx, %1)        ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)


                : : "r" (src), "r" (stride), "r" (QP)
                // "memory": the code stores to pQPb2, temp0 and to the pixels of src,
                // none of which is listed as an output operand
                : "%eax", "%ebx", "memory"
        );
#else

//FIXME no C version yet, the block is left untouched
#endif
}
2000

    
2001
/**
 * Deinterlaces the given block by linear interpolation.
 * Will be called for every 8x8 block, and can read & write into an 8x16 block.
 *
 * Lines 1, 3, 5 and 7 are replaced by the average of the lines directly above
 * and below them (8 pixels per line); lines 0, 2, 4, 6 and 8 are only read.
 * The average is rounded up in the C version, (a + b + 1)>>1, so that it
 * agrees with the SIMD version (assuming PAVGB expands to pavgb / pavgusb,
 * which round up; the macro is defined outside of this function).
 *
 * @param src    top-left pixel of the block
 * @param stride distance in bytes between vertically adjacent pixels
 */
static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
//        0        1        2        3        4        5        6        7        8        9
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1

                "movq (%0), %%mm0                               \n\t" // L0
                "movq (%%eax, %1), %%mm1                        \n\t" // L2
                PAVGB(%%mm1, %%mm0)                                   // (L0+L2)/2
                "movq %%mm0, (%%eax)                            \n\t" // -> L1
                "movq (%0, %1, 4), %%mm0                        \n\t" // L4
                PAVGB(%%mm0, %%mm1)                                   // (L2+L4)/2
                "movq %%mm1, (%%eax, %1, 2)                     \n\t" // -> L3
                "movq (%%ebx, %1), %%mm1                        \n\t" // L6
                PAVGB(%%mm1, %%mm0)                                   // (L4+L6)/2
                "movq %%mm0, (%%ebx)                            \n\t" // -> L5
                "movq (%0, %1, 8), %%mm0                        \n\t" // L8
                PAVGB(%%mm0, %%mm1)                                   // (L6+L8)/2
                "movq %%mm1, (%%ebx, %1, 2)                     \n\t" // -> L7

                : : "r" (src), "r" (stride)
                // "memory": the pixels are written without being an output operand
                : "%eax", "%ebx", "memory"
        );
#else
        int x;
        for(x=0; x<8; x++)
        {
                // +1: round up like the SIMD version instead of truncating
                src[stride]   = (src[0]        + src[stride*2] + 1)>>1;
                src[stride*3] = (src[stride*2] + src[stride*4] + 1)>>1;
                src[stride*5] = (src[stride*4] + src[stride*6] + 1)>>1;
                src[stride*7] = (src[stride*6] + src[stride*8] + 1)>>1;
                src++;
        }
#endif
}
2043

    
2044
/**
 * Deinterlaces the given block by cubic interpolation.
 * Will be called for every 8x8 block, and can read & write into an 8x16 block.
 *
 * Lines 3, 5, 7 and 9 are replaced by (-a + 9b + 9d - e)/16, where b and d are
 * the lines directly above and below the written line and a and e the lines
 * 3 above and 3 below it (8 pixels per line). Lines 0, 2, ..., 12 are only read.
 * The result is clipped to 0..255 in both versions: packuswb saturates in the
 * SIMD version, the C version clamps explicitly.
 *
 * @param src    top-left pixel of the block
 * @param stride distance in bytes between vertically adjacent pixels
 */
static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
                "leal (%%ebx, %1, 4), %%ecx                     \n\t"
                "addl %1, %%ecx                                 \n\t"
                "pxor %%mm7, %%mm7                              \n\t" // zero, used for unpacking
//        0        1        2        3        4        5        6        7        8        9        10
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1 ecx

/* c = (9b + 9d - a - e)/16, computed in 16 bit as (b+d)/2 - ((a+e)/2 - (b+d)/2)/8 */
#define DEINT_CUBIC(a,b,c,d,e)\
                "movq " #a ", %%mm0                             \n\t"\
                "movq " #b ", %%mm1                             \n\t"\
                "movq " #d ", %%mm2                             \n\t"\
                "movq " #e ", %%mm3                             \n\t"\
                PAVGB(%%mm2, %%mm1)                                   /* (b+d) /2 */\
                PAVGB(%%mm3, %%mm0)                                   /* (a+e) /2 */\
                "movq %%mm0, %%mm2                              \n\t"\
                "punpcklbw %%mm7, %%mm0                         \n\t"\
                "punpckhbw %%mm7, %%mm2                         \n\t"\
                "movq %%mm1, %%mm3                              \n\t"\
                "punpcklbw %%mm7, %%mm1                         \n\t"\
                "punpckhbw %%mm7, %%mm3                         \n\t"\
                "psubw %%mm1, %%mm0                             \n\t" /* L(a+e - (b+d))/2 */\
                "psubw %%mm3, %%mm2                             \n\t" /* H(a+e - (b+d))/2 */\
                "psraw $3, %%mm0                                \n\t" /* L(a+e - (b+d))/16 */\
                "psraw $3, %%mm2                                \n\t" /* H(a+e - (b+d))/16 */\
                "psubw %%mm0, %%mm1                             \n\t" /* L(9b + 9d - a - e)/16 */\
                "psubw %%mm2, %%mm3                             \n\t" /* H(9b + 9d - a - e)/16 */\
                "packuswb %%mm3, %%mm1                          \n\t" /* clip to 0..255 */\
                "movq %%mm1, " #c "                             \n\t"

DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))

                : : "r" (src), "r" (stride)
                // "memory": the pixels are written without being an output operand
                : "%eax", "%ebx", "%ecx", "memory"
        );
#else
        int x;
        for(x=0; x<8; x++)
        {
                int y;
                // y is the first of the 4 even input lines, the output line is y+3
                for(y=0; y<8; y+=2)
                {
                        int v= -src[stride*y] + 9*src[stride*(y+2)]
                               + 9*src[stride*(y+4)] - src[stride*(y+6)];

                        // clip, a plain store would wrap around on over/undershoot
                        if(v < 0) v= 0;
                        else
                        {
                                v>>= 4;
                                if(v > 255) v= 255;
                        }
                        src[stride*(y+3)]= v;
                }
                src++;
        }
#endif
}
2103

    
2104
/**
 * Deinterlaces the given block by blending every line with its two successors.
 * Will be called for every 8x8 block, and can read & write into an 8x16 block.
 * Will shift the image up by 1 line (FIXME if this is a problem).
 *
 * Lines 0..7 are replaced by (L[n] + 2*L[n+1] + L[n+2])/4 (8 pixels per line),
 * lines 8 and 9 are only read. The C version truncates the division; the SIMD
 * version builds the result from two PAVGB averages instead, so the rounding
 * of the two versions may differ.
 *
 * @param src    top-left pixel of the block
 * @param stride distance in bytes between vertically adjacent pixels
 */
static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
//        0        1        2        3        4        5        6        7        8        9
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1

                "movq (%0), %%mm0                               \n\t" // L0
                "movq (%%eax, %1), %%mm1                        \n\t" // L2
                PAVGB(%%mm1, %%mm0)                                   // L0+L2
                "movq (%%eax), %%mm2                            \n\t" // L1
                PAVGB(%%mm2, %%mm0)                                   // 2L1 + L0 + L2
                "movq %%mm0, (%0)                               \n\t"
                "movq (%%eax, %1, 2), %%mm0                     \n\t" // L3
                PAVGB(%%mm0, %%mm2)                                   // L1+L3
                PAVGB(%%mm1, %%mm2)                                   // 2L2 + L1 + L3
                "movq %%mm2, (%%eax)                            \n\t"
                "movq (%0, %1, 4), %%mm2                        \n\t" // L4
                PAVGB(%%mm2, %%mm1)                                   // L2+L4
                PAVGB(%%mm0, %%mm1)                                   // 2L3 + L2 + L4
                "movq %%mm1, (%%eax, %1)                        \n\t"
                "movq (%%ebx), %%mm1                            \n\t" // L5
                PAVGB(%%mm1, %%mm0)                                   // L3+L5
                PAVGB(%%mm2, %%mm0)                                   // 2L4 + L3 + L5
                "movq %%mm0, (%%eax, %1, 2)                     \n\t"
                "movq (%%ebx, %1), %%mm0                        \n\t" // L6
                PAVGB(%%mm0, %%mm2)                                   // L4+L6
                PAVGB(%%mm1, %%mm2)                                   // 2L5 + L4 + L6
                "movq %%mm2, (%0, %1, 4)                        \n\t"
                "movq (%%ebx, %1, 2), %%mm2                     \n\t" // L7
                PAVGB(%%mm2, %%mm1)                                   // L5+L7
                PAVGB(%%mm0, %%mm1)                                   // 2L6 + L5 + L7
                "movq %%mm1, (%%ebx)                            \n\t"
                "movq (%0, %1, 8), %%mm1                        \n\t" // L8
                PAVGB(%%mm1, %%mm0)                                   // L6+L8
                PAVGB(%%mm2, %%mm0)                                   // 2L7 + L6 + L8
                "movq %%mm0, (%%ebx, %1)                        \n\t"
                "movq (%%ebx, %1, 4), %%mm0                     \n\t" // L9
                PAVGB(%%mm0, %%mm2)                                   // L7+L9
                PAVGB(%%mm1, %%mm2)                                   // 2L8 + L7 + L9
                "movq %%mm2, (%%ebx, %1, 2)                     \n\t"


                : : "r" (src), "r" (stride)
                // "memory": the pixels are written without being an output operand
                : "%eax", "%ebx", "memory"
        );
#else
        int x;
        for(x=0; x<8; x++)
        {
                int y;
                // top to bottom: a line is overwritten only after the lines
                // above it, which are the only ones reading it, are finished
                for(y=0; y<8; y++)
                {
                        uint8_t *line= src + stride*y;
                        line[0]= (line[0] + 2*line[stride] + line[stride*2])>>2;
                }
                src++;
        }
#endif
}
2173

    
2174
/**
 * Deinterlaces the given block with a vertical 3-tap median filter.
 * Every odd line is replaced by the median of itself and the two even lines
 * directly above and below it:
 *   line1 = median(line0, line1, line2),  line3 = median(line2, line3, line4),
 *   line5 = median(line4, line5, line6),  line7 = median(line6, line7, line8)
 * Even lines are only read, never written.
 * Will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block
 * (this implementation reads lines 0..8 and writes lines 1, 3, 5 and 7).
 *
 * @param src    top left pixel of the block (8 pixels wide)
 * @param stride distance in bytes between two vertically adjacent pixels
 */
static inline void deInterlaceMedian(uint8_t src[], int stride)
{
#ifdef HAVE_MMX
#ifdef HAVE_MMX2
        asm volatile(
                "leal (%0, %1), %%eax                   \n\t"
                "leal (%%eax, %1, 4), %%ebx             \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1

                // line1 = median(line0, line1, line2)
                "movq (%0), %%mm0                       \n\t" // L0
                "movq (%%eax, %1), %%mm2                \n\t" // L2
                "movq (%%eax), %%mm1                    \n\t" // L1
                "movq %%mm0, %%mm3                      \n\t"
                "pmaxub %%mm1, %%mm0                    \n\t" // max(L0, L1)
                "pminub %%mm3, %%mm1                    \n\t" // min(L0, L1)
                "pmaxub %%mm2, %%mm1                    \n\t" // max(min(L0, L1), L2)
                "pminub %%mm1, %%mm0                    \n\t" // median
                "movq %%mm0, (%%eax)                    \n\t"

                // line3 = median(line2, line3, line4), L2 is still in mm2
                "movq (%0, %1, 4), %%mm0                \n\t" // L4
                "movq (%%eax, %1, 2), %%mm1             \n\t" // L3
                "movq %%mm2, %%mm3                      \n\t"
                "pmaxub %%mm1, %%mm2                    \n\t"
                "pminub %%mm3, %%mm1                    \n\t"
                "pmaxub %%mm0, %%mm1                    \n\t"
                "pminub %%mm1, %%mm2                    \n\t"
                "movq %%mm2, (%%eax, %1, 2)             \n\t"

                // line5 = median(line4, line5, line6), L4 is still in mm0
                "movq (%%ebx), %%mm2                    \n\t" // L5
                "movq (%%ebx, %1), %%mm1                \n\t" // L6
                "movq %%mm2, %%mm3                      \n\t"
                "pmaxub %%mm0, %%mm2                    \n\t"
                "pminub %%mm3, %%mm0                    \n\t"
                "pmaxub %%mm1, %%mm0                    \n\t"
                "pminub %%mm0, %%mm2                    \n\t"
                "movq %%mm2, (%%ebx)                    \n\t"

                // line7 = median(line6, line7, line8), L6 is still in mm1
                "movq (%%ebx, %1, 2), %%mm2             \n\t" // L7
                "movq (%0, %1, 8), %%mm0                \n\t" // L8
                "movq %%mm2, %%mm3                      \n\t"
                "pmaxub %%mm0, %%mm2                    \n\t"
                "pminub %%mm3, %%mm0                    \n\t"
                "pmaxub %%mm1, %%mm0                    \n\t"
                "pminub %%mm0, %%mm2                    \n\t"
                "movq %%mm2, (%%ebx, %1, 2)             \n\t"

                : : "r" (src), "r" (stride)
                // "memory": the asm stores through src, which is only passed as an input
                : "%eax", "%ebx", "memory"
        );

#else // MMX without MMX2
        asm volatile(
                "leal (%0, %1), %%eax                   \n\t"
                "leal (%%eax, %1, 4), %%ebx             \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
                "pxor %%mm7, %%mm7                      \n\t" // mm7 = 0, reference for pcmpeqb

// Stores median(a, b, c) into b.
// Plain MMX has no pmaxub/pminub: saturated subtraction + compare with zero
// yields "<=" masks, the xor of two masks marks a value as "not the median",
// such values are forced to 0xFF and the final pand leaves only the median.
#define MEDIAN(a,b,c)\
                "movq " #a ", %%mm0                     \n\t"\
                "movq " #b ", %%mm2                     \n\t"\
                "movq " #c ", %%mm1                     \n\t"\
                "movq %%mm0, %%mm3                      \n\t"\
                "movq %%mm1, %%mm4                      \n\t"\
                "movq %%mm2, %%mm5                      \n\t"\
                "psubusb %%mm1, %%mm3                   \n\t"\
                "psubusb %%mm2, %%mm4                   \n\t"\
                "psubusb %%mm0, %%mm5                   \n\t"\
                "pcmpeqb %%mm7, %%mm3                   \n\t"\
                "pcmpeqb %%mm7, %%mm4                   \n\t"\
                "pcmpeqb %%mm7, %%mm5                   \n\t"\
                "movq %%mm3, %%mm6                      \n\t"\
                "pxor %%mm4, %%mm3                      \n\t"\
                "pxor %%mm5, %%mm4                      \n\t"\
                "pxor %%mm6, %%mm5                      \n\t"\
                "por %%mm3, %%mm1                       \n\t"\
                "por %%mm4, %%mm2                       \n\t"\
                "por %%mm5, %%mm0                       \n\t"\
                "pand %%mm2, %%mm0                      \n\t"\
                "pand %%mm1, %%mm0                      \n\t"\
                "movq %%mm0, " #b "                     \n\t"

MEDIAN((%0), (%%eax), (%%eax, %1))
MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8))

                : : "r" (src), "r" (stride)
                // "memory": the asm stores through src, which is only passed as an input
                : "%eax", "%ebx", "memory"
        );
#endif // HAVE_MMX2
#else
        /* Portable version of the same filter as the MMX code above
           (the previous fallback was a copy of the linear blend filter). */
        int x, y;
        for(x=0; x<8; x++)
        {
                uint8_t *col= src + x;
                for(y=0; y<4; y++)
                {
                        const int above= col[0];
                        const int cur  = col[stride];
                        const int below= col[stride*2];
                        const int lo= above < below ? above : below;
                        const int hi= above < below ? below : above;

                        /* median of 3 == the middle value clamped into [lo, hi] */
                        col[stride]= cur < lo ? lo : (cur > hi ? hi : cur);
                        col+= stride*2; // next odd line, even lines are left untouched
                }
        }
#endif
}
2288

    
2289
#ifdef HAVE_MMX
2290
/**
2291
 * transposes and shift the given 8x8 Block into dst1 and dst2
2292
 */
2293
static inline void transpose1(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
2294
{
2295
        asm(
2296
                "leal (%0, %1), %%eax                                \n\t"
2297
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
2298
//        0        1        2        3        4        5        6        7        8        9
2299
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
2300
                "movq (%0), %%mm0                \n\t" // 12345678
2301
                "movq (%%eax), %%mm1                \n\t" // abcdefgh
2302
                "movq %%mm0, %%mm2                \n\t" // 12345678
2303
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
2304
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
2305

    
2306
                "movq (%%eax, %1), %%mm1        \n\t"
2307
                "movq (%%eax, %1, 2), %%mm3        \n\t"
2308
                "movq %%mm1, %%mm4                \n\t"
2309
                "punpcklbw %%mm3, %%mm1                \n\t"
2310
                "punpckhbw %%mm3, %%mm4                \n\t"
2311

    
2312
                "movq %%mm0, %%mm3                \n\t"
2313
                "punpcklwd %%mm1, %%mm0                \n\t"
2314
                "punpckhwd %%mm1, %%mm3                \n\t"
2315
                "movq %%mm2, %%mm1                \n\t"
2316
                "punpcklwd %%mm4, %%mm2                \n\t"
2317
                "punpckhwd %%mm4, %%mm1                \n\t"
2318

    
2319
                "movd %%mm0, 128(%2)                \n\t"
2320
                "psrlq $32, %%mm0                \n\t"
2321
                "movd %%mm0, 144(%2)                \n\t"
2322
                "movd %%mm3, 160(%2)                \n\t"
2323
                "psrlq $32, %%mm3                \n\t"
2324
                "movd %%mm3, 176(%2)                \n\t"
2325
                "movd %%mm3, 48(%3)                \n\t"
2326
                "movd %%mm2, 192(%2)                \n\t"
2327
                "movd %%mm2, 64(%3)                \n\t"
2328
                "psrlq $32, %%mm2                \n\t"
2329
                "movd %%mm2, 80(%3)                \n\t"
2330
                "movd %%mm1, 96(%3)                \n\t"
2331
                "psrlq $32, %%mm1                \n\t"
2332
                "movd %%mm1, 112(%3)                \n\t"
2333

    
2334
                "movq (%0, %1, 4), %%mm0        \n\t" // 12345678
2335
                "movq (%%ebx), %%mm1                \n\t" // abcdefgh
2336
                "movq %%mm0, %%mm2                \n\t" // 12345678
2337
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
2338
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
2339

    
2340
                "movq (%%ebx, %1), %%mm1        \n\t"
2341
                "movq (%%ebx, %1, 2), %%mm3        \n\t"
2342
                "movq %%mm1, %%mm4                \n\t"
2343
                "punpcklbw %%mm3, %%mm1                \n\t"
2344
                "punpckhbw %%mm3, %%mm4                \n\t"
2345

    
2346
                "movq %%mm0, %%mm3                \n\t"
2347
                "punpcklwd %%mm1, %%mm0                \n\t"
2348
                "punpckhwd %%mm1, %%mm3                \n\t"
2349
                "movq %%mm2, %%mm1                \n\t"
2350
                "punpcklwd %%mm4, %%mm2                \n\t"
2351
                "punpckhwd %%mm4, %%mm1                \n\t"
2352

    
2353
                "movd %%mm0, 132(%2)                \n\t"
2354
                "psrlq $32, %%mm0                \n\t"
2355
                "movd %%mm0, 148(%2)                \n\t"
2356
                "movd %%mm3, 164(%2)                \n\t"
2357
                "psrlq $32, %%mm3                \n\t"
2358
                "movd %%mm3, 180(%2)                \n\t"
2359
                "movd %%mm3, 52(%3)                \n\t"
2360
                "movd %%mm2, 196(%2)                \n\t"
2361
                "movd %%mm2, 68(%3)                \n\t"
2362
                "psrlq $32, %%mm2                \n\t"
2363
                "movd %%mm2, 84(%3)                \n\t"
2364
                "movd %%mm1, 100(%3)                \n\t"
2365
                "psrlq $32, %%mm1                \n\t"
2366
                "movd %%mm1, 116(%3)                \n\t"
2367

    
2368

    
2369
        :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
2370
        : "%eax", "%ebx"
2371
        );
2372
}
2373

    
2374
/**
2375
 * transposes the given 8x8 block
2376
 */
2377
static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src)
2378
{
2379
        asm(
2380
                "leal (%0, %1), %%eax                                \n\t"
2381
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
2382
//        0        1        2        3        4        5        6        7        8        9
2383
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
2384
                "movq (%2), %%mm0                \n\t" // 12345678
2385
                "movq 16(%2), %%mm1                \n\t" // abcdefgh
2386
                "movq %%mm0, %%mm2                \n\t" // 12345678
2387
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
2388
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
2389

    
2390
                "movq 32(%2), %%mm1                \n\t"
2391
                "movq 48(%2), %%mm3                \n\t"
2392
                "movq %%mm1, %%mm4                \n\t"
2393
                "punpcklbw %%mm3, %%mm1                \n\t"
2394
                "punpckhbw %%mm3, %%mm4                \n\t"
2395

    
2396
                "movq %%mm0, %%mm3                \n\t"
2397
                "punpcklwd %%mm1, %%mm0                \n\t"
2398
                "punpckhwd %%mm1, %%mm3                \n\t"
2399
                "movq %%mm2, %%mm1                \n\t"
2400
                "punpcklwd %%mm4, %%mm2                \n\t"
2401
                "punpckhwd %%mm4, %%mm1                \n\t"
2402

    
2403
                "movd %%mm0, (%0)                \n\t"
2404
                "psrlq $32, %%mm0                \n\t"
2405
                "movd %%mm0, (%%eax)                \n\t"
2406
                "movd %%mm3, (%%eax, %1)        \n\t"
2407
                "psrlq $32, %%mm3                \n\t"
2408
                "movd %%mm3, (%%eax, %1, 2)        \n\t"
2409
                "movd %%mm2, (%0, %1, 4)        \n\t"
2410
                "psrlq $32, %%mm2                \n\t"
2411
                "movd %%mm2, (%%ebx)                \n\t"
2412
                "movd %%mm1, (%%ebx, %1)        \n\t"
2413
                "psrlq $32, %%mm1                \n\t"
2414
                "movd %%mm1, (%%ebx, %1, 2)        \n\t"
2415

    
2416

    
2417
                "movq 64(%2), %%mm0                \n\t" // 12345678
2418
                "movq 80(%2), %%mm1                \n\t" // abcdefgh
2419
                "movq %%mm0, %%mm2                \n\t" // 12345678
2420
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
2421
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
2422

    
2423
                "movq 96(%2), %%mm1                \n\t"
2424
                "movq 112(%2), %%mm3                \n\t"
2425
                "movq %%mm1, %%mm4                \n\t"
2426
                "punpcklbw %%mm3, %%mm1                \n\t"
2427
                "punpckhbw %%mm3, %%mm4                \n\t"
2428

    
2429
                "movq %%mm0, %%mm3                \n\t"
2430
                "punpcklwd %%mm1, %%mm0                \n\t"
2431
                "punpckhwd %%mm1, %%mm3                \n\t"
2432
                "movq %%mm2, %%mm1                \n\t"
2433
                "punpcklwd %%mm4, %%mm2                \n\t"
2434
                "punpckhwd %%mm4, %%mm1                \n\t"
2435

    
2436
                "movd %%mm0, 4(%0)                \n\t"
2437
                "psrlq $32, %%mm0                \n\t"
2438
                "movd %%mm0, 4(%%eax)                \n\t"
2439
                "movd %%mm3, 4(%%eax, %1)        \n\t"
2440
                "psrlq $32, %%mm3                \n\t"
2441
                "movd %%mm3, 4(%%eax, %1, 2)        \n\t"
2442
                "movd %%mm2, 4(%0, %1, 4)        \n\t"
2443
                "psrlq $32, %%mm2                \n\t"
2444
                "movd %%mm2, 4(%%ebx)                \n\t"
2445
                "movd %%mm1, 4(%%ebx, %1)        \n\t"
2446
                "psrlq $32, %%mm1                \n\t"
2447
                "movd %%mm1, 4(%%ebx, %1, 2)        \n\t"
2448

    
2449
        :: "r" (dst), "r" (dstStride), "r" (src)
2450
        : "%eax", "%ebx"
2451
        );
2452
}
2453
#endif
2454

    
2455
#ifdef HAVE_ODIVX_POSTPROCESS
2456
#include "../opendivx/postprocess.h"
2457
int use_old_pp=0;
2458
#endif
2459

    
2460
static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2461
        QP_STORE_T QPs[], int QPStride, int isColor, int mode);
2462

    
2463
/* -pp Command line Help
2464
NOTE/FIXME: put this at an appropriate place (--help, html docs, man mplayer)?
2465

2466
-pp <filterName>[:<option>[:<option>...]][,[-]<filterName>[:<option>...]]...
2467

2468
long form example:
2469
-pp vdeblock:autoq,hdeblock:autoq,linblenddeint                -pp default,-vdeblock
2470
short form example:
2471
-pp vb:a,hb:a,lb                                        -pp de,-vb
2472

2473
Filters                        Options
2474
short        long name        short        long option        Description
2475
*        *                a        autoq                cpu power dependent enabler
2476
                        c        chrom                chrominance filtering enabled
2477
                        y        nochrom                chrominance filtering disabled
2478
hb        hdeblock                                horizontal deblocking filter
2479
vb        vdeblock                                vertical deblocking filter
2480
vr        rkvdeblock
2481
h1        x1hdeblock                                Experimental horizontal deblock filter 1
2482
v1        x1vdeblock                                Experimental vertical deblock filter 1
2483
dr        dering                                        not implemented yet
2484
al        autolevels                                automatic brightness / contrast fixer
2485
                        f        fullyrange        stretch luminance range to (0..255)
2486
lb        linblenddeint                                linear blend deinterlacer
2487
li        linipoldeint                                linear interpolating deinterlacer
2488
ci        cubicipoldeint                                cubic interpolating deinterlacer
2489
md        mediandeint                                median deinterlacer
2490
de        default                                        hdeblock:a,vdeblock:a,dering:a,autolevels
2491
fa        fast                                        x1hdeblock:a,x1vdeblock:a,dering:a,autolevels
2492
*/
2493

    
2494
/**
2495
 * returns a PPMode struct which will have a non 0 error variable if an error occured
2496
 * name is the string after "-pp" on the command line
2497
 * quality is a number from 0 to GET_PP_QUALITY_MAX
2498
 */
2499
struct PPMode getPPModeByNameAndQuality(char *name, int quality)
2500
{
2501
        char temp[GET_MODE_BUFFER_SIZE];
2502
        char *p= temp;
2503
        char *filterDelimiters= ",";
2504
        char *optionDelimiters= ":";
2505
        struct PPMode ppMode= {0,0,0,0,0,0};
2506
        char *filterToken;
2507

    
2508
        strncpy(temp, name, GET_MODE_BUFFER_SIZE);
2509

    
2510
        for(;;){
2511
                char *filterName;
2512
                int q= GET_PP_QUALITY_MAX;
2513
                int chrom=-1;
2514
                char *option;
2515
                char *options[OPTIONS_ARRAY_SIZE];
2516
                int i;
2517
                int filterNameOk=0;
2518
                int numOfUnknownOptions=0;
2519
                int enable=1; //does the user want us to enabled or disabled the filter
2520

    
2521
                filterToken= strtok(p, filterDelimiters);
2522
                if(filterToken == NULL) break;
2523
                p+= strlen(filterToken) + 1;
2524
                filterName= strtok(filterToken, optionDelimiters);
2525
                printf("%s::%s\n", filterToken, filterName);
2526

    
2527
                if(*filterName == '-')
2528
                {
2529
                        enable=0;
2530
                        filterName++;
2531
                }
2532
                for(;;){ //for all options
2533
                        option= strtok(NULL, optionDelimiters);
2534
                        if(option == NULL) break;
2535

    
2536
                        printf("%s\n", option);
2537
                        if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
2538
                        else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
2539
                        else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
2540
                        else
2541
                        {
2542
                                options[numOfUnknownOptions] = option;
2543
                                numOfUnknownOptions++;
2544
                                options[numOfUnknownOptions] = NULL;
2545
                        }
2546
                        if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
2547
                }
2548

    
2549
                /* replace stuff from the replace Table */
2550
                for(i=0; replaceTable[2*i]!=NULL; i++)
2551
                {
2552
                        if(!strcmp(replaceTable[2*i], filterName))
2553
                        {
2554
                                int newlen= strlen(replaceTable[2*i + 1]);
2555
                                int plen;
2556
                                int spaceLeft;
2557

    
2558
                                if(p==NULL) p= temp, *p=0;         //last filter
2559
                                else p--, *p=',';                //not last filter
2560

    
2561
                                plen= strlen(p);
2562
                                spaceLeft= (int)p - (int)temp + plen;
2563
                                if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE)
2564
                                {
2565
                                        ppMode.error++;
2566
                                        break;
2567
                                }
2568
                                memmove(p + newlen, p, plen+1);
2569
                                memcpy(p, replaceTable[2*i + 1], newlen);
2570
                                filterNameOk=1;
2571
                        }
2572
                }
2573

    
2574
                for(i=0; filters[i].shortName!=NULL; i++)
2575
                {
2576
                        if(   !strcmp(filters[i].longName, filterName)
2577
                           || !strcmp(filters[i].shortName, filterName))
2578
                        {
2579
                                ppMode.lumMode &= ~filters[i].mask;
2580
                                ppMode.chromMode &= ~filters[i].mask;
2581

    
2582
                                filterNameOk=1;
2583
                                if(!enable) break; // user wants to disable it
2584

    
2585
                                if(q >= filters[i].minLumQuality)
2586
                                        ppMode.lumMode|= filters[i].mask;
2587
                                if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
2588
                                        if(q >= filters[i].minChromQuality)
2589
                                                ppMode.chromMode|= filters[i].mask;
2590

    
2591
                                if(filters[i].mask == LEVEL_FIX)
2592
                                {
2593
                                        int o;
2594
                                        ppMode.minAllowedY= 16;
2595
                                        ppMode.maxAllowedY= 234;
2596
                                        for(o=0; options[o]!=NULL; o++)
2597
                                                if(  !strcmp(options[o],"fullyrange")
2598
                                                   ||!strcmp(options[o],"f"))
2599
                                                {
2600
                                                        ppMode.minAllowedY= 0;
2601
                                                        ppMode.maxAllowedY= 255;
2602
                                                        numOfUnknownOptions--;
2603
                                                }
2604
                                }
2605
                        }
2606
                }
2607
                if(!filterNameOk) ppMode.error++;
2608
                ppMode.error += numOfUnknownOptions;
2609
        }
2610

    
2611
        if(ppMode.lumMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_H;
2612
        if(ppMode.lumMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_V;
2613
        if(ppMode.chromMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_H;
2614
        if(ppMode.chromMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_V;
2615
        if(ppMode.lumMode & DERING) ppMode.oldMode |= PP_DERING_Y;
2616
        if(ppMode.chromMode & DERING) ppMode.oldMode |= PP_DERING_C;
2617

    
2618
        return ppMode;
2619
}
2620

    
2621
/**
2622
 * ...
2623
 */
2624
void  postprocess(unsigned char * src[], int src_stride,
2625
                 unsigned char * dst[], int dst_stride,
2626
                 int horizontal_size,   int vertical_size,
2627
                 QP_STORE_T *QP_store,  int QP_stride,
2628
                                          int mode)
2629
{
2630
/*
2631
        static int qual=0;
2632

2633
        struct PPMode ppMode= getPPModeByNameAndQuality("fast,default,-hdeblock,-vdeblock", qual);
2634
        qual++;
2635
        qual%=7;
2636
        printf("\n%d %d %d %d\n", ppMode.lumMode, ppMode.chromMode, ppMode.oldMode, ppMode.error);
2637
        postprocess2(src, src_stride, dst, dst_stride,
2638
                 horizontal_size, vertical_size, QP_store, QP_stride, &ppMode);
2639

2640
        return;
2641
*/
2642

    
2643
#ifdef HAVE_ODIVX_POSTPROCESS
2644
// Note: I could make this shit outside of this file, but it would mean one
2645
// more function call...
2646
        if(use_old_pp){
2647
            odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,mode);
2648
            return;
2649
        }
2650
#endif
2651

    
2652
        postProcess(src[0], src_stride, dst[0], dst_stride,
2653
                horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
2654

    
2655
        horizontal_size >>= 1;
2656
        vertical_size   >>= 1;
2657
        src_stride      >>= 1;
2658
        dst_stride      >>= 1;
2659
        mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
2660

    
2661
        if(1)
2662
        {
2663
                postProcess(src[1], src_stride, dst[1], dst_stride,
2664
                        horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
2665
                postProcess(src[2], src_stride, dst[2], dst_stride,
2666
                        horizontal_size, vertical_size, QP_store, QP_stride, 2, mode);
2667
        }
2668
        else
2669
        {
2670
                memcpy(dst[1], src[1], src_stride*horizontal_size);
2671
                memcpy(dst[2], src[2], src_stride*horizontal_size);
2672
        }
2673
}
2674

    
2675
void  postprocess2(unsigned char * src[], int src_stride,
2676
                 unsigned char * dst[], int dst_stride,
2677
                 int horizontal_size,   int vertical_size,
2678
                 QP_STORE_T *QP_store,  int QP_stride,
2679
                 struct PPMode *mode)
2680
{
2681

    
2682
#ifdef HAVE_ODIVX_POSTPROCESS
2683
// Note: I could make this shit outside of this file, but it would mean one
2684
// more function call...
2685
        if(use_old_pp){
2686
            odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,
2687
            mode->oldMode);
2688
            return;
2689
        }
2690
#endif
2691

    
2692
        postProcess(src[0], src_stride, dst[0], dst_stride,
2693
                horizontal_size, vertical_size, QP_store, QP_stride, 0, mode->lumMode);
2694

    
2695
        horizontal_size >>= 1;
2696
        vertical_size   >>= 1;
2697
        src_stride      >>= 1;
2698
        dst_stride      >>= 1;
2699

    
2700
        postProcess(src[1], src_stride, dst[1], dst_stride,
2701
                horizontal_size, vertical_size, QP_store, QP_stride, 1, mode->chromMode);
2702
        postProcess(src[2], src_stride, dst[2], dst_stride,
2703
                horizontal_size, vertical_size, QP_store, QP_stride, 2, mode->chromMode);
2704
}
2705

    
2706

    
2707
/**
2708
 * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
2709
 * 0 <= quality <= 6
2710
 */
2711
int getPpModeForQuality(int quality){
2712
        int modes[1+GET_PP_QUALITY_MAX]= {
2713
                0,
2714
#if 1
2715
                // horizontal filters first
2716
                LUM_H_DEBLOCK,
2717
                LUM_H_DEBLOCK | LUM_V_DEBLOCK,
2718
                LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK,
2719
                LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK,
2720
                LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING,
2721
                LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING | CHROM_DERING
2722
#else
2723
                // vertical filters first
2724
                LUM_V_DEBLOCK,
2725
                LUM_V_DEBLOCK | LUM_H_DEBLOCK,
2726
                LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
2727
                LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
2728
                LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
2729
                LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
2730
#endif
2731
        };
2732

    
2733
#ifdef HAVE_ODIVX_POSTPROCESS
2734
        int odivx_modes[1+GET_PP_QUALITY_MAX]= {
2735
                0,
2736
                PP_DEBLOCK_Y_H,
2737
                PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V,
2738
                PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H,
2739
                PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V,
2740
                PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y,
2741
                PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y|PP_DERING_C
2742
        };
2743
        if(use_old_pp) return odivx_modes[quality];
2744
#endif
2745
        return modes[quality];
2746
}
2747

    
2748
/**
2749
 * Copies a block from src to dst and fixes the blacklevel
2750
 * numLines must be a multiple of 4
2751
 * levelFix == 0 -> don't touch the brightness & contrast
2752
 */
2753
static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
2754
        int numLines, int levelFix)
2755
{
2756
#ifndef HAVE_MMX
2757
        int i;
2758
#endif
2759
        if(levelFix)
2760
        {
2761
#ifdef HAVE_MMX
2762
                                        asm volatile(
2763
                                                "leal (%2,%2), %%eax        \n\t"
2764
                                                "leal (%3,%3), %%ebx        \n\t"
2765
                                                "movq packedYOffset, %%mm2        \n\t"
2766
                                                "movq packedYScale, %%mm3        \n\t"
2767
                                                "pxor %%mm4, %%mm4        \n\t"
2768

    
2769
#define SCALED_CPY                                        \
2770
                                                "movq (%0), %%mm0        \n\t"\
2771
                                                "movq (%0), %%mm5        \n\t"\
2772
                                                "punpcklbw %%mm4, %%mm0 \n\t"\
2773
                                                "punpckhbw %%mm4, %%mm5 \n\t"\
2774
                                                "psubw %%mm2, %%mm0        \n\t"\
2775
                                                "psubw %%mm2, %%mm5        \n\t"\
2776
                                                "movq (%0,%2), %%mm1        \n\t"\
2777
                                                "psllw $6, %%mm0        \n\t"\
2778
                                                "psllw $6, %%mm5        \n\t"\
2779
                                                "pmulhw %%mm3, %%mm0        \n\t"\
2780
                                                "movq (%0,%2), %%mm6        \n\t"\
2781
                                                "pmulhw %%mm3, %%mm5        \n\t"\
2782
                                                "punpcklbw %%mm4, %%mm1 \n\t"\
2783
                                                "punpckhbw %%mm4, %%mm6 \n\t"\
2784
                                                "psubw %%mm2, %%mm1        \n\t"\
2785
                                                "psubw %%mm2, %%mm6        \n\t"\
2786
                                                "psllw $6, %%mm1        \n\t"\
2787
                                                "psllw $6, %%mm6        \n\t"\
2788
                                                "pmulhw %%mm3, %%mm1        \n\t"\
2789
                                                "pmulhw %%mm3, %%mm6        \n\t"\
2790
                                                "addl %%eax, %0                \n\t"\
2791
                                                "packuswb %%mm5, %%mm0        \n\t"\
2792
                                                "packuswb %%mm6, %%mm1        \n\t"\
2793
                                                "movq %%mm0, (%1)        \n\t"\
2794
                                                "movq %%mm1, (%1, %3)        \n\t"\
2795

    
2796
SCALED_CPY
2797
                                                "addl %%ebx, %1                \n\t"
2798
SCALED_CPY
2799
                                                "addl %%ebx, %1                \n\t"
2800
SCALED_CPY
2801
                                                "addl %%ebx, %1                \n\t"
2802
SCALED_CPY
2803

    
2804
                                                : "+r"(src),
2805
                                                "+r"(dst)
2806
                                                :"r" (srcStride),
2807
                                                "r" (dstStride)
2808
                                                : "%eax", "%ebx"
2809
                                        );
2810
#else
2811
                                for(i=0; i<numLines; i++)
2812
                                        memcpy(        &(dst[dstStride*i]),
2813
                                                &(src[srcStride*i]), BLOCK_SIZE);
2814
#endif
2815
        }
2816
        else
2817
        {
2818
#ifdef HAVE_MMX
2819
                                        asm volatile(
2820
                                                "movl %4, %%eax \n\t"
2821
                                                "movl %%eax, temp0\n\t"
2822
                                                "pushl %0 \n\t"
2823
                                                "pushl %1 \n\t"
2824
                                                "leal (%2,%2), %%eax        \n\t"
2825
                                                "leal (%3,%3), %%ebx        \n\t"
2826
                                                "movq packedYOffset, %%mm2        \n\t"
2827
                                                "movq packedYScale, %%mm3        \n\t"
2828

    
2829
#define SIMPLE_CPY                                        \
2830
                                                "movq (%0), %%mm0        \n\t"\
2831
                                                "movq (%0,%2), %%mm1        \n\t"\
2832
                                                "movq %%mm0, (%1)        \n\t"\
2833
                                                "movq %%mm1, (%1, %3)        \n\t"\
2834

    
2835
                                                "1:                        \n\t"
2836
SIMPLE_CPY
2837
                                                "addl %%eax, %0                \n\t"
2838
                                                "addl %%ebx, %1                \n\t"
2839
SIMPLE_CPY
2840
                                                "addl %%eax, %0                \n\t"
2841
                                                "addl %%ebx, %1                \n\t"
2842
                                                "decl temp0                \n\t"
2843
                                                "jnz 1b                        \n\t"
2844

    
2845
                                                "popl %1 \n\t"
2846
                                                "popl %0 \n\t"
2847
                                                : : "r" (src),
2848
                                                "r" (dst),
2849
                                                "r" (srcStride),
2850
                                                "r" (dstStride),
2851
                                                "m" (numLines>>2)
2852
                                                : "%eax", "%ebx"
2853
                                        );
2854
#else
2855
                                for(i=0; i<numLines; i++)
2856
                                        memcpy(        &(dst[dstStride*i]),
2857
                                                &(src[srcStride*i]), BLOCK_SIZE);
2858
#endif
2859
        }
2860
}
2861

    
2862

    
2863
/**
2864
 * Filters array of bytes (Y or U or V values)
2865
 */
2866
static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2867
        QP_STORE_T QPs[], int QPStride, int isColor, int mode)
2868
{
2869
        int x,y;
2870
        /* we need 64bit here otherwise we?ll going to have a problem
2871
           after watching a black picture for 5 hours*/
2872
        static uint64_t *yHistogram= NULL;
2873
        int black=0, white=255; // blackest black and whitest white in the picture
2874
        int QPCorrecture= 256;
2875

    
2876
        /* Temporary buffers for handling the last row(s) */
2877
        static uint8_t *tempDst= NULL;
2878
        static uint8_t *tempSrc= NULL;
2879

    
2880
        /* Temporary buffers for handling the last block */
2881
        static uint8_t *tempDstBlock= NULL;
2882
        static uint8_t *tempSrcBlock= NULL;
2883

    
2884
#ifdef PP_FUNNY_STRIDE
2885
        uint8_t *dstBlockPtrBackup;
2886
        uint8_t *srcBlockPtrBackup;
2887
#endif
2888

    
2889
#ifdef MORE_TIMING
2890
        long long T0, T1, diffTime=0;
2891
#endif
2892
#ifdef TIMING
2893
        long long memcpyTime=0, vertTime=0, horizTime=0, sumTime;
2894
        sumTime= rdtsc();
2895
#endif
2896

    
2897
        if(tempDst==NULL)
2898
        {
2899
                tempDst= (uint8_t*)memalign(8, 1024*24);
2900
                tempSrc= (uint8_t*)memalign(8, 1024*24);
2901
                tempDstBlock= (uint8_t*)memalign(8, 1024*24);
2902
                tempSrcBlock= (uint8_t*)memalign(8, 1024*24);
2903
        }
2904

    
2905
        if(!yHistogram)
2906
        {
2907
                int i;
2908
                yHistogram= (uint64_t*)malloc(8*256);
2909
                for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
2910

    
2911
                if(mode & FULL_Y_RANGE)
2912
                {
2913
                        maxAllowedY=255;
2914
                        minAllowedY=0;
2915
                }
2916
        }
2917

    
2918
        if(!isColor)
2919
        {
2920
                uint64_t sum= 0;
2921
                int i;
2922
                static int framenum= -1;
2923
                uint64_t maxClipped;
2924
                uint64_t clipped;
2925
                double scale;
2926

    
2927
                framenum++;
2928
                if(framenum == 1) yHistogram[0]= width*height/64*15/256;
2929

    
2930
                for(i=0; i<256; i++)
2931
                {
2932
                        sum+= yHistogram[i];
2933
//                        printf("%d ", yHistogram[i]);
2934
                }
2935
//                printf("\n\n");
2936

    
2937
                /* we allways get a completly black picture first */
2938
                maxClipped= (uint64_t)(sum * maxClippedThreshold);
2939

    
2940
                clipped= sum;
2941
                for(black=255; black>0; black--)
2942
                {
2943
                        if(clipped < maxClipped) break;
2944
                        clipped-= yHistogram[black];
2945
                }
2946

    
2947
                clipped= sum;
2948
                for(white=0; white<256; white++)
2949
                {
2950
                        if(clipped < maxClipped) break;
2951
                        clipped-= yHistogram[white];
2952
                }
2953

    
2954
                packedYOffset= (black - minAllowedY) & 0xFFFF;
2955
                packedYOffset|= packedYOffset<<32;
2956
                packedYOffset|= packedYOffset<<16;
2957

    
2958
                scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
2959

    
2960
                packedYScale= (uint16_t)(scale*1024.0 + 0.5);
2961
                packedYScale|= packedYScale<<32;
2962
                packedYScale|= packedYScale<<16;
2963
        }
2964
        else
2965
        {
2966
                packedYScale= 0x0100010001000100LL;
2967
                packedYOffset= 0;
2968
        }
2969

    
2970
        if(mode & LEVEL_FIX)        QPCorrecture= packedYScale &0xFFFF;
2971
        else                        QPCorrecture= 256;
2972

    
2973
        /* copy first row of 8x8 blocks */
2974
        for(x=0; x<width; x+=BLOCK_SIZE)
2975
                blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
2976

    
2977
        for(y=0; y<height; y+=BLOCK_SIZE)
2978
        {
2979
                //1% speedup if these are here instead of the inner loop
2980
                uint8_t *srcBlock= &(src[y*srcStride]);
2981
                uint8_t *dstBlock= &(dst[y*dstStride]);
2982
#ifdef ARCH_X86
2983
                int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
2984
                int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4);
2985
                int QPFrac= QPDelta;
2986
                uint8_t *tempBlock1= tempBlocks;
2987
                uint8_t *tempBlock2= tempBlocks + 8;
2988
#endif
2989
                /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
2990
                   if not than use a temporary buffer */
2991
                if(y+15 >= height)
2992
                {
2993
                        /* copy from line 5 to 12 of src, these will be copied with
2994
                           blockcopy to dst later */
2995
                        memcpy(tempSrc + srcStride*5, srcBlock + srcStride*5,
2996
                                srcStride*MAX(height-y-5, 0) );
2997

    
2998
                        /* duplicate last line to fill the void upto line 12 */
2999
                        if(y+12 >= height)
3000
                        {
3001
                                int i;
3002
                                for(i=height-y; i<=12; i++)
3003
                                        memcpy(tempSrc + srcStride*i,
3004
                                                src + srcStride*(height-1), srcStride);
3005
                        }
3006

    
3007

    
3008
                        /* copy up to 6 lines of dst */
3009
                        memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, 6) );
3010
                        dstBlock= tempDst + dstStride;
3011
                        srcBlock= tempSrc;
3012
                }
3013

    
3014
                // From this point on it is guranteed that we can read and write 16 lines downward
3015
                // finish 1 block before the next otherwise we?ll might have a problem
3016
                // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
3017
                for(x=0; x<width; x+=BLOCK_SIZE)
3018
                {
3019
                        const int stride= dstStride;
3020
                        uint8_t *tmpXchg;
3021
#ifdef ARCH_X86
3022
                        int QP= *QPptr;
3023
                        asm volatile(
3024
                                "addl %2, %1                \n\t"
3025
                                "sbbl %%eax, %%eax        \n\t"
3026
                                "shll $2, %%eax                \n\t"
3027
                                "subl %%eax, %0                \n\t"
3028
                                : "+r" (QPptr), "+m" (QPFrac)
3029
                                : "r" (QPDelta)
3030
                                : "%eax"
3031
                        );
3032
#else
3033
                        int QP= isColor ?
3034
                                QPs[(y>>3)*QPStride + (x>>3)]:
3035
                                QPs[(y>>4)*QPStride + (x>>4)];
3036
#endif
3037
                        if(!isColor)
3038
                        {
3039
                                QP= (QP* QPCorrecture)>>8;
3040
                                yHistogram[ srcBlock[srcStride*4 + 4] ]++;
3041
                        }
3042
#ifdef HAVE_MMX
3043
                        asm volatile(
3044
                                "movd %0, %%mm7                                        \n\t"
3045
                                "packuswb %%mm7, %%mm7                                \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
3046
                                "packuswb %%mm7, %%mm7                                \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
3047
                                "packuswb %%mm7, %%mm7                                \n\t" // QP,..., QP
3048
                                "movq %%mm7, pQPb                                \n\t"
3049
                                : : "r" (QP)
3050
                        );
3051
#endif
3052

    
3053
#ifdef MORE_TIMING
3054
                        T0= rdtsc();
3055
#endif
3056

    
3057
#ifdef HAVE_MMX2
3058
/*
3059
                        prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3060
                        prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3061
                        prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3062
                        prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3063
*/
3064
/*
3065
                        prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3066
                        prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3067
                        prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3068
                        prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3069
*/
3070

    
3071
                        asm(
3072
                                "movl %4, %%eax                        \n\t"
3073
                                "shrl $2, %%eax                        \n\t"
3074
                                "andl $6, %%eax                        \n\t"
3075
                                "addl $5, %%eax                        \n\t"
3076
                                "movl %%eax, %%ebx                \n\t"
3077
                                "imul %1, %%eax                        \n\t"
3078
                                "imul %3, %%ebx                        \n\t"
3079
                                "prefetchnta 32(%%eax, %0)        \n\t"
3080
                                "prefetcht0 32(%%ebx, %2)        \n\t"
3081
                                "addl %1, %%eax                        \n\t"
3082
                                "addl %3, %%ebx                        \n\t"
3083
                                "prefetchnta 32(%%eax, %0)        \n\t"
3084
                                "prefetcht0 32(%%ebx, %2)        \n\t"
3085
                        :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
3086
                        "m" (x)
3087
                        : "%eax", "%ebx"
3088
                        );
3089

    
3090
#elif defined(HAVE_3DNOW)
3091
//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
3092
/*                        prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3093
                        prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3094
                        prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3095
                        prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3096
*/
3097
#endif
3098

    
3099
#ifdef PP_FUNNY_STRIDE
3100
                        //can we mess with a 8x16 block, if not use a temp buffer, yes again
3101
                        if(x+7 >= width)
3102
                        {
3103
                                int i;
3104
                                dstBlockPtrBackup= dstBlock;
3105
                                srcBlockPtrBackup= srcBlock;
3106

    
3107
                                for(i=0;i<BLOCK_SIZE*2; i++)
3108
                                {
3109
                                        memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x);
3110
                                        memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x);
3111
                                }
3112

    
3113
                                dstBlock= tempDstBlock;
3114
                                srcBlock= tempSrcBlock;
3115
                        }
3116
#endif
3117

    
3118
                        blockCopy(dstBlock + dstStride*5, dstStride,
3119
                                srcBlock + srcStride*5, srcStride, 8, mode & LEVEL_FIX);
3120

    
3121
                        if(mode & LINEAR_IPOL_DEINT_FILTER)
3122
                                deInterlaceInterpolateLinear(dstBlock, dstStride);
3123
                        else if(mode & LINEAR_BLEND_DEINT_FILTER)
3124
                                deInterlaceBlendLinear(dstBlock, dstStride);
3125
                        else if(mode & MEDIAN_DEINT_FILTER)
3126
                                deInterlaceMedian(dstBlock, dstStride);
3127
                        else if(mode & CUBIC_IPOL_DEINT_FILTER)
3128
                                deInterlaceInterpolateCubic(dstBlock, dstStride);
3129
/*                        else if(mode & CUBIC_BLEND_DEINT_FILTER)
3130
                                deInterlaceBlendCubic(dstBlock, dstStride);
3131
*/
3132

    
3133
                        /* only deblock if we have 2 blocks */
3134
                        if(y + 8 < height)
3135
                        {
3136
#ifdef MORE_TIMING
3137
                                T1= rdtsc();
3138
                                memcpyTime+= T1-T0;
3139
                                T0=T1;
3140
#endif
3141
                                if(mode & V_RK1_FILTER)
3142
                                        vertRK1Filter(dstBlock, stride, QP);
3143
                                else if(mode & V_X1_FILTER)
3144
                                        vertX1Filter(dstBlock, stride, QP);
3145
                                else if(mode & V_DEBLOCK)
3146
                                {
3147
                                        if( isVertDC(dstBlock, stride))
3148
                                        {
3149
                                                if(isVertMinMaxOk(dstBlock, stride, QP))
3150
                                                        doVertLowPass(dstBlock, stride, QP);
3151
                                        }
3152
                                        else
3153
                                                doVertDefFilter(dstBlock, stride, QP);
3154
                                }
3155
#ifdef MORE_TIMING
3156
                                T1= rdtsc();
3157
                                vertTime+= T1-T0;
3158
                                T0=T1;
3159
#endif
3160
                        }
3161

    
3162
#ifdef HAVE_MMX
3163
                        transpose1(tempBlock1, tempBlock2, dstBlock, dstStride);
3164
#endif
3165
                        /* check if we have a previous block to deblock it with dstBlock */
3166
                        if(x - 8 >= 0)
3167
                        {
3168
#ifdef MORE_TIMING
3169
                                T0= rdtsc();
3170
#endif
3171
#ifdef HAVE_MMX
3172
                                if(mode & H_RK1_FILTER)
3173
                                        vertRK1Filter(tempBlock1, 16, QP);
3174
                                else if(mode & H_X1_FILTER)
3175
                                        vertX1Filter(tempBlock1, 16, QP);
3176
                                else if(mode & H_DEBLOCK)
3177
                                {
3178
                                        if( isVertDC(tempBlock1, 16))
3179
                                        {
3180
                                                if(isVertMinMaxOk(tempBlock1, 16, QP))
3181
                                                        doVertLowPass(tempBlock1, 16, QP);
3182
                                        }
3183
                                        else
3184
                                                doVertDefFilter(tempBlock1, 16, QP);
3185
                                }
3186

    
3187
                                transpose2(dstBlock-4, dstStride, tempBlock1 + 4*16);
3188

    
3189
#else
3190
                                if(mode & H_X1_FILTER)
3191
                                        horizX1Filter(dstBlock-4, stride, QP);
3192
                                else if(mode & H_DEBLOCK)
3193
                                {
3194
                                        if( isHorizDC(dstBlock-4, stride))
3195
                                        {
3196
                                                if(isHorizMinMaxOk(dstBlock-4, stride, QP))
3197
                                                        doHorizLowPass(dstBlock-4, stride, QP);
3198
                                        }
3199
                                        else
3200
                                                doHorizDefFilter(dstBlock-4, stride, QP);
3201
                                }
3202
#endif
3203
#ifdef MORE_TIMING
3204
                                T1= rdtsc();
3205
                                horizTime+= T1-T0;
3206
                                T0=T1;
3207
#endif
3208
                                if(mode & DERING)
3209
                                {
3210
                                //FIXME filter first line
3211
                                        if(y>0) dering(dstBlock - stride - 8, stride, QP);
3212
                                }
3213
                        }
3214
                        else if(mode & DERING)
3215
                        {
3216
                         //FIXME y+15 is required cuz of the tempBuffer thing -> bottom right block isnt filtered
3217
                                        if(y > 8 && y+15 < height) dering(dstBlock - stride*9 + width - 8, stride, QP);
3218
                        }
3219

    
3220

    
3221
#ifdef PP_FUNNY_STRIDE
3222
                        /* did we use a tmp-block buffer */
3223
                        if(x+7 >= width)
3224
                        {
3225
                                int i;
3226
                                dstBlock= dstBlockPtrBackup;
3227
                                srcBlock= srcBlockPtrBackup;
3228

    
3229
                                for(i=0;i<BLOCK_SIZE*2; i++)
3230
                                {
3231
                                        memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x);
3232
                                }
3233
                        }
3234
#endif
3235

    
3236
                        dstBlock+=8;
3237
                        srcBlock+=8;
3238

    
3239
#ifdef HAVE_MMX
3240
                        tmpXchg= tempBlock1;
3241
                        tempBlock1= tempBlock2;
3242
                        tempBlock2 = tmpXchg;
3243
#endif
3244
                }
3245

    
3246
                /* did we use a tmp buffer */
3247
                if(y+15 >= height)
3248
                {
3249
                        uint8_t *dstBlock= &(dst[y*dstStride]);
3250
                        memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
3251
                }
3252
        }
3253
#ifdef HAVE_3DNOW
3254
        asm volatile("femms");
3255
#elif defined (HAVE_MMX)
3256
        asm volatile("emms");
3257
#endif
3258

    
3259
#ifdef TIMING
3260
        // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
3261
        sumTime= rdtsc() - sumTime;
3262
        if(!isColor)
3263
                printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d    \r",
3264
                        (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
3265
                        (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
3266
                        , black, white);
3267
#endif
3268
}