Statistics
| Branch: | Revision:

ffmpeg / postproc / postprocess_template.c @ 3fe8e8f0

History | View | Annotate | Download (101 KB)

1
/*
2
    Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
3

4
    This program is free software; you can redistribute it and/or modify
5
    it under the terms of the GNU General Public License as published by
6
    the Free Software Foundation; either version 2 of the License, or
7
    (at your option) any later version.
8

9
    This program is distributed in the hope that it will be useful,
10
    but WITHOUT ANY WARRANTY; without even the implied warranty of
11
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
    GNU General Public License for more details.
13

14
    You should have received a copy of the GNU General Public License
15
    along with this program; if not, write to the Free Software
16
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17
*/
18

    
19
/*
20
                        C        MMX        MMX2        3DNow
21
isVertDC                Ec        Ec
22
isVertMinMaxOk                Ec        Ec
23
doVertLowPass                E                e        e
24
doVertDefFilter                Ec        Ec        Ec
25
isHorizDC                Ec        Ec
26
isHorizMinMaxOk                a        E
27
doHorizLowPass                E                e        e
28
doHorizDefFilter        Ec        Ec        Ec
29
deRing                        E                e        e*
30
Vertical RKAlgo1        E                a        a
31
Horizontal RKAlgo1                        a        a
32
Vertical X1                a                E        E
33
Horizontal X1                a                E        E
34
LinIpolDeinterlace        e                E        E*
35
CubicIpolDeinterlace        a                e        e*
36
LinBlendDeinterlace        e                E        E*
37
MedianDeinterlace                 Ec        Ec
38

39

40
* I don't have a 3DNow! CPU, so these versions are untested
41
E = Exact implementation
42
e = almost exact implementation (slightly different rounding, ...)
43
a = alternative / approximate impl
44
c = checked against the other implementations (-vo md5)
45
*/
46

    
47
/*
48
TODO:
49
verify that everything works as it should (how?)
50
reduce the time wasted on the mem transfer
51
implement dering
52
implement everything in C at least (done at the moment but ...)
53
unroll stuff if instructions depend too much on the prior one
54
we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
55
move YScale thing to the end instead of fixing QP
56
write a faster and higher quality deblocking filter :)
57
do something about the speed of the horizontal filters
58
make the mainloop more flexible (variable number of blocks at once
59
        (the if/else stuff per block is slowing things down)
60
compare the quality & speed of all filters
61
split this huge file
62
fix warnings (unused vars, ...)
63
noise reduction filters
64
border remover
65
optimize c versions
66
...
67

68
Notes:
69
*/
70

    
71
//Changelog: use the CVS log
72

    
73
#include <inttypes.h>
74
#include <stdio.h>
75
#include <stdlib.h>
76
#include <string.h>
77
#include "../config.h"
78
#ifdef HAVE_MALLOC_H
79
#include <malloc.h>
80
#endif
81
//#undef HAVE_MMX2
82
//#define HAVE_3DNOW
83
//#undef HAVE_MMX
84
#include "postprocess.h"
85

    
86
/*
 * Small arithmetic helpers.
 * These are function-like macros: an argument may be evaluated more than
 * once, so never pass an expression with side effects (i++, f(), ...).
 */
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#define MAX(a,b) ((a) < (b) ? (b) : (a))
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
/* NOTE: SIGN(0) evaluates to -1, not 0 */
#define SIGN(a) ((a) > 0 ? 1 : -1)

/*
 * PAVGB(a,b): emits the packed byte average instruction of the available
 * instruction set (pavgb for MMX2, pavgusb for 3DNow!); b is the destination.
 * Not defined when neither HAVE_MMX2 nor HAVE_3DNOW is set.
 */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/*
 * PMINUB(x,y,t): unsigned byte minimum, result is left in the SECOND operand.
 * MMX2 has a native instruction and ignores t.
 * The plain MMX fallback names its parameters (b,a,t): it copies a to the
 * scratch register t, computes the saturated difference a-b there and
 * subtracts that from a, which leaves min(a,b) in a. t is clobbered.
 */
#ifdef HAVE_MMX2
#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
#define PMINUB(b,a,t) \
        "movq " #a ", " #t " \n\t"\
        "psubusb " #b ", " #t " \n\t"\
        "psubb " #t ", " #a " \n\t"
#endif

/*
 * PMAXUB(a,b): unsigned byte maximum, result is left in b.
 * The plain MMX fallback computes the saturated difference b-a and adds a back.
 */
#ifdef HAVE_MMX2
#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
#define PMAXUB(a,b) \
        "psubusb " #a ", " #b " \n\t"\
        "paddb " #a ", " #b " \n\t"
#endif

/* Buffer / array sizes; their users are outside this part of the file */
#define GET_MODE_BUFFER_SIZE 500
#define OPTIONS_ARRAY_SIZE 10
117

    
118
/*
 * File-scope constants and scratch storage.
 * In the MMX build the inline assembly refers to these objects by symbol name
 * (e.g. "movq b7E, %%mm7"), so they must keep their names and must stay
 * real objects in memory; they are 8-byte aligned 64 bit quantities.
 */
#ifdef HAVE_MMX
static volatile uint64_t __attribute__((aligned(8))) packedYOffset=        0x0000000000000000LL;
static volatile uint64_t __attribute__((aligned(8))) packedYScale=        0x0100010001000100LL;
// 16 bit word constants (the same value in each of the 4 words)
static uint64_t __attribute__((aligned(8))) w05=                0x0005000500050005LL;
static uint64_t __attribute__((aligned(8))) w20=                0x0020002000200020LL;
static uint64_t __attribute__((aligned(8))) w1400=                0x1400140014001400LL;
// byte masks: the name spells out which of the 8 bytes are 0xFF (leftmost digit = most significant byte)
static uint64_t __attribute__((aligned(8))) bm00000001=                0x00000000000000FFLL;
static uint64_t __attribute__((aligned(8))) bm00010000=                0x000000FF00000000LL;
static uint64_t __attribute__((aligned(8))) bm00001000=                0x00000000FF000000LL;
static uint64_t __attribute__((aligned(8))) bm10000000=                0xFF00000000000000LL;
static uint64_t __attribute__((aligned(8))) bm10000001=                0xFF000000000000FFLL;
static uint64_t __attribute__((aligned(8))) bm11000011=                0xFFFF00000000FFFFLL;
static uint64_t __attribute__((aligned(8))) bm00000011=                0x000000000000FFFFLL;
static uint64_t __attribute__((aligned(8))) bm11111110=                0xFFFFFFFFFFFFFF00LL;
static uint64_t __attribute__((aligned(8))) bm11000000=                0xFFFF000000000000LL;
static uint64_t __attribute__((aligned(8))) bm00011000=                0x000000FFFF000000LL;
static uint64_t __attribute__((aligned(8))) bm00110011=                0x0000FFFF0000FFFFLL;
static uint64_t __attribute__((aligned(8))) bm11001100=                0xFFFF0000FFFF0000LL;
// byte constants (the same value in each of the 8 bytes)
static uint64_t __attribute__((aligned(8))) b00=                 0x0000000000000000LL;
static uint64_t __attribute__((aligned(8))) b01=                 0x0101010101010101LL;
static uint64_t __attribute__((aligned(8))) b02=                 0x0202020202020202LL;
static uint64_t __attribute__((aligned(8))) b0F=                 0x0F0F0F0F0F0F0F0FLL;
static uint64_t __attribute__((aligned(8))) b04=                 0x0404040404040404LL;
static uint64_t __attribute__((aligned(8))) b08=                 0x0808080808080808LL;
static uint64_t __attribute__((aligned(8))) bFF=                 0xFFFFFFFFFFFFFFFFLL;
static uint64_t __attribute__((aligned(8))) b20=                 0x2020202020202020LL;
static uint64_t __attribute__((aligned(8))) b80=                 0x8080808080808080LL;
static uint64_t __attribute__((aligned(8))) b7E=                 0x7E7E7E7E7E7E7E7ELL;
static uint64_t __attribute__((aligned(8))) b7C=                 0x7C7C7C7C7C7C7C7CLL;
static uint64_t __attribute__((aligned(8))) b3F=                 0x3F3F3F3F3F3F3F3FLL;
// scratch qwords for the asm code
static uint64_t __attribute__((aligned(8))) temp0=0;
static uint64_t __attribute__((aligned(8))) temp1=0;
static uint64_t __attribute__((aligned(8))) temp2=0;
static uint64_t __attribute__((aligned(8))) temp3=0;
static uint64_t __attribute__((aligned(8))) temp4=0;
static uint64_t __attribute__((aligned(8))) temp5=0;
// the asm filters read pQPb as "QP in every byte"; it is set outside this part of the file
static uint64_t __attribute__((aligned(8))) pQPb=0;
static uint64_t __attribute__((aligned(8))) pQPb2=0;
static uint8_t __attribute__((aligned(8))) tempBlocks[8*16*2]; //used for the horizontal code
#else
static uint64_t packedYOffset=        0x0000000000000000LL;
static uint64_t packedYScale=        0x0100010001000100LL;
static uint8_t tempBlocks[8*16*2]; //used for the horizontal code
#endif
162

    
163
// A block counts as flat when more than this many neighbouring pixel pairs are (nearly) equal.
// isVertDC() compares its count against vFlatnessThreshold.
int hFlatnessThreshold= 56 - 16;
int vFlatnessThreshold= 56 - 16;

// amount of "black" you are willing to lose to get a brightness corrected picture
double maxClippedThreshold= 0.01;

// luminance limits; presumably used by the automatic level correction -- their users are not in this part of the file
int maxAllowedY=234;
int minAllowedY=16;
171

    
172
static struct PPFilter filters[]=
173
{
174
        {"hb", "hdeblock",                 1, 1, 3, H_DEBLOCK},
175
        {"vb", "vdeblock",                 1, 2, 4, V_DEBLOCK},
176
        {"vr", "rkvdeblock",                 1, 2, 4, H_RK1_FILTER},
177
        {"h1", "x1hdeblock",                 1, 1, 3, H_X1_FILTER},
178
        {"v1", "x1vdeblock",                 1, 2, 4, V_X1_FILTER},
179
        {"dr", "dering",                 1, 5, 6, DERING},
180
        {"al", "autolevels",                 0, 1, 2, LEVEL_FIX},
181
        {"lb", "linblenddeint",         0, 1, 6, LINEAR_BLEND_DEINT_FILTER},
182
        {"li", "linipoldeint",                 0, 1, 6, LINEAR_IPOL_DEINT_FILTER},
183
        {"ci", "cubicipoldeint",        0, 1, 6, CUBIC_IPOL_DEINT_FILTER},
184
        {"md", "mediandeint",                 0, 1, 6, MEDIAN_DEINT_FILTER},
185
        {NULL, NULL,0,0,0,0} //End Marker
186
};
187

    
188
/*
 * Alias table stored as consecutive (name, expansion) string pairs.
 * The list is terminated by a single NULL in the position of a name, so a
 * reader must test the name slot before touching the expansion slot.
 */
static char *replaceTable[]=
{
        "default",         "hdeblock:a,vdeblock:a,dering:a,autolevels",
        "de",                 "hdeblock:a,vdeblock:a,dering:a,autolevels",
        "fast",         "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels",
        "fa",                 "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels",
        NULL //End Marker
};
196

    
197
#ifdef HAVE_MMX
/**
 * Never meant to do useful work: it mentions the static constants in a C
 * expression. They are otherwise only named inside asm strings, which the
 * compiler does not see as a use, so it would warn about unused variables.
 * The only possible side effect is storing 0 to b00 when the whole sum is 0.
 */
static inline void unusedVariableWarningFixer()
{
if(
 packedYOffset + packedYScale + w05 + w20 + w1400 + bm00000001 + bm00010000
 + bm00001000 + bm10000000 + bm10000001 + bm11000011 + bm00000011 + bm11111110
 + bm11000000 + bm00011000 + bm00110011 + bm11001100 + b00 + b01 + b02 + b0F
 + bFF + b20 + b04+ b08 + pQPb2 + b80 + b7E + b7C + b3F + temp0 + temp1 + temp2 + temp3 + temp4
 + temp5 + pQPb== 0) b00=0;
}
#endif
208

    
209
#ifdef TIMING
/**
 * Read the CPU time stamp counter (profiling builds only).
 * The "=A" constraint stands for the edx:eax register pair on 32 bit x86,
 * which is where rdtsc leaves its 64 bit result.
 * NOTE(review): on x86-64 "=A" does not describe that pair -- confirm before
 * enabling TIMING on a 64 bit target.
 * @return the current counter value
 */
static inline long long rdtsc()
{
        long long l;
        asm volatile(        "rdtsc\n\t"
                : "=A" (l)
        );
        return l;
}
#endif
220

    
221
#ifdef HAVE_MMX2
/*
 * Thin wrappers around the four prefetch instructions; each issues one
 * prefetch for the address p. They are cache hints only and neither read nor
 * write *p as far as the C program is concerned.
 */

/** Issue a prefetchnta for address p */
static inline void prefetchnta(void *p)
{
        asm volatile(        "prefetchnta (%0)\n\t"
                : : "r" (p)
        );
}

/** Issue a prefetcht0 for address p */
static inline void prefetcht0(void *p)
{
        asm volatile(        "prefetcht0 (%0)\n\t"
                : : "r" (p)
        );
}

/** Issue a prefetcht1 for address p */
static inline void prefetcht1(void *p)
{
        asm volatile(        "prefetcht1 (%0)\n\t"
                : : "r" (p)
        );
}

/** Issue a prefetcht2 for address p */
static inline void prefetcht2(void *p)
{
        asm volatile(        "prefetcht2 (%0)\n\t"
                : : "r" (p)
        );
}
#endif
250

    
251
//FIXME? |255-0| = 1 (shouldnt be a problem ...)
252
/**
253
 * Check if the middle 8x8 Block in the given 8x16 block is flat
254
 */
255
static inline int isVertDC(uint8_t src[], int stride){
256
        int numEq= 0;
257
#ifndef HAVE_MMX
258
        int y;
259
#endif
260
        src+= stride*4; // src points to begin of the 8x8 Block
261
#ifdef HAVE_MMX
262
asm volatile(
263
                "leal (%1, %2), %%eax                                \n\t"
264
                "leal (%%eax, %2, 4), %%ebx                        \n\t"
265
//        0        1        2        3        4        5        6        7        8        9
266
//        %1        eax        eax+%2        eax+2%2        %1+4%2        ebx        ebx+%2        ebx+2%2        %1+8%2        ebx+4%2
267
                "movq b7E, %%mm7                                        \n\t" // mm7 = 0x7F
268
                "movq b7C, %%mm6                                        \n\t" // mm6 = 0x7D
269
                "movq (%1), %%mm0                                \n\t"
270
                "movq (%%eax), %%mm1                                \n\t"
271
                "psubb %%mm1, %%mm0                                \n\t" // mm0 = differnece
272
                "paddb %%mm7, %%mm0                                \n\t"
273
                "pcmpgtb %%mm6, %%mm0                                \n\t"
274

    
275
                "movq (%%eax,%2), %%mm2                                \n\t"
276
                "psubb %%mm2, %%mm1                                \n\t"
277
                "paddb %%mm7, %%mm1                                \n\t"
278
                "pcmpgtb %%mm6, %%mm1                                \n\t"
279
                "paddb %%mm1, %%mm0                                \n\t"
280

    
281
                "movq (%%eax, %2, 2), %%mm1                        \n\t"
282
                "psubb %%mm1, %%mm2                                \n\t"
283
                "paddb %%mm7, %%mm2                                \n\t"
284
                "pcmpgtb %%mm6, %%mm2                                \n\t"
285
                "paddb %%mm2, %%mm0                                \n\t"
286

    
287
                "movq (%1, %2, 4), %%mm2                        \n\t"
288
                "psubb %%mm2, %%mm1                                \n\t"
289
                "paddb %%mm7, %%mm1                                \n\t"
290
                "pcmpgtb %%mm6, %%mm1                                \n\t"
291
                "paddb %%mm1, %%mm0                                \n\t"
292

    
293
                "movq (%%ebx), %%mm1                                \n\t"
294
                "psubb %%mm1, %%mm2                                \n\t"
295
                "paddb %%mm7, %%mm2                                \n\t"
296
                "pcmpgtb %%mm6, %%mm2                                \n\t"
297
                "paddb %%mm2, %%mm0                                \n\t"
298

    
299
                "movq (%%ebx, %2), %%mm2                        \n\t"
300
                "psubb %%mm2, %%mm1                                \n\t"
301
                "paddb %%mm7, %%mm1                                \n\t"
302
                "pcmpgtb %%mm6, %%mm1                                \n\t"
303
                "paddb %%mm1, %%mm0                                \n\t"
304

    
305
                "movq (%%ebx, %2, 2), %%mm1                        \n\t"
306
                "psubb %%mm1, %%mm2                                \n\t"
307
                "paddb %%mm7, %%mm2                                \n\t"
308
                "pcmpgtb %%mm6, %%mm2                                \n\t"
309
                "paddb %%mm2, %%mm0                                \n\t"
310

    
311
                "                                                \n\t"
312
                "movq %%mm0, %%mm1                                \n\t"
313
                "psrlw $8, %%mm0                                \n\t"
314
                "paddb %%mm1, %%mm0                                \n\t"
315
#ifdef HAVE_MMX2
316
                "pshufw $0xF9, %%mm0, %%mm1                        \n\t"
317
                "paddb %%mm1, %%mm0                                \n\t"
318
                "pshufw $0xFE, %%mm0, %%mm1                        \n\t"
319
#else
320
                "movq %%mm0, %%mm1                                \n\t"
321
                "psrlq $16, %%mm0                                \n\t"
322
                "paddb %%mm1, %%mm0                                \n\t"
323
                "movq %%mm0, %%mm1                                \n\t"
324
                "psrlq $32, %%mm0                                \n\t"
325
#endif
326
                "paddb %%mm1, %%mm0                                \n\t"
327
                "movd %%mm0, %0                                        \n\t"
328
                : "=r" (numEq)
329
                : "r" (src), "r" (stride)
330
                : "%eax", "%ebx"
331
                );
332

    
333
        numEq= (256 - numEq) &0xFF;
334

    
335
#else
336
        for(y=0; y<BLOCK_SIZE-1; y++)
337
        {
338
                if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
339
                if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
340
                if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
341
                if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
342
                if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
343
                if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
344
                if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
345
                if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
346
                src+= stride;
347
        }
348
#endif
349
/*        if(abs(numEq - asmEq) > 0)
350
        {
351
                printf("\nasm:%d  c:%d\n", asmEq, numEq);
352
                for(int y=0; y<8; y++)
353
                {
354
                        for(int x=0; x<8; x++)
355
                        {
356
                                printf("%d ", temp[x + y*stride]);
357
                        }
358
                        printf("\n");
359
                }
360
        }
361
*/
362
//        for(int i=0; i<numEq/8; i++) src[i]=255;
363
        return (numEq > vFlatnessThreshold) ? 1 : 0;
364
}
365

    
366
static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
367
{
368
#ifdef HAVE_MMX
369
        int isOk;
370
        src+= stride*3;
371
        asm volatile(
372
//                "int $3 \n\t"
373
                "movq (%1, %2), %%mm0                                \n\t"
374
                "movq (%1, %2, 8), %%mm1                        \n\t"
375
                "movq %%mm0, %%mm2                                \n\t"
376
                "psubusb %%mm1, %%mm0                                \n\t"
377
                "psubusb %%mm2, %%mm1                                \n\t"
378
                "por %%mm1, %%mm0                                \n\t" // ABS Diff
379

    
380
                "movq pQPb, %%mm7                                \n\t" // QP,..., QP
381
                "paddusb %%mm7, %%mm7                                \n\t" // 2QP ... 2QP
382
                "psubusb %%mm7, %%mm0                                \n\t" // Diff <= 2QP -> 0
383
                "pcmpeqd b00, %%mm0                                \n\t"
384
                "psrlq $16, %%mm0                                \n\t"
385
                "pcmpeqd bFF, %%mm0                                \n\t"
386
//                "movd %%mm0, (%1, %2, 4)\n\t"
387
                "movd %%mm0, %0                                        \n\t"
388
                : "=r" (isOk)
389
                : "r" (src), "r" (stride)
390
                );
391
        return isOk;
392
#else
393

    
394
        int isOk2= 1;
395
        int x;
396
        src+= stride*3;
397
        for(x=0; x<BLOCK_SIZE; x++)
398
        {
399
                if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
400
        }
401
/*        if(isOk && !isOk2 || !isOk && isOk2)
402
        {
403
                printf("\nasm:%d  c:%d QP:%d\n", isOk, isOk2, QP);
404
                for(int y=0; y<9; y++)
405
                {
406
                        for(int x=0; x<8; x++)
407
                        {
408
                                printf("%d ", src[x + y*stride]);
409
                        }
410
                        printf("\n");
411
                }
412
        } */
413

    
414
        return isOk2;
415
#endif
416

    
417
}
418

    
419
/**
420
 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
421
 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
422
 */
423
static inline void doVertLowPass(uint8_t *src, int stride, int QP)
424
{
425
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
426
        src+= stride*3;
427
        asm volatile(        //"movv %0 %1 %2\n\t"
428
                "movq pQPb, %%mm0                                \n\t"  // QP,..., QP
429

    
430
                "movq (%0), %%mm6                                \n\t"
431
                "movq (%0, %1), %%mm5                                \n\t"
432
                "movq %%mm5, %%mm1                                \n\t"
433
                "movq %%mm6, %%mm2                                \n\t"
434
                "psubusb %%mm6, %%mm5                                \n\t"
435
                "psubusb %%mm1, %%mm2                                \n\t"
436
                "por %%mm5, %%mm2                                \n\t" // ABS Diff of lines
437
                "psubusb %%mm0, %%mm2                                \n\t" // diff <= QP -> 0
438
                "pcmpeqb b00, %%mm2                                \n\t" // diff <= QP -> FF
439

    
440
                "pand %%mm2, %%mm6                                \n\t"
441
                "pandn %%mm1, %%mm2                                \n\t"
442
                "por %%mm2, %%mm6                                \n\t"// First Line to Filter
443

    
444
                "movq (%0, %1, 8), %%mm5                        \n\t"
445
                "leal (%0, %1, 4), %%eax                        \n\t"
446
                "leal (%0, %1, 8), %%ebx                        \n\t"
447
                "subl %1, %%ebx                                        \n\t"
448
                "addl %1, %0                                        \n\t" // %0 points to line 1 not 0
449
                "movq (%0, %1, 8), %%mm7                        \n\t"
450
                "movq %%mm5, %%mm1                                \n\t"
451
                "movq %%mm7, %%mm2                                \n\t"
452
                "psubusb %%mm7, %%mm5                                \n\t"
453
                "psubusb %%mm1, %%mm2                                \n\t"
454
                "por %%mm5, %%mm2                                \n\t" // ABS Diff of lines
455
                "psubusb %%mm0, %%mm2                                \n\t" // diff <= QP -> 0
456
                "pcmpeqb b00, %%mm2                                \n\t" // diff <= QP -> FF
457

    
458
                "pand %%mm2, %%mm7                                \n\t"
459
                "pandn %%mm1, %%mm2                                \n\t"
460
                "por %%mm2, %%mm7                                \n\t" // First Line to Filter
461

    
462

    
463
                //         1        2        3        4        5        6        7        8
464
                //        %0        %0+%1        %0+2%1        eax        %0+4%1        eax+2%1        ebx        eax+4%1
465
                // 6 4 2 2 1 1
466
                // 6 4 4 2
467
                // 6 8 2
468

    
469
                "movq (%0, %1), %%mm0                                \n\t" //  1
470
                "movq %%mm0, %%mm1                                \n\t" //  1
471
                PAVGB(%%mm6, %%mm0)                                      //1 1        /2
472
                PAVGB(%%mm6, %%mm0)                                      //3 1        /4
473

    
474
                "movq (%0, %1, 4), %%mm2                        \n\t" //     1
475
                "movq %%mm2, %%mm5                                \n\t" //     1
476
                PAVGB((%%eax), %%mm2)                                      //    11        /2
477
                PAVGB((%0, %1, 2), %%mm2)                              //   211        /4
478
                "movq %%mm2, %%mm3                                \n\t" //   211        /4
479
                "movq (%0), %%mm4                                \n\t" // 1
480
                PAVGB(%%mm4, %%mm3)                                      // 4 211        /8
481
                PAVGB(%%mm0, %%mm3)                                      //642211        /16
482
                "movq %%mm3, (%0)                                \n\t" // X
483
                // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
484
                "movq %%mm1, %%mm0                                \n\t" //  1
485
                PAVGB(%%mm6, %%mm0)                                      //1 1        /2
486
                "movq %%mm4, %%mm3                                \n\t" // 1
487
                PAVGB((%0,%1,2), %%mm3)                                      // 1 1        /2
488
                PAVGB((%%eax,%1,2), %%mm5)                              //     11        /2
489
                PAVGB((%%eax), %%mm5)                                      //    211 /4
490
                PAVGB(%%mm5, %%mm3)                                      // 2 2211 /8
491
                PAVGB(%%mm0, %%mm3)                                      //4242211 /16
492
                "movq %%mm3, (%0,%1)                                \n\t" //  X
493
                // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
494
                PAVGB(%%mm4, %%mm6)                                      //11        /2
495
                "movq (%%ebx), %%mm0                                \n\t" //       1
496
                PAVGB((%%eax, %1, 2), %%mm0)                              //      11/2
497
                "movq %%mm0, %%mm3                                \n\t" //      11/2
498
                PAVGB(%%mm1, %%mm0)                                      //  2   11/4
499
                PAVGB(%%mm6, %%mm0)                                      //222   11/8
500
                PAVGB(%%mm2, %%mm0)                                      //22242211/16
501
                "movq (%0, %1, 2), %%mm2                        \n\t" //   1
502
                "movq %%mm0, (%0, %1, 2)                        \n\t" //   X
503
                // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
504
                "movq (%%eax, %1, 4), %%mm0                        \n\t" //        1
505
                PAVGB((%%ebx), %%mm0)                                      //       11        /2
506
                PAVGB(%%mm0, %%mm6)                                      //11     11        /4
507
                PAVGB(%%mm1, %%mm4)                                      // 11                /2
508
                PAVGB(%%mm2, %%mm1)                                      //  11                /2
509
                PAVGB(%%mm1, %%mm6)                                      //1122   11        /8
510
                PAVGB(%%mm5, %%mm6)                                      //112242211        /16
511
                "movq (%%eax), %%mm5                                \n\t" //    1
512
                "movq %%mm6, (%%eax)                                \n\t" //    X
513
                // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
514
                "movq (%%eax, %1, 4), %%mm6                        \n\t" //        1
515
                PAVGB(%%mm7, %%mm6)                                      //        11        /2
516
                PAVGB(%%mm4, %%mm6)                                      // 11     11        /4
517
                PAVGB(%%mm3, %%mm6)                                      // 11   2211        /8
518
                PAVGB(%%mm5, %%mm2)                                      //   11                /2
519
                "movq (%0, %1, 4), %%mm4                        \n\t" //     1
520
                PAVGB(%%mm4, %%mm2)                                      //   112                /4
521
                PAVGB(%%mm2, %%mm6)                                      // 112242211        /16
522
                "movq %%mm6, (%0, %1, 4)                        \n\t" //     X
523
                // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
524
                PAVGB(%%mm7, %%mm1)                                      //  11     2        /4
525
                PAVGB(%%mm4, %%mm5)                                      //    11                /2
526
                PAVGB(%%mm5, %%mm0)                                      //    11 11        /4
527
                "movq (%%eax, %1, 2), %%mm6                        \n\t" //      1
528
                PAVGB(%%mm6, %%mm1)                                      //  11  4  2        /8
529
                PAVGB(%%mm0, %%mm1)                                      //  11224222        /16
530
                "movq %%mm1, (%%eax, %1, 2)                        \n\t" //      X
531
                // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
532
                PAVGB((%%ebx), %%mm2)                                      //   112 4        /8
533
                "movq (%%eax, %1, 4), %%mm0                        \n\t" //        1
534
                PAVGB(%%mm0, %%mm6)                                      //      1 1        /2
535
                PAVGB(%%mm7, %%mm6)                                      //      1 12        /4
536
                PAVGB(%%mm2, %%mm6)                                      //   1122424        /4
537
                "movq %%mm6, (%%ebx)                                \n\t" //       X
538
                // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
539
                PAVGB(%%mm7, %%mm5)                                      //    11   2        /4
540
                PAVGB(%%mm7, %%mm5)                                      //    11   6        /8
541

    
542
                PAVGB(%%mm3, %%mm0)                                      //      112        /4
543
                PAVGB(%%mm0, %%mm5)                                      //    112246        /16
544
                "movq %%mm5, (%%eax, %1, 4)                        \n\t" //        X
545
                "subl %1, %0                                        \n\t"
546

    
547
                :
548
                : "r" (src), "r" (stride)
549
                : "%eax", "%ebx"
550
        );
551
#else
552
        const int l1= stride;
553
        const int l2= stride + l1;
554
        const int l3= stride + l2;
555
        const int l4= stride + l3;
556
        const int l5= stride + l4;
557
        const int l6= stride + l5;
558
        const int l7= stride + l6;
559
        const int l8= stride + l7;
560
        const int l9= stride + l8;
561
        int x;
562
        src+= stride*3;
563
        for(x=0; x<BLOCK_SIZE; x++)
564
        {
565
                const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
566
                const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
567

    
568
                int sums[9];
569
                sums[0] = first + src[l1];
570
                sums[1] = src[l1] + src[l2];
571
                sums[2] = src[l2] + src[l3];
572
                sums[3] = src[l3] + src[l4];
573
                sums[4] = src[l4] + src[l5];
574
                sums[5] = src[l5] + src[l6];
575
                sums[6] = src[l6] + src[l7];
576
                sums[7] = src[l7] + src[l8];
577
                sums[8] = src[l8] + last;
578

    
579
                src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
580
                src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
581
                src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
582
                src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
583
                src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
584
                src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
585
                src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
586
                src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
587

    
588
                src++;
589
        }
590

    
591
#endif
592
}
593

    
594
/**
595
 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
596
 * values are correctly clipped (MMX2)
597
 * values are wraparound (C)
598
 * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient
599
        0 8 16 24
600
        x = 8
601
        x/2 = 4
602
        x/8 = 1
603
        1 12 12 23
604
 */
605
static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
606
{
607
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
608
        src+= stride*3;
609
// FIXME rounding
610
        asm volatile(
611
                "pxor %%mm7, %%mm7                                \n\t" // 0
612
                "movq b80, %%mm6                                \n\t" // MIN_SIGNED_BYTE
613
                "leal (%0, %1), %%eax                                \n\t"
614
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
615
//        0        1        2        3        4        5        6        7        8        9
616
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
617
                "movq pQPb, %%mm0                                \n\t" // QP,..., QP
618
                "movq %%mm0, %%mm1                                \n\t" // QP,..., QP
619
                "paddusb b02, %%mm0                                \n\t"
620
                "psrlw $2, %%mm0                                \n\t"
621
                "pand b3F, %%mm0                                \n\t" // QP/4,..., QP/4
622
                "paddusb %%mm1, %%mm0                                \n\t" // QP*1.25 ...
623
                "movq (%0, %1, 4), %%mm2                        \n\t" // line 4
624
                "movq (%%ebx), %%mm3                                \n\t" // line 5
625
                "movq %%mm2, %%mm4                                \n\t" // line 4
626
                "pcmpeqb %%mm5, %%mm5                                \n\t" // -1
627
                "pxor %%mm2, %%mm5                                \n\t" // -line 4 - 1
628
                PAVGB(%%mm3, %%mm5)
629
                "paddb %%mm6, %%mm5                                \n\t" // (l5-l4)/2
630
                "psubusb %%mm3, %%mm4                                \n\t"
631
                "psubusb %%mm2, %%mm3                                \n\t"
632
                "por %%mm3, %%mm4                                \n\t" // |l4 - l5|
633
                "psubusb %%mm0, %%mm4                                \n\t"
634
                "pcmpeqb %%mm7, %%mm4                                \n\t"
635
                "pand %%mm4, %%mm5                                \n\t" // d/2
636

    
637
//                "paddb %%mm6, %%mm2                                \n\t" // line 4 + 0x80
638
                "paddb %%mm5, %%mm2                                \n\t"
639
//                "psubb %%mm6, %%mm2                                \n\t"
640
                "movq %%mm2, (%0,%1, 4)                                \n\t"
641

    
642
                "movq (%%ebx), %%mm2                                \n\t"
643
//                "paddb %%mm6, %%mm2                                \n\t" // line 5 + 0x80
644
                "psubb %%mm5, %%mm2                                \n\t"
645
//                "psubb %%mm6, %%mm2                                \n\t"
646
                "movq %%mm2, (%%ebx)                                \n\t"
647

    
648
                "paddb %%mm6, %%mm5                                \n\t"
649
                "psrlw $2, %%mm5                                \n\t"
650
                "pand b3F, %%mm5                                \n\t"
651
                "psubb b20, %%mm5                                \n\t" // (l5-l4)/8
652

    
653
                "movq (%%eax, %1, 2), %%mm2                        \n\t"
654
                "paddb %%mm6, %%mm2                                \n\t" // line 3 + 0x80
655
                "paddsb %%mm5, %%mm2                                \n\t"
656
                "psubb %%mm6, %%mm2                                \n\t"
657
                "movq %%mm2, (%%eax, %1, 2)                        \n\t"
658

    
659
                "movq (%%ebx, %1), %%mm2                        \n\t"
660
                "paddb %%mm6, %%mm2                                \n\t" // line 6 + 0x80
661
                "psubsb %%mm5, %%mm2                                \n\t"
662
                "psubb %%mm6, %%mm2                                \n\t"
663
                "movq %%mm2, (%%ebx, %1)                        \n\t"
664

    
665
                :
666
                : "r" (src), "r" (stride)
667
                : "%eax", "%ebx"
668
        );
669
#else
670
         const int l1= stride;
671
        const int l2= stride + l1;
672
        const int l3= stride + l2;
673
        const int l4= stride + l3;
674
        const int l5= stride + l4;
675
        const int l6= stride + l5;
676
//        const int l7= stride + l6;
677
//        const int l8= stride + l7;
678
//        const int l9= stride + l8;
679
        int x;
680
        const int QP15= QP + (QP>>2);
681
        src+= stride*3;
682
        for(x=0; x<BLOCK_SIZE; x++)
683
        {
684
                const int v = (src[x+l5] - src[x+l4]);
685
                if(ABS(v) < QP15)
686
                {
687
                        src[x+l3] +=v>>3;
688
                        src[x+l4] +=v>>1;
689
                        src[x+l5] -=v>>1;
690
                        src[x+l6] -=v>>3;
691

    
692
                }
693
        }
694

    
695
#endif
696
}
697

    
698
/**
699
 * Experimental Filter 1
700
 * will not damage linear gradients
701
 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
702
 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
703
 * MMX2 version does correct clipping C version doesnt
704
 */
705
static inline void vertX1Filter(uint8_t *src, int stride, int QP)
706
{
707
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
708
        src+= stride*3;
709

    
710
        asm volatile(
711
                "pxor %%mm7, %%mm7                                \n\t" // 0
712
//                "movq b80, %%mm6                                \n\t" // MIN_SIGNED_BYTE
713
                "leal (%0, %1), %%eax                                \n\t"
714
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
715
//        0        1        2        3        4        5        6        7        8        9
716
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
717
                "movq (%%eax, %1, 2), %%mm0                        \n\t" // line 3
718
                "movq (%0, %1, 4), %%mm1                        \n\t" // line 4
719
                "movq %%mm1, %%mm2                                \n\t" // line 4
720
                "psubusb %%mm0, %%mm1                                \n\t"
721
                "psubusb %%mm2, %%mm0                                \n\t"
722
                "por %%mm1, %%mm0                                \n\t" // |l2 - l3|
723
                "movq (%%ebx), %%mm3                                \n\t" // line 5
724
                "movq (%%ebx, %1), %%mm4                                \n\t" // line 6
725
                "movq %%mm3, %%mm5                                \n\t" // line 5
726
                "psubusb %%mm4, %%mm3                                \n\t"
727
                "psubusb %%mm5, %%mm4                                \n\t"
728
                "por %%mm4, %%mm3                                \n\t" // |l5 - l6|
729
                PAVGB(%%mm3, %%mm0)                                      // (|l2 - l3| + |l5 - l6|)/2
730
                "movq %%mm2, %%mm1                                \n\t" // line 4
731
                "psubusb %%mm5, %%mm2                                \n\t"
732
                "movq %%mm2, %%mm4                                \n\t"
733
                "pcmpeqb %%mm7, %%mm2                                \n\t" // (l4 - l5) <= 0 ? -1 : 0
734
                "psubusb %%mm1, %%mm5                                \n\t"
735
                "por %%mm5, %%mm4                                \n\t" // |l4 - l5|
736
                "psubusb %%mm0, %%mm4                \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
737
                "movq %%mm4, %%mm3                                \n\t" // d
738
                "psubusb pQPb, %%mm4                                \n\t"
739
                "pcmpeqb %%mm7, %%mm4                                \n\t" // d <= QP ? -1 : 0
740
                "psubusb b01, %%mm3                                \n\t"
741
                "pand %%mm4, %%mm3                                \n\t" // d <= QP ? d : 0
742

    
743
                PAVGB(%%mm7, %%mm3)                                      // d/2
744
                "movq %%mm3, %%mm1                                \n\t" // d/2
745
                PAVGB(%%mm7, %%mm3)                                      // d/4
746
                PAVGB(%%mm1, %%mm3)                                      // 3*d/8
747

    
748
                "movq (%0, %1, 4), %%mm0                        \n\t" // line 4
749
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
750
                "psubusb %%mm3, %%mm0                                \n\t"
751
                "pxor %%mm2, %%mm0                                \n\t"
752
                "movq %%mm0, (%0, %1, 4)                        \n\t" // line 4
753

    
754
                "movq (%%ebx), %%mm0                                \n\t" // line 5
755
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
756
                "paddusb %%mm3, %%mm0                                \n\t"
757
                "pxor %%mm2, %%mm0                                \n\t"
758
                "movq %%mm0, (%%ebx)                                \n\t" // line 5
759

    
760
                PAVGB(%%mm7, %%mm1)                                      // d/4
761

    
762
                "movq (%%eax, %1, 2), %%mm0                        \n\t" // line 3
763
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
764
                "psubusb %%mm1, %%mm0                                \n\t"
765
                "pxor %%mm2, %%mm0                                \n\t"
766
                "movq %%mm0, (%%eax, %1, 2)                        \n\t" // line 3
767

    
768
                "movq (%%ebx, %1), %%mm0                        \n\t" // line 6
769
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
770
                "paddusb %%mm1, %%mm0                                \n\t"
771
                "pxor %%mm2, %%mm0                                \n\t"
772
                "movq %%mm0, (%%ebx, %1)                        \n\t" // line 6
773

    
774
                PAVGB(%%mm7, %%mm1)                                      // d/8
775

    
776
                "movq (%%eax, %1), %%mm0                        \n\t" // line 2
777
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
778
                "psubusb %%mm1, %%mm0                                \n\t"
779
                "pxor %%mm2, %%mm0                                \n\t"
780
                "movq %%mm0, (%%eax, %1)                        \n\t" // line 2
781

    
782
                "movq (%%ebx, %1, 2), %%mm0                        \n\t" // line 7
783
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
784
                "paddusb %%mm1, %%mm0                                \n\t"
785
                "pxor %%mm2, %%mm0                                \n\t"
786
                "movq %%mm0, (%%ebx, %1, 2)                        \n\t" // line 7
787

    
788
                :
789
                : "r" (src), "r" (stride)
790
                : "%eax", "%ebx"
791
        );
792
#else
793

    
794
         const int l1= stride;
795
        const int l2= stride + l1;
796
        const int l3= stride + l2;
797
        const int l4= stride + l3;
798
        const int l5= stride + l4;
799
        const int l6= stride + l5;
800
        const int l7= stride + l6;
801
//        const int l8= stride + l7;
802
//        const int l9= stride + l8;
803
        int x;
804

    
805
        src+= stride*3;
806
        for(x=0; x<BLOCK_SIZE; x++)
807
        {
808
                int a= src[l3] - src[l4];
809
                int b= src[l4] - src[l5];
810
                int c= src[l5] - src[l6];
811

    
812
                int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
813
                d= MAX(d, 0);
814

    
815
                if(d < QP)
816
                {
817
                        int v = d * SIGN(-b);
818

    
819
                        src[l2] +=v>>3;
820
                        src[l3] +=v>>2;
821
                        src[l4] +=(3*v)>>3;
822
                        src[l5] -=(3*v)>>3;
823
                        src[l6] -=v>>2;
824
                        src[l7] -=v>>3;
825

    
826
                }
827
                src++;
828
        }
829
        /*
830
         const int l1= stride;
831
        const int l2= stride + l1;
832
        const int l3= stride + l2;
833
        const int l4= stride + l3;
834
        const int l5= stride + l4;
835
        const int l6= stride + l5;
836
        const int l7= stride + l6;
837
        const int l8= stride + l7;
838
        const int l9= stride + l8;
839
        for(int x=0; x<BLOCK_SIZE; x++)
840
        {
841
                int v2= src[l2];
842
                int v3= src[l3];
843
                int v4= src[l4];
844
                int v5= src[l5];
845
                int v6= src[l6];
846
                int v7= src[l7];
847

848
                if(ABS(v4-v5)<QP &&  ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
849
                {
850
                        src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6         )/16;
851
                        src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7  )/16;
852
                        src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
853
                        src[l6] = (       1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
854
                }
855
                src++;
856
        }
857
*/
858
#endif
859
}
860

    
861
/**
862
 * Experimental Filter 1 (Horizontal)
863
 * will not damage linear gradients
864
 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
865
 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
866
 * MMX2 version does correct clipping C version doesnt
867
 * not identical with the vertical one
868
 */
869
static inline void horizX1Filter(uint8_t *src, int stride, int QP)
870
{
871
        int y;
872
        static uint64_t *lut= NULL;
873
        if(lut==NULL)
874
        {
875
                int i;
876
                lut= (uint64_t*)memalign(8, 256*8);
877
                for(i=0; i<256; i++)
878
                {
879
                        int v= i < 128 ? 2*i : 2*(i-256);
880
/*
881
//Simulate 112242211 9-Tap filter
882
                        uint64_t a= (v/16) & 0xFF;
883
                        uint64_t b= (v/8) & 0xFF;
884
                        uint64_t c= (v/4) & 0xFF;
885
                        uint64_t d= (3*v/8) & 0xFF;
886
*/
887
//Simulate piecewise linear interpolation
888
                        uint64_t a= (v/16) & 0xFF;
889
                        uint64_t b= (v*3/16) & 0xFF;
890
                        uint64_t c= (v*5/16) & 0xFF;
891
                        uint64_t d= (7*v/16) & 0xFF;
892
                        uint64_t A= (0x100 - a)&0xFF;
893
                        uint64_t B= (0x100 - b)&0xFF;
894
                        uint64_t C= (0x100 - c)&0xFF;
895
                        uint64_t D= (0x100 - c)&0xFF;
896

    
897
                        lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
898
                                (D<<24) | (C<<16) | (B<<8) | (A);
899
                        //lut[i] = (v<<32) | (v<<24);
900
                }
901
        }
902

    
903
#if 0
904
        asm volatile(
905
                "pxor %%mm7, %%mm7                                \n\t" // 0
906
//                "movq b80, %%mm6                                \n\t" // MIN_SIGNED_BYTE
907
                "leal (%0, %1), %%eax                                \n\t"
908
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
909

910
                "movq b80, %%mm6                                \n\t"
911
                "movd pQPb, %%mm5                                \n\t" // QP
912
                "movq %%mm5, %%mm4                                \n\t"
913
                "paddusb %%mm5, %%mm5                                \n\t" // 2QP
914
                "paddusb %%mm5, %%mm4                                \n\t" // 3QP
915
                "pxor %%mm5, %%mm5                                \n\t" // 0
916
                "psubb %%mm4, %%mm5                                \n\t" // -3QP
917
                "por bm11111110, %%mm5                                \n\t" // ...,FF,FF,-3QP
918
                "psllq $24, %%mm5                                \n\t"
919

920
//        0        1        2        3        4        5        6        7        8        9
921
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
922

923
#define HX1old(a) \
924
                "movd " #a ", %%mm0                                \n\t"\
925
                "movd 4" #a ", %%mm1                                \n\t"\
926
                "punpckldq %%mm1, %%mm0                                \n\t"\
927
                "movq %%mm0, %%mm1                                \n\t"\
928
                "movq %%mm0, %%mm2                                \n\t"\
929
                "psrlq $8, %%mm1                                \n\t"\
930
                "psubusb %%mm1, %%mm2                                \n\t"\
931
                "psubusb %%mm0, %%mm1                                \n\t"\
932
                "por %%mm2, %%mm1                                \n\t" /* p?x = |px - p(x+1)| */\
933
                "pcmpeqb %%mm7, %%mm2                                \n\t" /* p?x = sgn[px - p(x+1)] */\
934
                "pshufw $0x00, %%mm1, %%mm3                        \n\t" /* p?5 = |p1 - p2| */\
935
                PAVGB(%%mm1, %%mm3)                                      /* p?5 = (|p2-p1| + |p6-p5|)/2 */\
936
                "psrlq $16, %%mm3                                \n\t" /* p?3 = (|p2-p1| + |p6-p5|)/2 */\
937
                "psubusb %%mm3, %%mm1                        \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
938
                "paddb %%mm5, %%mm1                                \n\t"\
939
                "psubusb %%mm5, %%mm1                                \n\t"\
940
                PAVGB(%%mm7, %%mm1)\
941
                "pxor %%mm2, %%mm1                                \n\t"\
942
                "psubb %%mm2, %%mm1                                \n\t"\
943
                "psrlq $24, %%mm1                                \n\t"\
944
                "movd %%mm1, %%ecx                                \n\t"\
945
                "paddb %%mm6, %%mm0                                \n\t"\
946
                "paddsb (%3, %%ecx, 8), %%mm0                        \n\t"\
947
                "paddb %%mm6, %%mm0                                \n\t"\
948
                "movq %%mm0, " #a "                                \n\t"\
949

950
/*
951
HX1old((%0))
952
HX1old((%%eax))
953
HX1old((%%eax, %1))
954
HX1old((%%eax, %1, 2))
955
HX1old((%0, %1, 4))
956
HX1old((%%ebx))
957
HX1old((%%ebx, %1))
958
HX1old((%%ebx, %1, 2))
959
*/
960

961
//FIXME add some comments, its unreadable ...
962
#define HX1b(a, c, b, d) \
963
                "movd " #a ", %%mm0                                \n\t"\
964
                "movd 4" #a ", %%mm1                                \n\t"\
965
                "punpckldq %%mm1, %%mm0                                \n\t"\
966
                "movd " #b ", %%mm4                                \n\t"\
967
                "movq %%mm0, %%mm1                                \n\t"\
968
                "movq %%mm0, %%mm2                                \n\t"\
969
                "psrlq $8, %%mm1                                \n\t"\
970
                "movd 4" #b ", %%mm3                                \n\t"\
971
                "psubusb %%mm1, %%mm2                                \n\t"\
972
                "psubusb %%mm0, %%mm1                                \n\t"\
973
                "por %%mm2, %%mm1                                \n\t" /* p?x = |px - p(x+1)| */\
974
                "pcmpeqb %%mm7, %%mm2                                \n\t" /* p?x = sgn[px - p(x+1)] */\
975
                "punpckldq %%mm3, %%mm4                                \n\t"\
976
                "movq %%mm1, %%mm3                                \n\t"\
977
                "psllq $32, %%mm3                                \n\t" /* p?5 = |p1 - p2| */\
978
                PAVGB(%%mm1, %%mm3)                                      /* p?5 = (|p2-p1| + |p6-p5|)/2 */\
979
                "paddb %%mm6, %%mm0                                \n\t"\
980
                "psrlq $16, %%mm3                                \n\t" /* p?3 = (|p2-p1| + |p6-p5|)/2 */\
981
                "psubusb %%mm3, %%mm1                        \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
982
                "movq %%mm4, %%mm3                                \n\t"\
983
                "paddb %%mm5, %%mm1                                \n\t"\
984
                "psubusb %%mm5, %%mm1                                \n\t"\
985
                "psrlq $8, %%mm3                                \n\t"\
986
                PAVGB(%%mm7, %%mm1)\
987
                "pxor %%mm2, %%mm1                                \n\t"\
988
                "psubb %%mm2, %%mm1                                \n\t"\
989
                "movq %%mm4, %%mm2                                \n\t"\
990
                "psrlq $24, %%mm1                                \n\t"\
991
                "psubusb %%mm3, %%mm2                                \n\t"\
992
                "movd %%mm1, %%ecx                                \n\t"\
993
                "psubusb %%mm4, %%mm3                                \n\t"\
994
                "paddsb (%2, %%ecx, 8), %%mm0                        \n\t"\
995
                "por %%mm2, %%mm3                                \n\t" /* p?x = |px - p(x+1)| */\
996
                "paddb %%mm6, %%mm0                                \n\t"\
997
                "pcmpeqb %%mm7, %%mm2                                \n\t" /* p?x = sgn[px - p(x+1)] */\
998
                "movq %%mm3, %%mm1                                \n\t"\
999
                "psllq $32, %%mm1                                \n\t" /* p?5 = |p1 - p2| */\
1000
                "movq %%mm0, " #a "                                \n\t"\
1001
                PAVGB(%%mm3, %%mm1)                                      /* p?5 = (|p2-p1| + |p6-p5|)/2 */\
1002
                "paddb %%mm6, %%mm4                                \n\t"\
1003
                "psrlq $16, %%mm1                                \n\t" /* p?3 = (|p2-p1| + |p6-p5|)/2 */\
1004
                "psubusb %%mm1, %%mm3                        \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
1005
                "paddb %%mm5, %%mm3                                \n\t"\
1006
                "psubusb %%mm5, %%mm3                                \n\t"\
1007
                PAVGB(%%mm7, %%mm3)\
1008
                "pxor %%mm2, %%mm3                                \n\t"\
1009
                "psubb %%mm2, %%mm3                                \n\t"\
1010
                "psrlq $24, %%mm3                                \n\t"\
1011
                "movd " #c ", %%mm0                                \n\t"\
1012
                "movd 4" #c ", %%mm1                                \n\t"\
1013
                "punpckldq %%mm1, %%mm0                                \n\t"\
1014
                "paddb %%mm6, %%mm0                                \n\t"\
1015
                "paddsb (%2, %%ecx, 8), %%mm0                        \n\t"\
1016
                "paddb %%mm6, %%mm0                                \n\t"\
1017
                "movq %%mm0, " #c "                                \n\t"\
1018
                "movd %%mm3, %%ecx                                \n\t"\
1019
                "movd " #d ", %%mm0                                \n\t"\
1020
                "paddsb (%2, %%ecx, 8), %%mm4                        \n\t"\
1021
                "movd 4" #d ", %%mm1                                \n\t"\
1022
                "paddb %%mm6, %%mm4                                \n\t"\
1023
                "punpckldq %%mm1, %%mm0                                \n\t"\
1024
                "movq %%mm4, " #b "                                \n\t"\
1025
                "paddb %%mm6, %%mm0                                \n\t"\
1026
                "paddsb (%2, %%ecx, 8), %%mm0                        \n\t"\
1027
                "paddb %%mm6, %%mm0                                \n\t"\
1028
                "movq %%mm0, " #d "                                \n\t"\
1029

1030
HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2))
1031
HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))
1032

1033

1034
                :
1035
                : "r" (src), "r" (stride), "r" (lut)
1036
                : "%eax", "%ebx", "%ecx"
1037
        );
1038
#else
1039

    
1040
//FIXME (has little in common with the mmx2 version)
1041
        for(y=0; y<BLOCK_SIZE; y++)
1042
        {
1043
                int a= src[1] - src[2];
1044
                int b= src[3] - src[4];
1045
                int c= src[5] - src[6];
1046

    
1047
                int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
1048

    
1049
                if(d < QP)
1050
                {
1051
                        int v = d * SIGN(-b);
1052

    
1053
                        src[1] +=v/8;
1054
                        src[2] +=v/4;
1055
                        src[3] +=3*v/8;
1056
                        src[4] -=3*v/8;
1057
                        src[5] -=v/4;
1058
                        src[6] -=v/8;
1059

    
1060
                }
1061
                src+=stride;
1062
        }
1063
#endif
1064
}
1065

    
1066

    
1067
static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
1068
{
1069
#ifdef HAVE_MMX
1070
        src+= stride*4;
1071
        //FIXME try pmul for *5 stuff
1072
//        src[0]=0;
1073
        asm volatile(
1074
                "pxor %%mm7, %%mm7                                \n\t"
1075
                "leal (%0, %1), %%eax                                \n\t"
1076
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
1077
//        0        1        2        3        4        5        6        7
1078
//        %0        %0+%1        %0+2%1        eax+2%1        %0+4%1        eax+4%1        ebx+%1        ebx+2%1
1079
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1
1080

    
1081
                "movq (%0), %%mm0                                \n\t"
1082
                "movq %%mm0, %%mm1                                \n\t"
1083
                "punpcklbw %%mm7, %%mm0                                \n\t" // low part of line 0
1084
                "punpckhbw %%mm7, %%mm1                                \n\t" // high part of line 0
1085

    
1086
                "movq (%%eax), %%mm2                                \n\t"
1087
                "movq %%mm2, %%mm3                                \n\t"
1088
                "punpcklbw %%mm7, %%mm2                                \n\t" // low part of line 1
1089
                "punpckhbw %%mm7, %%mm3                                \n\t" // high part of line 1
1090

    
1091
                "movq (%%eax, %1), %%mm4                        \n\t"
1092
                "movq %%mm4, %%mm5                                \n\t"
1093
                "punpcklbw %%mm7, %%mm4                                \n\t" // low part of line 2
1094
                "punpckhbw %%mm7, %%mm5                                \n\t" // high part of line 2
1095

    
1096
                "paddw %%mm0, %%mm0                                \n\t" // 2L0
1097
                "paddw %%mm1, %%mm1                                \n\t" // 2H0
1098
                "psubw %%mm4, %%mm2                                \n\t" // L1 - L2
1099
                "psubw %%mm5, %%mm3                                \n\t" // H1 - H2
1100
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - L1 + L2
1101
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - H1 + H2
1102

    
1103
                "psllw $2, %%mm2                                \n\t" // 4L1 - 4L2
1104
                "psllw $2, %%mm3                                \n\t" // 4H1 - 4H2
1105
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - 5L1 + 5L2
1106
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - 5H1 + 5H2
1107

    
1108
                "movq (%%eax, %1, 2), %%mm2                        \n\t"
1109
                "movq %%mm2, %%mm3                                \n\t"
1110
                "punpcklbw %%mm7, %%mm2                                \n\t" // L3
1111
                "punpckhbw %%mm7, %%mm3                                \n\t" // H3
1112

    
1113
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - 5L1 + 5L2 - L3
1114
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - 5H1 + 5H2 - H3
1115
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1116
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1117
                "movq %%mm0, temp0                                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1118
                "movq %%mm1, temp1                                \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1119

    
1120
                "movq (%0, %1, 4), %%mm0                        \n\t"
1121
                "movq %%mm0, %%mm1                                \n\t"
1122
                "punpcklbw %%mm7, %%mm0                                \n\t" // L4
1123
                "punpckhbw %%mm7, %%mm1                                \n\t" // H4
1124

    
1125
                "psubw %%mm0, %%mm2                                \n\t" // L3 - L4
1126
                "psubw %%mm1, %%mm3                                \n\t" // H3 - H4
1127
                "movq %%mm2, temp2                                \n\t" // L3 - L4
1128
                "movq %%mm3, temp3                                \n\t" // H3 - H4
1129
                "paddw %%mm4, %%mm4                                \n\t" // 2L2
1130
                "paddw %%mm5, %%mm5                                \n\t" // 2H2
1131
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - L3 + L4
1132
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - H3 + H4
1133

    
1134
                "psllw $2, %%mm2                                \n\t" // 4L3 - 4L4
1135
                "psllw $2, %%mm3                                \n\t" // 4H3 - 4H4
1136
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - 5L3 + 5L4
1137
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - 5H3 + 5H4
1138
//50 opcodes so far
1139
                "movq (%%ebx), %%mm2                                \n\t"
1140
                "movq %%mm2, %%mm3                                \n\t"
1141
                "punpcklbw %%mm7, %%mm2                                \n\t" // L5
1142
                "punpckhbw %%mm7, %%mm3                                \n\t" // H5
1143
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - 5L3 + 5L4 - L5
1144
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - 5H3 + 5H4 - H5
1145
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - 5L3 + 5L4 - 2L5
1146
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - 5H3 + 5H4 - 2H5
1147

    
1148
                "movq (%%ebx, %1), %%mm6                        \n\t"
1149
                "punpcklbw %%mm7, %%mm6                                \n\t" // L6
1150
                "psubw %%mm6, %%mm2                                \n\t" // L5 - L6
1151
                "movq (%%ebx, %1), %%mm6                        \n\t"
1152
                "punpckhbw %%mm7, %%mm6                                \n\t" // H6
1153
                "psubw %%mm6, %%mm3                                \n\t" // H5 - H6
1154

    
1155
                "paddw %%mm0, %%mm0                                \n\t" // 2L4
1156
                "paddw %%mm1, %%mm1                                \n\t" // 2H4
1157
                "psubw %%mm2, %%mm0                                \n\t" // 2L4 - L5 + L6
1158
                "psubw %%mm3, %%mm1                                \n\t" // 2H4 - H5 + H6
1159

    
1160
                "psllw $2, %%mm2                                \n\t" // 4L5 - 4L6
1161
                "psllw $2, %%mm3                                \n\t" // 4H5 - 4H6
1162
                "psubw %%mm2, %%mm0                                \n\t" // 2L4 - 5L5 + 5L6
1163
                "psubw %%mm3, %%mm1                                \n\t" // 2H4 - 5H5 + 5H6
1164

    
1165
                "movq (%%ebx, %1, 2), %%mm2                        \n\t"
1166
                "movq %%mm2, %%mm3                                \n\t"
1167
                "punpcklbw %%mm7, %%mm2                                \n\t" // L7
1168
                "punpckhbw %%mm7, %%mm3                                \n\t" // H7
1169

    
1170
                "paddw %%mm2, %%mm2                                \n\t" // 2L7
1171
                "paddw %%mm3, %%mm3                                \n\t" // 2H7
1172
                "psubw %%mm2, %%mm0                                \n\t" // 2L4 - 5L5 + 5L6 - 2L7
1173
                "psubw %%mm3, %%mm1                                \n\t" // 2H4 - 5H5 + 5H6 - 2H7
1174

    
1175
                "movq temp0, %%mm2                                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1176
                "movq temp1, %%mm3                                \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1177

    
1178
#ifdef HAVE_MMX2
1179
                "movq %%mm7, %%mm6                                \n\t" // 0
1180
                "psubw %%mm0, %%mm6                                \n\t"
1181
                "pmaxsw %%mm6, %%mm0                                \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1182
                "movq %%mm7, %%mm6                                \n\t" // 0
1183
                "psubw %%mm1, %%mm6                                \n\t"
1184
                "pmaxsw %%mm6, %%mm1                                \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1185
                "movq %%mm7, %%mm6                                \n\t" // 0
1186
                "psubw %%mm2, %%mm6                                \n\t"
1187
                "pmaxsw %%mm6, %%mm2                                \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1188
                "movq %%mm7, %%mm6                                \n\t" // 0
1189
                "psubw %%mm3, %%mm6                                \n\t"
1190
                "pmaxsw %%mm6, %%mm3                                \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1191
#else
1192
                "movq %%mm7, %%mm6                                \n\t" // 0
1193
                "pcmpgtw %%mm0, %%mm6                                \n\t"
1194
                "pxor %%mm6, %%mm0                                \n\t"
1195
                "psubw %%mm6, %%mm0                                \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1196
                "movq %%mm7, %%mm6                                \n\t" // 0
1197
                "pcmpgtw %%mm1, %%mm6                                \n\t"
1198
                "pxor %%mm6, %%mm1                                \n\t"
1199
                "psubw %%mm6, %%mm1                                \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1200
                "movq %%mm7, %%mm6                                \n\t" // 0
1201
                "pcmpgtw %%mm2, %%mm6                                \n\t"
1202
                "pxor %%mm6, %%mm2                                \n\t"
1203
                "psubw %%mm6, %%mm2                                \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1204
                "movq %%mm7, %%mm6                                \n\t" // 0
1205
                "pcmpgtw %%mm3, %%mm6                                \n\t"
1206
                "pxor %%mm6, %%mm3                                \n\t"
1207
                "psubw %%mm6, %%mm3                                \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1208
#endif
1209

    
1210
#ifdef HAVE_MMX2
1211
                "pminsw %%mm2, %%mm0                                \n\t"
1212
                "pminsw %%mm3, %%mm1                                \n\t"
1213
#else
1214
                "movq %%mm0, %%mm6                                \n\t"
1215
                "psubusw %%mm2, %%mm6                                \n\t"
1216
                "psubw %%mm6, %%mm0                                \n\t"
1217
                "movq %%mm1, %%mm6                                \n\t"
1218
                "psubusw %%mm3, %%mm6                                \n\t"
1219
                "psubw %%mm6, %%mm1                                \n\t"
1220
#endif
1221

    
1222
                "movq %%mm7, %%mm6                                \n\t" // 0
1223
                "pcmpgtw %%mm4, %%mm6                                \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1224
                "pxor %%mm6, %%mm4                                \n\t"
1225
                "psubw %%mm6, %%mm4                                \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1226
                "pcmpgtw %%mm5, %%mm7                                \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1227
                "pxor %%mm7, %%mm5                                \n\t"
1228
                "psubw %%mm7, %%mm5                                \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1229
// 100 opcodes
1230
                "movd %2, %%mm2                                        \n\t" // QP
1231
                "punpcklwd %%mm2, %%mm2                                \n\t"
1232
                "punpcklwd %%mm2, %%mm2                                \n\t"
1233
                "psllw $3, %%mm2                                \n\t" // 8QP
1234
                "movq %%mm2, %%mm3                                \n\t" // 8QP
1235
                "pcmpgtw %%mm4, %%mm2                                \n\t"
1236
                "pcmpgtw %%mm5, %%mm3                                \n\t"
1237
                "pand %%mm2, %%mm4                                \n\t"
1238
                "pand %%mm3, %%mm5                                \n\t"
1239

    
1240

    
1241
                "psubusw %%mm0, %%mm4                                \n\t" // hd
1242
                "psubusw %%mm1, %%mm5                                \n\t" // ld
1243

    
1244

    
1245
                "movq w05, %%mm2                                \n\t" // 5
1246
                "pmullw %%mm2, %%mm4                                \n\t"
1247
                "pmullw %%mm2, %%mm5                                \n\t"
1248
                "movq w20, %%mm2                                \n\t" // 32
1249
                "paddw %%mm2, %%mm4                                \n\t"
1250
                "paddw %%mm2, %%mm5                                \n\t"
1251
                "psrlw $6, %%mm4                                \n\t"
1252
                "psrlw $6, %%mm5                                \n\t"
1253

    
1254
/*
1255
                "movq w06, %%mm2                                \n\t" // 6
1256
                "paddw %%mm2, %%mm4                                \n\t"
1257
                "paddw %%mm2, %%mm5                                \n\t"
1258
                "movq w1400, %%mm2                                \n\t" // 1400h = 5120 = 5/64*2^16
1259
//FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
1260
                "pmulhw %%mm2, %%mm4                                \n\t" // hd/13
1261
                "pmulhw %%mm2, %%mm5                                \n\t" // ld/13
1262
*/
1263

    
1264
                "movq temp2, %%mm0                                \n\t" // L3 - L4
1265
                "movq temp3, %%mm1                                \n\t" // H3 - H4
1266

    
1267
                "pxor %%mm2, %%mm2                                \n\t"
1268
                "pxor %%mm3, %%mm3                                \n\t"
1269

    
1270
                "pcmpgtw %%mm0, %%mm2                                \n\t" // sign (L3-L4)
1271
                "pcmpgtw %%mm1, %%mm3                                \n\t" // sign (H3-H4)
1272
                "pxor %%mm2, %%mm0                                \n\t"
1273
                "pxor %%mm3, %%mm1                                \n\t"
1274
                "psubw %%mm2, %%mm0                                \n\t" // |L3-L4|
1275
                "psubw %%mm3, %%mm1                                \n\t" // |H3-H4|
1276
                "psrlw $1, %%mm0                                \n\t" // |L3 - L4|/2
1277
                "psrlw $1, %%mm1                                \n\t" // |H3 - H4|/2
1278

    
1279
                "pxor %%mm6, %%mm2                                \n\t"
1280
                "pxor %%mm7, %%mm3                                \n\t"
1281
                "pand %%mm2, %%mm4                                \n\t"
1282
                "pand %%mm3, %%mm5                                \n\t"
1283

    
1284
#ifdef HAVE_MMX2
1285
                "pminsw %%mm0, %%mm4                                \n\t"
1286
                "pminsw %%mm1, %%mm5                                \n\t"
1287
#else
1288
                "movq %%mm4, %%mm2                                \n\t"
1289
                "psubusw %%mm0, %%mm2                                \n\t"
1290
                "psubw %%mm2, %%mm4                                \n\t"
1291
                "movq %%mm5, %%mm2                                \n\t"
1292
                "psubusw %%mm1, %%mm2                                \n\t"
1293
                "psubw %%mm2, %%mm5                                \n\t"
1294
#endif
1295
                "pxor %%mm6, %%mm4                                \n\t"
1296
                "pxor %%mm7, %%mm5                                \n\t"
1297
                "psubw %%mm6, %%mm4                                \n\t"
1298
                "psubw %%mm7, %%mm5                                \n\t"
1299
                "packsswb %%mm5, %%mm4                                \n\t"
1300
                "movq (%%eax, %1, 2), %%mm0                        \n\t"
1301
                "paddb   %%mm4, %%mm0                                \n\t"
1302
                "movq %%mm0, (%%eax, %1, 2)                         \n\t"
1303
                "movq (%0, %1, 4), %%mm0                        \n\t"
1304
                "psubb %%mm4, %%mm0                                \n\t"
1305
                "movq %%mm0, (%0, %1, 4)                         \n\t"
1306

    
1307
                :
1308
                : "r" (src), "r" (stride), "r" (QP)
1309
                : "%eax", "%ebx"
1310
        );
1311
#else
1312
        const int l1= stride;
1313
        const int l2= stride + l1;
1314
        const int l3= stride + l2;
1315
        const int l4= stride + l3;
1316
        const int l5= stride + l4;
1317
        const int l6= stride + l5;
1318
        const int l7= stride + l6;
1319
        const int l8= stride + l7;
1320
//        const int l9= stride + l8;
1321
        int x;
1322
        src+= stride*3;
1323
        for(x=0; x<BLOCK_SIZE; x++)
1324
        {
1325
                const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1326
                if(ABS(middleEnergy) < 8*QP)
1327
                {
1328
                        const int q=(src[l4] - src[l5])/2;
1329
                        const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1330
                        const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1331

    
1332
                        int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1333
                        d= MAX(d, 0);
1334

    
1335
                        d= (5*d + 32) >> 6;
1336
                        d*= SIGN(-middleEnergy);
1337

    
1338
                        if(q>0)
1339
                        {
1340
                                d= d<0 ? 0 : d;
1341
                                d= d>q ? q : d;
1342
                        }
1343
                        else
1344
                        {
1345
                                d= d>0 ? 0 : d;
1346
                                d= d<q ? q : d;
1347
                        }
1348

    
1349
                        src[l4]-= d;
1350
                        src[l5]+= d;
1351
                }
1352
                src++;
1353
        }
1354
#endif
1355
}
1356

    
1357
//FIXME?  |255-0| = 1
1358
/**
1359
 * Check if the given 8x8 Block is mostly "flat"
1360
 */
1361
static inline int isHorizDC(uint8_t src[], int stride)
1362
{
1363
//        src++;
1364
        int numEq= 0;
1365
#if 0
1366
asm volatile (
1367
//                "int $3 \n\t"
1368
                "leal (%1, %2), %%ecx                                \n\t"
1369
                "leal (%%ecx, %2, 4), %%ebx                        \n\t"
1370
//        0        1        2        3        4        5        6        7        8        9
1371
//        %1        ecx        ecx+%2        ecx+2%2        %1+4%2        ebx        ebx+%2        ebx+2%2        %1+8%2        ebx+4%2
1372
                "movq b7E, %%mm7                                \n\t" // mm7 = 0x7F
1373
                "movq b7C, %%mm6                                \n\t" // mm6 = 0x7D
1374
                "pxor %%mm0, %%mm0                                \n\t"
1375
                "movl %1, %%eax                                        \n\t"
1376
                "andl $0x1F, %%eax                                \n\t"
1377
                "cmpl $24, %%eax                                \n\t"
1378
                "leal tempBlock, %%eax                                \n\t"
1379
                "jb 1f                                                \n\t"
1380

1381
#define HDC_CHECK_AND_CPY(src, dst) \
1382
                "movd " #src ", %%mm2                                \n\t"\
1383
                "punpckldq 4" #src ", %%mm2                                \n\t" /* (%1) */\
1384
                "movq %%mm2, %%mm1                                \n\t"\
1385
                "psrlq $8, %%mm2                                \n\t"\
1386
                "psubb %%mm1, %%mm2                                \n\t"\
1387
                "paddb %%mm7, %%mm2                                \n\t"\
1388
                "pcmpgtb %%mm6, %%mm2                                \n\t"\
1389
                "paddb %%mm2, %%mm0                                \n\t"\
1390
                "movq %%mm1," #dst "(%%eax)                        \n\t"
1391

1392
                HDC_CHECK_AND_CPY((%1),0)
1393
                HDC_CHECK_AND_CPY((%%ecx),8)
1394
                HDC_CHECK_AND_CPY((%%ecx, %2),16)
1395
                HDC_CHECK_AND_CPY((%%ecx, %2, 2),24)
1396
                HDC_CHECK_AND_CPY((%1, %2, 4),32)
1397
                HDC_CHECK_AND_CPY((%%ebx),40)
1398
                HDC_CHECK_AND_CPY((%%ebx, %2),48)
1399
                HDC_CHECK_AND_CPY((%%ebx, %2, 2),56)
1400
                "jmp 2f                                                \n\t"
1401
                "1:                                                \n\t"
1402
// src does not cross a 32 byte cache line so dont waste time with alignment
1403
#define HDC_CHECK_AND_CPY2(src, dst) \
1404
                "movq " #src ", %%mm2                                \n\t"\
1405
                "movq " #src ", %%mm1                                \n\t"\
1406
                "psrlq $8, %%mm2                                \n\t"\
1407
                "psubb %%mm1, %%mm2                                \n\t"\
1408
                "paddb %%mm7, %%mm2                                \n\t"\
1409
                "pcmpgtb %%mm6, %%mm2                                \n\t"\
1410
                "paddb %%mm2, %%mm0                                \n\t"\
1411
                "movq %%mm1," #dst "(%%eax)                        \n\t"
1412

1413
                HDC_CHECK_AND_CPY2((%1),0)
1414
                HDC_CHECK_AND_CPY2((%%ecx),8)
1415
                HDC_CHECK_AND_CPY2((%%ecx, %2),16)
1416
                HDC_CHECK_AND_CPY2((%%ecx, %2, 2),24)
1417
                HDC_CHECK_AND_CPY2((%1, %2, 4),32)
1418
                HDC_CHECK_AND_CPY2((%%ebx),40)
1419
                HDC_CHECK_AND_CPY2((%%ebx, %2),48)
1420
                HDC_CHECK_AND_CPY2((%%ebx, %2, 2),56)
1421
                "2:                                                \n\t"
1422
                "psllq $8, %%mm0                                \n\t" // remove dummy value
1423
                "movq %%mm0, %%mm1                                \n\t"
1424
                "psrlw $8, %%mm0                                \n\t"
1425
                "paddb %%mm1, %%mm0                                \n\t"
1426
                "movq %%mm0, %%mm1                                \n\t"
1427
                "psrlq $16, %%mm0                                \n\t"
1428
                "paddb %%mm1, %%mm0                                \n\t"
1429
                "movq %%mm0, %%mm1                                \n\t"
1430
                "psrlq $32, %%mm0                                \n\t"
1431
                "paddb %%mm1, %%mm0                                \n\t"
1432
                "movd %%mm0, %0                                        \n\t"
1433
                : "=r" (numEq)
1434
                : "r" (src), "r" (stride)
1435
                : "%eax", "%ebx", "%ecx"
1436
                );
1437
//        printf("%d\n", numEq);
1438
        numEq= (256 - numEq) &0xFF;
1439
#else
1440
        int y;
1441
        for(y=0; y<BLOCK_SIZE; y++)
1442
        {
1443
                if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
1444
                if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
1445
                if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
1446
                if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
1447
                if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
1448
                if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
1449
                if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
1450
                src+= stride;
1451
        }
1452
#endif
1453
/*        if(abs(numEq - asmEq) > 0)
1454
        {
1455
//                printf("\nasm:%d  c:%d\n", asmEq, numEq);
1456
                for(int y=0; y<8; y++)
1457
                {
1458
                        for(int x=0; x<8; x++)
1459
                        {
1460
                                printf("%d ", src[x + y*stride]);
1461
                        }
1462
                        printf("\n");
1463
                }
1464
        }
1465
*/
1466
//        printf("%d\n", numEq);
1467
        return numEq > hFlatnessThreshold;
1468
}
1469

    
1470
/**
 * Cheap horizontal range check: only the first and the last pixel of the
 * first row are compared, the remaining rows are not examined.
 *
 * @param src    first pixel of the row to check; src[0] and src[7] are read
 * @param stride unused (kept so the signature matches the other horizontal checks)
 * @param QP     the two pixels may differ by at most 2*QP
 * @return 1 if |src[0] - src[7]| <= 2*QP, 0 otherwise
 */
static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
{
        (void)stride; /* a single row is inspected, so no row step is needed */

        return abs(src[0] - src[7]) <= 2*QP;
}
1476

    
1477
static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP)
1478
{
1479
#if 0
1480
        asm volatile(
1481
                "leal (%0, %1), %%ecx                                \n\t"
1482
                "leal (%%ecx, %1, 4), %%ebx                        \n\t"
1483
//        0        1        2        3        4        5        6        7        8        9
1484
//        %0        ecx        ecx+%1        ecx+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
1485
                "pxor %%mm7, %%mm7                                \n\t"
1486
                "movq bm00001000, %%mm6                                \n\t"
1487
                "movd %2, %%mm5                                        \n\t" // QP
1488
                "movq %%mm5, %%mm4                                \n\t"
1489
                "paddusb %%mm5, %%mm5                                \n\t" // 2QP
1490
                "paddusb %%mm5, %%mm4                                \n\t" // 3QP
1491
                "psllq $24, %%mm4                                \n\t"
1492
                "pxor %%mm5, %%mm5                                \n\t" // 0
1493
                "psubb %%mm4, %%mm5                                \n\t" // -QP
1494
                "leal tempBlock, %%eax                                \n\t"
1495

1496
//FIXME? "unroll by 2" and mix
1497
#ifdef HAVE_MMX2
1498
#define HDF(src, dst)        \
1499
                "movq " #src "(%%eax), %%mm0                        \n\t"\
1500
                "movq " #src "(%%eax), %%mm1                        \n\t"\
1501
                "movq " #src "(%%eax), %%mm2                        \n\t"\
1502
                "psrlq $8, %%mm1                                \n\t"\
1503
                "psubusb %%mm1, %%mm2                                \n\t"\
1504
                "psubusb %%mm0, %%mm1                                \n\t"\
1505
                "por %%mm2, %%mm1                                \n\t" /* p?x = |px - p(x+1)| */\
1506
                "pcmpeqb %%mm7, %%mm2                                \n\t" /* p?x = sgn[px - p(x+1)] */\
1507
                "pshufw $0x00, %%mm1, %%mm3                        \n\t" /* p?5 = |p1 - p2| */\
1508
                "pminub %%mm1, %%mm3                                \n\t" /* p?5 = min(|p2-p1|, |p6-p5|)*/\
1509
                "psrlq $16, %%mm3                                \n\t" /* p?3 = min(|p2-p1|, |p6-p5|)*/\
1510
                "psubusb %%mm3, %%mm1                        \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\
1511
                "paddb %%mm5, %%mm1                                \n\t"\
1512
                "psubusb %%mm5, %%mm1                                \n\t"\
1513
                "psrlw $2, %%mm1                                \n\t"\
1514
                "pxor %%mm2, %%mm1                                \n\t"\
1515
                "psubb %%mm2, %%mm1                                \n\t"\
1516
                "pand %%mm6, %%mm1                                \n\t"\
1517
                "psubb %%mm1, %%mm0                                \n\t"\
1518
                "psllq $8, %%mm1                                \n\t"\
1519
                "paddb %%mm1, %%mm0                                \n\t"\
1520
                "movd %%mm0, " #dst"                                \n\t"\
1521
                "psrlq $32, %%mm0                                \n\t"\
1522
                "movd %%mm0, 4" #dst"                                \n\t"
1523
#else
1524
#define HDF(src, dst)\
1525
                "movq " #src "(%%eax), %%mm0                        \n\t"\
1526
                "movq %%mm0, %%mm1                                \n\t"\
1527
                "movq %%mm0, %%mm2                                \n\t"\
1528
                "psrlq $8, %%mm1                                \n\t"\
1529
                "psubusb %%mm1, %%mm2                                \n\t"\
1530
                "psubusb %%mm0, %%mm1                                \n\t"\
1531
                "por %%mm2, %%mm1                                \n\t" /* p?x = |px - p(x+1)| */\
1532
                "pcmpeqb %%mm7, %%mm2                                \n\t" /* p?x = sgn[px - p(x+1)] */\
1533
                "movq %%mm1, %%mm3                                \n\t"\
1534
                "psllq $32, %%mm3                                \n\t"\
1535
                "movq %%mm3, %%mm4                                \n\t"\
1536
                "psubusb %%mm1, %%mm4                                \n\t"\
1537
                "psubb %%mm4, %%mm3                                \n\t"\
1538
                "psrlq $16, %%mm3                                \n\t" /* p?3 = min(|p2-p1|, |p6-p5|)*/\
1539
                "psubusb %%mm3, %%mm1                        \n\t" /* |p3-p4|-min(|p1-p2|,|p5,?6|) */\
1540
                "paddb %%mm5, %%mm1                                \n\t"\
1541
                "psubusb %%mm5, %%mm1                                \n\t"\
1542
                "psrlw $2, %%mm1                                \n\t"\
1543
                "pxor %%mm2, %%mm1                                \n\t"\
1544
                "psubb %%mm2, %%mm1                                \n\t"\
1545
                "pand %%mm6, %%mm1                                \n\t"\
1546
                "psubb %%mm1, %%mm0                                \n\t"\
1547
                "psllq $8, %%mm1                                \n\t"\
1548
                "paddb %%mm1, %%mm0                                \n\t"\
1549
                "movd %%mm0, " #dst "                                \n\t"\
1550
                "psrlq $32, %%mm0                                \n\t"\
1551
                "movd %%mm0, 4" #dst "                                \n\t"
1552
#endif
1553
                HDF(0,(%0))
1554
                HDF(8,(%%ecx))
1555
                HDF(16,(%%ecx, %1))
1556
                HDF(24,(%%ecx, %1, 2))
1557
                HDF(32,(%0, %1, 4))
1558
                HDF(40,(%%ebx))
1559
                HDF(48,(%%ebx, %1))
1560
                HDF(56,(%%ebx, %1, 2))
1561
                :
1562
                : "r" (dst), "r" (stride), "r" (QP)
1563
                : "%eax", "%ebx", "%ecx"
1564
        );
1565
#else
1566
        int y;
1567
        for(y=0; y<BLOCK_SIZE; y++)
1568
        {
1569
                const int middleEnergy= 5*(dst[4] - dst[5]) + 2*(dst[2] - dst[5]);
1570

    
1571
                if(ABS(middleEnergy) < 8*QP)
1572
                {
1573
                        const int q=(dst[3] - dst[4])/2;
1574
                        const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
1575
                        const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
1576

    
1577
                        int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1578
                        d= MAX(d, 0);
1579

    
1580
                        d= (5*d + 32) >> 6;
1581
                        d*= SIGN(-middleEnergy);
1582

    
1583
                        if(q>0)
1584
                        {
1585
                                d= d<0 ? 0 : d;
1586
                                d= d>q ? q : d;
1587
                        }
1588
                        else
1589
                        {
1590
                                d= d>0 ? 0 : d;
1591
                                d= d<q ? q : d;
1592
                        }
1593

    
1594
                        dst[3]-= d;
1595
                        dst[4]+= d;
1596
                }
1597
                dst+= stride;
1598
        }
1599
#endif
1600
}
1601

    
1602
/**
1603
 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
1604
 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
1605
 * using the 7-Tap Filter   (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version)
1606
 */
1607
static inline void doHorizLowPass(uint8_t dst[], int stride, int QP)
1608
{
1609

    
1610
#if 0
1611
        asm volatile(
1612
                "leal (%0, %1), %%ecx                                \n\t"
1613
                "leal (%%ecx, %1, 4), %%ebx                        \n\t"
1614
//        0        1        2        3        4        5        6        7        8        9
1615
//        %0        ecx        ecx+%1        ecx+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
1616
                "pxor %%mm7, %%mm7                                        \n\t"
1617
                "leal tempBlock, %%eax                                        \n\t"
1618
/*
1619
#define HLP1        "movq (%0), %%mm0                                        \n\t"\
1620
                "movq %%mm0, %%mm1                                        \n\t"\
1621
                "psllq $8, %%mm0                                        \n\t"\
1622
                PAVGB(%%mm1, %%mm0)\
1623
                "psrlw $8, %%mm0                                        \n\t"\
1624
                "pxor %%mm1, %%mm1                                        \n\t"\
1625
                "packuswb %%mm1, %%mm0                                        \n\t"\
1626
                "movq %%mm0, %%mm1                                        \n\t"\
1627
                "movq %%mm0, %%mm2                                        \n\t"\
1628
                "psllq $32, %%mm0                                        \n\t"\
1629
                "paddb %%mm0, %%mm1                                        \n\t"\
1630
                "psllq $16, %%mm2                                        \n\t"\
1631
                PAVGB(%%mm2, %%mm0)\
1632
                "movq %%mm0, %%mm3                                        \n\t"\
1633
                "pand bm11001100, %%mm0                                        \n\t"\
1634
                "paddusb %%mm0, %%mm3                                        \n\t"\
1635
                "psrlq $8, %%mm3                                        \n\t"\
1636
                PAVGB(%%mm1, %%mm4)\
1637
                PAVGB(%%mm3, %%mm2)\
1638
                "psrlq $16, %%mm2                                        \n\t"\
1639
                "punpcklbw %%mm2, %%mm2                                        \n\t"\
1640
                "movq %%mm2, (%0)                                        \n\t"\
1641

1642
#define HLP2        "movq (%0), %%mm0                                        \n\t"\
1643
                "movq %%mm0, %%mm1                                        \n\t"\
1644
                "psllq $8, %%mm0                                        \n\t"\
1645
                PAVGB(%%mm1, %%mm0)\
1646
                "psrlw $8, %%mm0                                        \n\t"\
1647
                "pxor %%mm1, %%mm1                                        \n\t"\
1648
                "packuswb %%mm1, %%mm0                                        \n\t"\
1649
                "movq %%mm0, %%mm2                                        \n\t"\
1650
                "psllq $32, %%mm0                                        \n\t"\
1651
                "psllq $16, %%mm2                                        \n\t"\
1652
                PAVGB(%%mm2, %%mm0)\
1653
                "movq %%mm0, %%mm3                                        \n\t"\
1654
                "pand bm11001100, %%mm0                                        \n\t"\
1655
                "paddusb %%mm0, %%mm3                                        \n\t"\
1656
                "psrlq $8, %%mm3                                        \n\t"\
1657
                PAVGB(%%mm3, %%mm2)\
1658
                "psrlq $16, %%mm2                                        \n\t"\
1659
                "punpcklbw %%mm2, %%mm2                                        \n\t"\
1660
                "movq %%mm2, (%0)                                        \n\t"\
1661
*/
1662
// approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16
1663
/*
1664
Implemented        Exact 7-Tap
1665
 9421                A321
1666
 36421                64321
1667
 334321                =
1668
 1234321        =
1669
  1234321        =
1670
   123433        =
1671
    12463          12346
1672
     1249           123A
1673

1674
*/
1675

1676
#ifdef HAVE_MMX2
1677
#define HLP3(i)        "movq " #i "(%%eax), %%mm0                                \n\t"\
1678
                "movq %%mm0, %%mm1                                        \n\t"\
1679
                "movq %%mm0, %%mm2                                        \n\t"\
1680
                "movq %%mm0, %%mm3                                        \n\t"\
1681
                "movq %%mm0, %%mm4                                        \n\t"\
1682
                "psllq $8, %%mm1                                        \n\t"\
1683
                "psrlq $8, %%mm2                                        \n\t"\
1684
                "pand bm00000001, %%mm3                                        \n\t"\
1685
                "pand bm10000000, %%mm4                                        \n\t"\
1686
                "por %%mm3, %%mm1                                        \n\t"\
1687
                "por %%mm4, %%mm2                                        \n\t"\
1688
                PAVGB(%%mm2, %%mm1)\
1689
                PAVGB(%%mm1, %%mm0)\
1690
\
1691
                "pshufw $0xF9, %%mm0, %%mm3                                \n\t"\
1692
                "pshufw $0x90, %%mm0, %%mm4                                \n\t"\
1693
                PAVGB(%%mm3, %%mm4)\
1694
                PAVGB(%%mm4, %%mm0)\
1695
                "movd %%mm0, (%0)                                        \n\t"\
1696
                "psrlq $32, %%mm0                                        \n\t"\
1697
                "movd %%mm0, 4(%0)                                        \n\t"
1698
#else
1699
#define HLP3(i)        "movq " #i "(%%eax), %%mm0                                \n\t"\
1700
                "movq %%mm0, %%mm1                                        \n\t"\
1701
                "movq %%mm0, %%mm2                                        \n\t"\
1702
                "movd -4(%0), %%mm3                                        \n\t" /*0001000*/\
1703
                "movd 8(%0), %%mm4                                        \n\t" /*0001000*/\
1704
                "psllq $8, %%mm1                                        \n\t"\
1705
                "psrlq $8, %%mm2                                        \n\t"\
1706
                "psrlq $24, %%mm3                                        \n\t"\
1707
                "psllq $56, %%mm4                                        \n\t"\
1708
                "por %%mm3, %%mm1                                        \n\t"\
1709
                "por %%mm4, %%mm2                                        \n\t"\
1710
                PAVGB(%%mm2, %%mm1)\
1711
                PAVGB(%%mm1, %%mm0)\
1712
\
1713
                "movq %%mm0, %%mm3                                        \n\t"\
1714
                "movq %%mm0, %%mm4                                        \n\t"\
1715
                "movq %%mm0, %%mm5                                        \n\t"\
1716
                "psrlq $16, %%mm3                                        \n\t"\
1717
                "psllq $16, %%mm4                                        \n\t"\
1718
                "pand bm11000000, %%mm5                                        \n\t"\
1719
                "por %%mm5, %%mm3                                        \n\t"\
1720
                "movq %%mm0, %%mm5                                        \n\t"\
1721
                "pand bm00000011, %%mm5                                        \n\t"\
1722
                "por %%mm5, %%mm4                                        \n\t"\
1723
                PAVGB(%%mm3, %%mm4)\
1724
                PAVGB(%%mm4, %%mm0)\
1725
                "movd %%mm0, (%0)                                        \n\t"\
1726
                "psrlq $32, %%mm0                                        \n\t"\
1727
                "movd %%mm0, 4(%0)                                        \n\t"
1728
#endif
1729

    
1730
/* uses the 7-Tap Filter: 1112111 */
1731
#define NEW_HLP(src, dst)\
1732
                "movq " #src "(%%eax), %%mm1                                \n\t"\
1733
                "movq " #src "(%%eax), %%mm2                                \n\t"\
1734
                "psllq $8, %%mm1                                        \n\t"\
1735
                "psrlq $8, %%mm2                                        \n\t"\
1736
                "movd -4" #dst ", %%mm3                                        \n\t" /*0001000*/\
1737
                "movd 8" #dst ", %%mm4                                        \n\t" /*0001000*/\
1738
                "psrlq $24, %%mm3                                        \n\t"\
1739
                "psllq $56, %%mm4                                        \n\t"\
1740
                "por %%mm3, %%mm1                                        \n\t"\
1741
                "por %%mm4, %%mm2                                        \n\t"\
1742
                "movq %%mm1, %%mm5                                        \n\t"\
1743
                PAVGB(%%mm2, %%mm1)\
1744
                "movq " #src "(%%eax), %%mm0                                \n\t"\
1745
                PAVGB(%%mm1, %%mm0)\
1746
                "psllq $8, %%mm5                                        \n\t"\
1747
                "psrlq $8, %%mm2                                        \n\t"\
1748
                "por %%mm3, %%mm5                                        \n\t"\
1749
                "por %%mm4, %%mm2                                        \n\t"\
1750
                "movq %%mm5, %%mm1                                        \n\t"\
1751
                PAVGB(%%mm2, %%mm5)\
1752
                "psllq $8, %%mm1                                        \n\t"\
1753
                "psrlq $8, %%mm2                                        \n\t"\
1754
                "por %%mm3, %%mm1                                        \n\t"\
1755
                "por %%mm4, %%mm2                                        \n\t"\
1756
                PAVGB(%%mm2, %%mm1)\
1757
                PAVGB(%%mm1, %%mm5)\
1758
                PAVGB(%%mm5, %%mm0)\
1759
                "movd %%mm0, " #dst "                                        \n\t"\
1760
                "psrlq $32, %%mm0                                        \n\t"\
1761
                "movd %%mm0, 4" #dst "                                        \n\t"
1762

    
1763
/* uses the 9-Tap Filter: 112242211 */
1764
#define NEW_HLP2(i)\
1765
                "movq " #i "(%%eax), %%mm0                                \n\t" /*0001000*/\
1766
                "movq %%mm0, %%mm1                                        \n\t" /*0001000*/\
1767
                "movq %%mm0, %%mm2                                        \n\t" /*0001000*/\
1768
                "movd -4(%0), %%mm3                                        \n\t" /*0001000*/\
1769
                "movd 8(%0), %%mm4                                        \n\t" /*0001000*/\
1770
                "psllq $8, %%mm1                                        \n\t"\
1771
                "psrlq $8, %%mm2                                        \n\t"\
1772
                "psrlq $24, %%mm3                                        \n\t"\
1773
                "psllq $56, %%mm4                                        \n\t"\
1774
                "por %%mm3, %%mm1                                        \n\t" /*0010000*/\
1775
                "por %%mm4, %%mm2                                        \n\t" /*0000100*/\
1776
                "movq %%mm1, %%mm5                                        \n\t" /*0010000*/\
1777
                PAVGB(%%mm2, %%mm1)                                              /*0010100*/\
1778
                PAVGB(%%mm1, %%mm0)                                              /*0012100*/\
1779
                "psllq $8, %%mm5                                        \n\t"\
1780
                "psrlq $8, %%mm2                                        \n\t"\
1781
                "por %%mm3, %%mm5                                        \n\t" /*0100000*/\
1782
                "por %%mm4, %%mm2                                        \n\t" /*0000010*/\
1783
                "movq %%mm5, %%mm1                                        \n\t" /*0100000*/\
1784
                PAVGB(%%mm2, %%mm5)                                              /*0100010*/\
1785
                "psllq $8, %%mm1                                        \n\t"\
1786
                "psrlq $8, %%mm2                                        \n\t"\
1787
                "por %%mm3, %%mm1                                        \n\t" /*1000000*/\
1788
                "por %%mm4, %%mm2                                        \n\t" /*0000001*/\
1789
                "movq %%mm1, %%mm6                                        \n\t" /*1000000*/\
1790
                PAVGB(%%mm2, %%mm1)                                              /*1000001*/\
1791
                "psllq $8, %%mm6                                        \n\t"\
1792
                "psrlq $8, %%mm2                                        \n\t"\
1793
                "por %%mm3, %%mm6                                        \n\t"/*100000000*/\
1794
                "por %%mm4, %%mm2                                        \n\t"/*000000001*/\
1795
                PAVGB(%%mm2, %%mm6)                                             /*100000001*/\
1796
                PAVGB(%%mm6, %%mm1)                                             /*110000011*/\
1797
                PAVGB(%%mm1, %%mm5)                                             /*112000211*/\
1798
                PAVGB(%%mm5, %%mm0)                                             /*112242211*/\
1799
                "movd %%mm0, (%0)                                        \n\t"\
1800
                "psrlq $32, %%mm0                                        \n\t"\
1801
                "movd %%mm0, 4(%0)                                        \n\t"
1802

    
1803
#define HLP(src, dst) NEW_HLP(src, dst)
1804

    
1805
                HLP(0, (%0))
1806
                HLP(8, (%%ecx))
1807
                HLP(16, (%%ecx, %1))
1808
                HLP(24, (%%ecx, %1, 2))
1809
                HLP(32, (%0, %1, 4))
1810
                HLP(40, (%%ebx))
1811
                HLP(48, (%%ebx, %1))
1812
                HLP(56, (%%ebx, %1, 2))
1813

    
1814
                :
1815
                : "r" (dst), "r" (stride)
1816
                : "%eax", "%ebx", "%ecx"
1817
        );
1818

    
1819
#else
1820
        int y;
1821
        for(y=0; y<BLOCK_SIZE; y++)
1822
        {
1823
                const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
1824
                const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
1825

    
1826
                int sums[9];
1827
                sums[0] = first + dst[0];
1828
                sums[1] = dst[0] + dst[1];
1829
                sums[2] = dst[1] + dst[2];
1830
                sums[3] = dst[2] + dst[3];
1831
                sums[4] = dst[3] + dst[4];
1832
                sums[5] = dst[4] + dst[5];
1833
                sums[6] = dst[5] + dst[6];
1834
                sums[7] = dst[6] + dst[7];
1835
                sums[8] = dst[7] + last;
1836

    
1837
                dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
1838
                dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
1839
                dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
1840
                dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
1841
                dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
1842
                dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
1843
                dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4;
1844
                dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
1845

    
1846
                dst+= stride;
1847
        }
1848
#endif
1849
}
1850

    
1851
/**
 * Deringing filter, applied in place.
 *
 * C path (used when neither HAVE_MMX2 nor HAVE_3DNOW is defined):
 *  1. finds min and max over rows 1..8, columns 1..8 relative to src and
 *     takes avg = (min + max + 1)/2 as threshold
 *  2. classifies every pixel of the 10x10 area (rows 0..9, columns 0..9) as
 *     "above avg" or "not above avg"
 *  3. a pixel of rows 1..8, columns 1..8 is filtered only if it and all 8 of
 *     its neighbours fall into the same class; pixels next to an edge are
 *     left untouched
 *  4. filtered pixels are replaced by the 3x3 average
 *         1 2 1
 *         2 4 2   / 16 (rounded)
 *         1 2 1
 *     limited to at most 2*QP away from the original value.
 *     Filtering is done in raster order on the image itself, so the rows and
 *     columns already processed contribute their filtered values.
 *
 * SIMD path: the asm template only references src (%0) and stride (%1); the
 * quantizer is taken from the global pQPb, and pQPb2, temp0 and temp1 are used
 * as scratch memory. b00, b08, PAVGB, PMINUB and PMAXUB are defined outside
 * this function.
 * NOTE(review): the asm loads 8 bytes starting at column 0 of rows 1..8 (with
 * column -1 and column 8 as neighbours), while the C path works on columns
 * 1..8 -- confirm against the caller which of the two is intended.
 *
 * @param src    pointer to row 0 of the area to process
 * @param stride distance in bytes between two rows
 * @param QP     quantizer; a filtered pixel never moves more than 2*QP
 */
static inline void dering(uint8_t src[], int stride, int QP)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        asm volatile(
                "movq pQPb, %%mm0                                \n\t"
                "paddusb %%mm0, %%mm0                                \n\t"
                "movq %%mm0, pQPb2                                \n\t"

                "leal (%0, %1), %%eax                                \n\t"
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
//        0        1        2        3        4        5        6        7        8        9
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1

                "pcmpeqb %%mm6, %%mm6                                \n\t"
                "pxor %%mm7, %%mm7                                \n\t"
#ifdef HAVE_MMX2
#define FIND_MIN_MAX(addr)\
                "movq " #addr ", %%mm0                                \n\t"\
                "pminub %%mm0, %%mm6                                \n\t"\
                "pmaxub %%mm0, %%mm7                                \n\t"
#else
#define FIND_MIN_MAX(addr)\
                "movq " #addr ", %%mm0                                \n\t"\
                "movq %%mm6, %%mm1                                \n\t"\
                "psubusb %%mm0, %%mm7                                \n\t"\
                "paddb %%mm0, %%mm7                                \n\t"\
                "psubusb %%mm0, %%mm1                                \n\t"\
                "psubb %%mm1, %%mm6                                \n\t"
#endif

FIND_MIN_MAX((%%eax))
FIND_MIN_MAX((%%eax, %1))
FIND_MIN_MAX((%%eax, %1, 2))
FIND_MIN_MAX((%0, %1, 4))
FIND_MIN_MAX((%%ebx))
FIND_MIN_MAX((%%ebx, %1))
FIND_MIN_MAX((%%ebx, %1, 2))
FIND_MIN_MAX((%0, %1, 8))

                "movq %%mm6, %%mm4                                \n\t"
                "psrlq $8, %%mm6                                \n\t"
#ifdef HAVE_MMX2
                "pminub %%mm4, %%mm6                                \n\t" // min of pixels
                "pshufw $0xF9, %%mm6, %%mm4                        \n\t"
                "pminub %%mm4, %%mm6                                \n\t" // min of pixels
                "pshufw $0xFE, %%mm6, %%mm4                        \n\t"
                "pminub %%mm4, %%mm6                                \n\t"
#else
                "movq %%mm6, %%mm1                                \n\t"
                "psubusb %%mm4, %%mm1                                \n\t"
                "psubb %%mm1, %%mm6                                \n\t"
                "movq %%mm6, %%mm4                                \n\t"
                "psrlq $16, %%mm6                                \n\t"
                "movq %%mm6, %%mm1                                \n\t"
                "psubusb %%mm4, %%mm1                                \n\t"
                "psubb %%mm1, %%mm6                                \n\t"
                "movq %%mm6, %%mm4                                \n\t"
                "psrlq $32, %%mm6                                \n\t"
                "movq %%mm6, %%mm1                                \n\t"
                "psubusb %%mm4, %%mm1                                \n\t"
                "psubb %%mm1, %%mm6                                \n\t"
#endif


                "movq %%mm7, %%mm4                                \n\t"
                "psrlq $8, %%mm7                                \n\t"
#ifdef HAVE_MMX2
                "pmaxub %%mm4, %%mm7                                \n\t" // max of pixels
                "pshufw $0xF9, %%mm7, %%mm4                        \n\t"
                "pmaxub %%mm4, %%mm7                                \n\t"
                "pshufw $0xFE, %%mm7, %%mm4                        \n\t"
                "pmaxub %%mm4, %%mm7                                \n\t"
#else
                "psubusb %%mm4, %%mm7                                \n\t"
                "paddb %%mm4, %%mm7                                \n\t"
                "movq %%mm7, %%mm4                                \n\t"
                "psrlq $16, %%mm7                                \n\t"
                "psubusb %%mm4, %%mm7                                \n\t"
                "paddb %%mm4, %%mm7                                \n\t"
                "movq %%mm7, %%mm4                                \n\t"
                "psrlq $32, %%mm7                                \n\t"
                "psubusb %%mm4, %%mm7                                \n\t"
                "paddb %%mm4, %%mm7                                \n\t"
#endif
                PAVGB(%%mm6, %%mm7)                                      // a=(max + min)/2
                "punpcklbw %%mm7, %%mm7                                \n\t"
                "punpcklbw %%mm7, %%mm7                                \n\t"
                "punpcklbw %%mm7, %%mm7                                \n\t"
                "movq %%mm7, temp0                                \n\t"

                "movq (%0), %%mm0                                \n\t" // L10
                "movq %%mm0, %%mm1                                \n\t" // L10
                "movq %%mm0, %%mm2                                \n\t" // L10
                "psllq $8, %%mm1                                \n\t"
                "psrlq $8, %%mm2                                \n\t"
                "movd -4(%0), %%mm3                                \n\t"
                "movd 8(%0), %%mm4                                \n\t"
                "psrlq $24, %%mm3                                \n\t"
                "psllq $56, %%mm4                                \n\t"
                "por %%mm3, %%mm1                                \n\t" // L00
                "por %%mm4, %%mm2                                \n\t" // L20
                "movq %%mm1, %%mm3                                \n\t" // L00
                PAVGB(%%mm2, %%mm1)                                      // (L20 + L00)/2
                PAVGB(%%mm0, %%mm1)                                      // (L20 + L00 + 2L10)/4
                "psubusb %%mm7, %%mm0                                \n\t"
                "psubusb %%mm7, %%mm2                                \n\t"
                "psubusb %%mm7, %%mm3                                \n\t"
                "pcmpeqb b00, %%mm0                                \n\t" // L10 > a ? 0 : -1
                "pcmpeqb b00, %%mm2                                \n\t" // L20 > a ? 0 : -1
                "pcmpeqb b00, %%mm3                                \n\t" // L00 > a ? 0 : -1
                "paddb %%mm2, %%mm0                                \n\t"
                "paddb %%mm3, %%mm0                                \n\t"

                "movq (%%eax), %%mm2                                \n\t" // L11
                "movq %%mm2, %%mm3                                \n\t" // L11
                "movq %%mm2, %%mm4                                \n\t" // L11
                "psllq $8, %%mm3                                \n\t"
                "psrlq $8, %%mm4                                \n\t"
                "movd -4(%%eax), %%mm5                                \n\t"
                "movd 8(%%eax), %%mm6                                \n\t"
                "psrlq $24, %%mm5                                \n\t"
                "psllq $56, %%mm6                                \n\t"
                "por %%mm5, %%mm3                                \n\t" // L01
                "por %%mm6, %%mm4                                \n\t" // L21
                "movq %%mm3, %%mm5                                \n\t" // L01
                PAVGB(%%mm4, %%mm3)                                      // (L21 + L01)/2
                PAVGB(%%mm2, %%mm3)                                      // (L21 + L01 + 2L11)/4
                "psubusb %%mm7, %%mm2                                \n\t"
                "psubusb %%mm7, %%mm4                                \n\t"
                "psubusb %%mm7, %%mm5                                \n\t"
                "pcmpeqb b00, %%mm2                                \n\t" // L11 > a ? 0 : -1
                "pcmpeqb b00, %%mm4                                \n\t" // L21 > a ? 0 : -1
                "pcmpeqb b00, %%mm5                                \n\t" // L01 > a ? 0 : -1
                "paddb %%mm4, %%mm2                                \n\t"
                "paddb %%mm5, %%mm2                                \n\t"
// 0, 2, 3, 1
#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
                "movq " #src ", " #sx "                                \n\t" /* src[0] */\
                "movq " #sx ", " #lx "                                \n\t" /* src[0] */\
                "movq " #sx ", " #t0 "                                \n\t" /* src[0] */\
                "psllq $8, " #lx "                                \n\t"\
                "psrlq $8, " #t0 "                                \n\t"\
                "movd -4" #src ", " #t1 "                        \n\t"\
                "psrlq $24, " #t1 "                                \n\t"\
                "por " #t1 ", " #lx "                                \n\t" /* src[-1] */\
                "movd 8" #src ", " #t1 "                        \n\t"\
                "psllq $56, " #t1 "                                \n\t"\
                "por " #t1 ", " #t0 "                                \n\t" /* src[+1] */\
                "movq " #lx ", " #t1 "                                \n\t" /* src[-1] */\
                PAVGB(t0, lx)                                              /* (src[-1] + src[+1])/2 */\
                PAVGB(sx, lx)                                      /* (src[-1] + 2src[0] + src[+1])/4 */\
                PAVGB(lx, pplx)                                             \
                "movq " #lx ", temp1                                \n\t"\
                "movq temp0, " #lx "                                \n\t"\
                "psubusb " #lx ", " #t1 "                        \n\t"\
                "psubusb " #lx ", " #t0 "                        \n\t"\
                "psubusb " #lx ", " #sx "                        \n\t"\
                "movq b00, " #lx "                                \n\t"\
                "pcmpeqb " #lx ", " #t1 "                        \n\t" /* src[-1] > a ? 0 : -1*/\
                "pcmpeqb " #lx ", " #t0 "                        \n\t" /* src[+1] > a ? 0 : -1*/\
                "pcmpeqb " #lx ", " #sx "                        \n\t" /* src[0]  > a ? 0 : -1*/\
                "paddb " #t1 ", " #t0 "                                \n\t"\
                "paddb " #t0 ", " #sx "                                \n\t"\
\
                PAVGB(plx, pplx)                                      /* filtered */\
                "movq " #dst ", " #t0 "                                \n\t" /* dst */\
                "movq " #t0 ", " #t1 "                                \n\t" /* dst */\
                "psubusb pQPb2, " #t0 "                                \n\t"\
                "paddusb pQPb2, " #t1 "                                \n\t"\
                PMAXUB(t0, pplx)\
                PMINUB(t1, pplx, t0)\
                "paddb " #sx ", " #ppsx "                        \n\t"\
                "paddb " #psx ", " #ppsx "                        \n\t"\
        "#paddb b02, " #ppsx "                                \n\t"\
                "pand b08, " #ppsx "                                \n\t"\
                "pcmpeqb " #lx ", " #ppsx "                        \n\t"\
                "pand " #ppsx ", " #pplx "                        \n\t"\
                "pandn " #dst ", " #ppsx "                        \n\t"\
                "por " #pplx ", " #ppsx "                        \n\t"\
                "movq " #ppsx ", " #dst "                        \n\t"\
                "movq temp1, " #lx "                                \n\t"

/*
0000000
1111111

1111110
1111101
1111100
1111011
1111010
1111001

1111000
1110111

*/
//DERING_CORE(dst,src                  ,ppsx ,psx  ,sx   ,pplx ,plx  ,lx   ,t0   ,t1)
DERING_CORE((%%eax),(%%eax, %1)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
DERING_CORE((%0, %1, 4),(%%ebx)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%%ebx),(%%ebx, %1)        ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)


                : : "r" (src), "r" (stride), "r" (QP)
                /* "memory": the template stores to the rows behind src and to
                   pQPb2 / temp0 / temp1, none of which are output operands */
                : "%eax", "%ebx", "memory"
        );
#else
        int y;
        int min=255;
        int max=0;
        int avg;
        uint8_t *p;
        /* per row: bit x (0..9)     = pixels x-1, x, x+1 are all  > avg
                    bit 16+x         = pixels x-1, x, x+1 are all <= avg
           unsigned, because the masks are built with shifts that would be
           undefined / implementation-defined on a negative int */
        uint32_t s[10];

        /* value range of the 8x8 interior (rows 1..8, columns 1..8) */
        for(y=1; y<9; y++)
        {
                int x;
                p= src + stride*y;
                for(x=1; x<9; x++)
                {
                        p++;
                        if(*p > max) max= *p;
                        if(*p < min) min= *p;
                }
        }
        avg= (min + max + 1)/2;

        /* classify the full 10x10 area against avg */
        for(y=0; y<10; y++)
        {
                int x;
                uint32_t t = 0;
                p= src + stride*y;
                for(x=0; x<10; x++)
                {
                        if(*p > avg) t |= ((uint32_t)1<<x);
                        p++;
                }
                t |= (~t)<<16;                 /* upper half: the "not above" class */
                t &= (t<<1) & (t>>1);          /* keep bits whose left & right neighbour agree */
                s[y] = t;
        }

        for(y=1; y<9; y++)
        {
                int x;
                /* require agreement in the row above and below as well */
                uint32_t t = s[y-1] & s[y] & s[y+1];
                t|= t>>16;                     /* either class qualifies */

                p= src + stride*y;
                for(x=1; x<9; x++)
                {
                        p++;
                        if(t & ((uint32_t)1<<x))
                        {
                                int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
                                      +2*(*(p     -1)) + 4*(*p         ) + 2*(*(p     +1))
                                      +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
                                f= (f + 8)>>4;

                                /* never move a pixel by more than 2*QP */
                                if     (*p + 2*QP < f) *p= *p + 2*QP;
                                else if(*p - 2*QP > f) *p= *p - 2*QP;
                                else *p=f;
                        }
                }
        }

#endif
}
2124

    
2125
/**
 * Deinterlaces the given block by linear interpolation
 * will be called for every 8x8 block and can read & write from line 4-15
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 *
 * After skipping the first 4 lines, every odd line (5, 7, 9, 11 counted from
 * the src passed in) is replaced, for 8 columns, by the rounded average of the
 * line above and the line below it; lines 4, 6, 8, 10 and 12 are only read.
 *
 * The C path rounds half up, (a + b + 1)>>1, the same way the C equivalent of
 * PAVGB is written in dering() ((min + max + 1)/2), so that the result does
 * not depend on which code path was compiled in.
 * NOTE(review): PAVGB is defined outside this function; it is assumed to
 * average its two operands into the second one -- confirm.
 *
 * @param src    pointer to line 0 of the block
 * @param stride distance in bytes between two lines
 */
static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        src+= 4*stride;
        asm volatile(
                "leal (%0, %1), %%eax                                \n\t"
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
//        0        1        2        3        4        5        6        7        8        9
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1

                "movq (%0), %%mm0                                \n\t"
                "movq (%%eax, %1), %%mm1                        \n\t"
                PAVGB(%%mm1, %%mm0)
                "movq %%mm0, (%%eax)                                \n\t"
                "movq (%0, %1, 4), %%mm0                        \n\t"
                PAVGB(%%mm0, %%mm1)
                "movq %%mm1, (%%eax, %1, 2)                        \n\t"
                "movq (%%ebx, %1), %%mm1                        \n\t"
                PAVGB(%%mm1, %%mm0)
                "movq %%mm0, (%%ebx)                                \n\t"
                "movq (%0, %1, 8), %%mm0                        \n\t"
                PAVGB(%%mm0, %%mm1)
                "movq %%mm1, (%%ebx, %1, 2)                        \n\t"

                : : "r" (src), "r" (stride)
                /* "memory": the template stores to lines behind src that are
                   not declared as output operands */
                : "%eax", "%ebx", "memory"
        );
#else
        int x;
        src+= 4*stride;
        for(x=0; x<8; x++)
        {
                /* +1: round half up like the SIMD average */
                src[stride]   = (src[0]        + src[stride*2] + 1)>>1;
                src[stride*3] = (src[stride*2] + src[stride*4] + 1)>>1;
                src[stride*5] = (src[stride*4] + src[stride*6] + 1)>>1;
                src[stride*7] = (src[stride*6] + src[stride*8] + 1)>>1;
                src++;
        }
#endif
}
2171

    
2172
/**
2173
 * Deinterlaces the given block
2174
 * will be called for every 8x8 block and can read & write from line 4-15
2175
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
2176
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
2177
 * this filter will read lines 3-15 and write 7-13
2178
 * no clipping in the C version
2179
 */
2180
static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
2181
{
2182
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2183
        src+= stride*3;
2184
        asm volatile(
2185
                "leal (%0, %1), %%eax                                \n\t"
2186
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
2187
                "leal (%%ebx, %1, 4), %%ecx                        \n\t"
2188
                "addl %1, %%ecx                                        \n\t"
2189
                "pxor %%mm7, %%mm7                                \n\t"
2190
//        0        1        2        3        4        5        6        7        8        9        10
2191
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1 ecx
2192

    
2193
#define DEINT_CUBIC(a,b,c,d,e)\
2194
                "movq " #a ", %%mm0                                \n\t"\
2195
                "movq " #b ", %%mm1                                \n\t"\
2196
                "movq " #d ", %%mm2                                \n\t"\
2197
                "movq " #e ", %%mm3                                \n\t"\
2198
                PAVGB(%%mm2, %%mm1)                                        /* (b+d) /2 */\
2199
                PAVGB(%%mm3, %%mm0)                                        /* a(a+e) /2 */\
2200
                "movq %%mm0, %%mm2                                \n\t"\
2201
                "punpcklbw %%mm7, %%mm0                                \n\t"\
2202
                "punpckhbw %%mm7, %%mm2                                \n\t"\
2203
                "movq %%mm1, %%mm3                                \n\t"\
2204
                "punpcklbw %%mm7, %%mm1                                \n\t"\
2205
                "punpckhbw %%mm7, %%mm3                                \n\t"\
2206
                "psubw %%mm1, %%mm0                                \n\t"        /* L(a+e - (b+d))/2 */\
2207
                "psubw %%mm3, %%mm2                                \n\t"        /* H(a+e - (b+d))/2 */\
2208
                "psraw $3, %%mm0                                \n\t"        /* L(a+e - (b+d))/16 */\
2209
                "psraw $3, %%mm2                                \n\t"        /* H(a+e - (b+d))/16 */\
2210
                "psubw %%mm0, %%mm1                                \n\t"        /* L(9b + 9d - a - e)/16 */\
2211
                "psubw %%mm2, %%mm3                                \n\t"        /* H(9b + 9d - a - e)/16 */\
2212
                "packuswb %%mm3, %%mm1                                \n\t"\
2213
                "movq %%mm1, " #c "                                \n\t"
2214

    
2215
DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
2216
DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
2217
DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
2218
DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))
2219

    
2220
                : : "r" (src), "r" (stride)
2221
                : "%eax", "%ebx", "ecx"
2222
        );
2223
#else
2224
        int x;
2225
        src+= stride*3;
2226
        for(x=0; x<8; x++)
2227
        {
2228
                src[stride*3] = (-src[0]        + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4;
2229
                src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4;
2230
                src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4;
2231
                src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4;
2232
                src++;
2233
        }
2234
#endif
2235
}
2236

    
2237
/**
 * Deinterlaces the given block with a vertical (1 2 1)/4 blend.
 * will be called for every 8x8 block and can read & write from line 4-15
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 * Every output row n is built from the input rows n, n+1 and n+2, so the image
 * is shifted up by 1 line (FIXME if this is a problem)
 * this filter will read lines 4-13 and write 4-11
 */
static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        src+= 4*stride;
        /* NOTE(review): the clobber list has no "memory" entry although the asm
           reads and writes the block behind src - confirm that this is intended */
        asm volatile(
                "leal (%0, %1), %%eax                                \n\t"
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
//        0        1        2        3        4        5        6        7        8        9
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1

                "movq (%0), %%mm0                                \n\t" // L0
                "movq (%%eax, %1), %%mm1                        \n\t" // L2
                PAVGB(%%mm1, %%mm0)                                      // L0+L2
                "movq (%%eax), %%mm2                                \n\t" // L1
                PAVGB(%%mm2, %%mm0)
                "movq %%mm0, (%0)                                \n\t"
                "movq (%%eax, %1, 2), %%mm0                        \n\t" // L3
                PAVGB(%%mm0, %%mm2)                                      // L1+L3
                PAVGB(%%mm1, %%mm2)                                      // 2L2 + L1 + L3
                "movq %%mm2, (%%eax)                                \n\t"
                "movq (%0, %1, 4), %%mm2                        \n\t" // L4
                PAVGB(%%mm2, %%mm1)                                      // L2+L4
                PAVGB(%%mm0, %%mm1)                                      // 2L3 + L2 + L4
                "movq %%mm1, (%%eax, %1)                        \n\t"
                "movq (%%ebx), %%mm1                                \n\t" // L5
                PAVGB(%%mm1, %%mm0)                                      // L3+L5
                PAVGB(%%mm2, %%mm0)                                      // 2L4 + L3 + L5
                "movq %%mm0, (%%eax, %1, 2)                        \n\t"
                "movq (%%ebx, %1), %%mm0                        \n\t" // L6
                PAVGB(%%mm0, %%mm2)                                      // L4+L6
                PAVGB(%%mm1, %%mm2)                                      // 2L5 + L4 + L6
                "movq %%mm2, (%0, %1, 4)                        \n\t"
                "movq (%%ebx, %1, 2), %%mm2                        \n\t" // L7
                PAVGB(%%mm2, %%mm1)                                      // L5+L7
                PAVGB(%%mm0, %%mm1)                                      // 2L6 + L5 + L7
                "movq %%mm1, (%%ebx)                                \n\t"
                "movq (%0, %1, 8), %%mm1                        \n\t" // L8
                PAVGB(%%mm1, %%mm0)                                      // L6+L8
                PAVGB(%%mm2, %%mm0)                                      // 2L7 + L6 + L8
                "movq %%mm0, (%%ebx, %1)                        \n\t"
                "movq (%%ebx, %1, 4), %%mm0                        \n\t" // L9
                PAVGB(%%mm0, %%mm2)                                      // L7+L9
                PAVGB(%%mm1, %%mm2)                                      // 2L8 + L7 + L9
                "movq %%mm2, (%%ebx, %1, 2)                        \n\t"

                : : "r" (src), "r" (stride)
                : "%eax", "%ebx"
        );
#else
        uint8_t *column= src + 4*stride;
        int col;

        for(col=0; col<8; col++, column++)
        {
                uint8_t *px= column;
                int line;

                /* top to bottom: a row is only written after every row above it,
                   so each blend still sees the unmodified rows below */
                for(line=0; line<8; line++, px+= stride)
                        px[0]= (px[0] + 2*px[stride] + px[stride*2])>>2;
        }
#endif
}
2311

    
2312
/**
 * Deinterlaces the given block with a vertical 3 tap median filter.
 * will be called for every 8x8 block and can read & write from line 4-15,
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 *
 * Line numbers are rows of src (row n starts at src + n*stride), 8 pixels wide.
 * Rows 5, 7, 9 and 11 are replaced by the per pixel median of the row itself and
 * the rows directly above and below it; rows 4, 6, 8, 10 and 12 are only read.
 */
static inline void deInterlaceMedian(uint8_t src[], int stride)
{
#ifdef HAVE_MMX
        src+= 4*stride;
        /* NOTE(review): neither asm statement below lists "memory" as clobbered
           although both read and write the block behind src - confirm that this
           is intended */
#ifdef HAVE_MMX2
        asm volatile(
                "leal (%0, %1), %%eax                                \n\t"
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
//        0        1        2        3        4        5        6        7        8        9
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1

                // median(L0, L1, L2) -> L1, as min(max(L0,L1), max(min(L0,L1), L2))
                "movq (%0), %%mm0                                \n\t" //
                "movq (%%eax, %1), %%mm2                        \n\t" //
                "movq (%%eax), %%mm1                                \n\t" //
                "movq %%mm0, %%mm3                                \n\t"
                "pmaxub %%mm1, %%mm0                                \n\t" //
                "pminub %%mm3, %%mm1                                \n\t" //
                "pmaxub %%mm2, %%mm1                                \n\t" //
                "pminub %%mm1, %%mm0                                \n\t"
                "movq %%mm0, (%%eax)                                \n\t"

                // median(L2, L3, L4) -> L3
                "movq (%0, %1, 4), %%mm0                        \n\t" //
                "movq (%%eax, %1, 2), %%mm1                        \n\t" //
                "movq %%mm2, %%mm3                                \n\t"
                "pmaxub %%mm1, %%mm2                                \n\t" //
                "pminub %%mm3, %%mm1                                \n\t" //
                "pmaxub %%mm0, %%mm1                                \n\t" //
                "pminub %%mm1, %%mm2                                \n\t"
                "movq %%mm2, (%%eax, %1, 2)                        \n\t"

                // median(L4, L5, L6) -> L5
                "movq (%%ebx), %%mm2                                \n\t" //
                "movq (%%ebx, %1), %%mm1                        \n\t" //
                "movq %%mm2, %%mm3                                \n\t"
                "pmaxub %%mm0, %%mm2                                \n\t" //
                "pminub %%mm3, %%mm0                                \n\t" //
                "pmaxub %%mm1, %%mm0                                \n\t" //
                "pminub %%mm0, %%mm2                                \n\t"
                "movq %%mm2, (%%ebx)                                \n\t"

                // median(L6, L7, L8) -> L7
                "movq (%%ebx, %1, 2), %%mm2                        \n\t" //
                "movq (%0, %1, 8), %%mm0                        \n\t" //
                "movq %%mm2, %%mm3                                \n\t"
                "pmaxub %%mm0, %%mm2                                \n\t" //
                "pminub %%mm3, %%mm0                                \n\t" //
                "pmaxub %%mm1, %%mm0                                \n\t" //
                "pminub %%mm0, %%mm2                                \n\t"
                "movq %%mm2, (%%ebx, %1, 2)                        \n\t"


                : : "r" (src), "r" (stride)
                : "%eax", "%ebx"
        );

#else // MMX without MMX2
        asm volatile(
                "leal (%0, %1), %%eax                                \n\t"
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
//        0        1        2        3        4        5        6        7        8        9
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
                "pxor %%mm7, %%mm7                                \n\t"

/* stores a value selected from a, b, c by byte wise comparison masks into b */
#define MEDIAN(a,b,c)\
                "movq " #a ", %%mm0                                \n\t"\
                "movq " #b ", %%mm2                                \n\t"\
                "movq " #c ", %%mm1                                \n\t"\
                "movq %%mm0, %%mm3                                \n\t"\
                "movq %%mm1, %%mm4                                \n\t"\
                "movq %%mm2, %%mm5                                \n\t"\
                "psubusb %%mm1, %%mm3                                \n\t"\
                "psubusb %%mm2, %%mm4                                \n\t"\
                "psubusb %%mm0, %%mm5                                \n\t"\
                "pcmpeqb %%mm7, %%mm3                                \n\t"\
                "pcmpeqb %%mm7, %%mm4                                \n\t"\
                "pcmpeqb %%mm7, %%mm5                                \n\t"\
                "movq %%mm3, %%mm6                                \n\t"\
                "pxor %%mm4, %%mm3                                \n\t"\
                "pxor %%mm5, %%mm4                                \n\t"\
                "pxor %%mm6, %%mm5                                \n\t"\
                "por %%mm3, %%mm1                                \n\t"\
                "por %%mm4, %%mm2                                \n\t"\
                "por %%mm5, %%mm0                                \n\t"\
                "pand %%mm2, %%mm0                                \n\t"\
                "pand %%mm1, %%mm0                                \n\t"\
                "movq %%mm0, " #b "                                \n\t"

MEDIAN((%0), (%%eax), (%%eax, %1))
MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8))

                : : "r" (src), "r" (stride)
                : "%eax", "%ebx"
        );
#endif // HAVE_MMX2
#else
        int x;
        src+= 4*stride;
        for(x=0; x<8; x++)
        {
                uint8_t *col= src + x;
                int y;

                /* 4 medians per column, written to rows 1, 3, 5, 7 (relative to the
                   advanced src); their inputs at the even rows are never modified */
                for(y=0; y<4; y++)
                {
                        int above= col[0];
                        int cur  = col[stride];
                        int below= col[stride*2];
                        int lo= above < below ? above : below;
                        int hi= above < below ? below : above;

                        /* median(above, cur, below) == cur limited to [lo, hi] */
                        if(cur < lo)      cur= lo;
                        else if(cur > hi) cur= hi;
                        col[stride]= cur;

                        col+= stride*2;
                }
        }
#endif // HAVE_MMX
}
2430

    
2431
#ifdef HAVE_MMX
2432
/**
 * transposes and shift the given 8x8 Block into dst1 and dst2
 *
 * Reads 8 rows of 8 bytes from src (rows are srcStride bytes apart) and stores
 * the columns of that block as 8 byte lines with a pitch of 16 bytes:
 *  - columns 0..4 are stored to dst1 at byte offsets 128, 144, 160, 176, 192
 *  - columns 3..7 are stored to dst2 at byte offsets  48,  64,  80,  96, 112
 * The first half of the asm handles source rows 0-3 (bytes 0-3 of each stored
 * line), the second half source rows 4-7 (bytes 4-7, hence the offsets + 4).
 *
 * Operands: %0 = src, %1 = srcStride, %2 = dst1, %3 = dst2.
 * Uses eax, ebx (declared as clobbered) and mm0-mm4.
 * NOTE(review): the statement is plain asm (not volatile) and does not list
 * "memory" as clobbered although it stores through %2 and %3 - confirm that
 * the callers do not depend on the compiler knowing about these stores.
 */
2435
static inline void transpose1(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
2436
{
2437
        asm(
2438
                "leal (%0, %1), %%eax                                \n\t"
2439
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
2440
//        0        1        2        3        4        5        6        7        8        9
2441
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
// source rows 0 and 1: interleave their bytes
2442
                "movq (%0), %%mm0                \n\t" // 12345678
2443
                "movq (%%eax), %%mm1                \n\t" // abcdefgh
2444
                "movq %%mm0, %%mm2                \n\t" // 12345678
2445
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
2446
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
2447

// source rows 2 and 3: interleave their bytes
2448
                "movq (%%eax, %1), %%mm1        \n\t"
2449
                "movq (%%eax, %1, 2), %%mm3        \n\t"
2450
                "movq %%mm1, %%mm4                \n\t"
2451
                "punpcklbw %%mm3, %%mm1                \n\t"
2452
                "punpckhbw %%mm3, %%mm4                \n\t"
2453

// combine the byte pairs: mm0 = columns 0,1  mm3 = columns 2,3  mm2 = columns 4,5  mm1 = columns 6,7
2454
                "movq %%mm0, %%mm3                \n\t"
2455
                "punpcklwd %%mm1, %%mm0                \n\t"
2456
                "punpckhwd %%mm1, %%mm3                \n\t"
2457
                "movq %%mm2, %%mm1                \n\t"
2458
                "punpcklwd %%mm4, %%mm2                \n\t"
2459
                "punpckhwd %%mm4, %%mm1                \n\t"
2460

// store 4 bytes per column; columns 3 and 4 go to both destinations
2461
                "movd %%mm0, 128(%2)                \n\t"
2462
                "psrlq $32, %%mm0                \n\t"
2463
                "movd %%mm0, 144(%2)                \n\t"
2464
                "movd %%mm3, 160(%2)                \n\t"
2465
                "psrlq $32, %%mm3                \n\t"
2466
                "movd %%mm3, 176(%2)                \n\t"
2467
                "movd %%mm3, 48(%3)                \n\t"
2468
                "movd %%mm2, 192(%2)                \n\t"
2469
                "movd %%mm2, 64(%3)                \n\t"
2470
                "psrlq $32, %%mm2                \n\t"
2471
                "movd %%mm2, 80(%3)                \n\t"
2472
                "movd %%mm1, 96(%3)                \n\t"
2473
                "psrlq $32, %%mm1                \n\t"
2474
                "movd %%mm1, 112(%3)                \n\t"
2475

// same again for source rows 4-7, stored 4 bytes further right
2476
                "movq (%0, %1, 4), %%mm0        \n\t" // 12345678
2477
                "movq (%%ebx), %%mm1                \n\t" // abcdefgh
2478
                "movq %%mm0, %%mm2                \n\t" // 12345678
2479
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
2480
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
2481

2482
                "movq (%%ebx, %1), %%mm1        \n\t"
2483
                "movq (%%ebx, %1, 2), %%mm3        \n\t"
2484
                "movq %%mm1, %%mm4                \n\t"
2485
                "punpcklbw %%mm3, %%mm1                \n\t"
2486
                "punpckhbw %%mm3, %%mm4                \n\t"
2487

2488
                "movq %%mm0, %%mm3                \n\t"
2489
                "punpcklwd %%mm1, %%mm0                \n\t"
2490
                "punpckhwd %%mm1, %%mm3                \n\t"
2491
                "movq %%mm2, %%mm1                \n\t"
2492
                "punpcklwd %%mm4, %%mm2                \n\t"
2493
                "punpckhwd %%mm4, %%mm1                \n\t"
2494

2495
                "movd %%mm0, 132(%2)                \n\t"
2496
                "psrlq $32, %%mm0                \n\t"
2497
                "movd %%mm0, 148(%2)                \n\t"
2498
                "movd %%mm3, 164(%2)                \n\t"
2499
                "psrlq $32, %%mm3                \n\t"
2500
                "movd %%mm3, 180(%2)                \n\t"
2501
                "movd %%mm3, 52(%3)                \n\t"
2502
                "movd %%mm2, 196(%2)                \n\t"
2503
                "movd %%mm2, 68(%3)                \n\t"
2504
                "psrlq $32, %%mm2                \n\t"
2505
                "movd %%mm2, 84(%3)                \n\t"
2506
                "movd %%mm1, 100(%3)                \n\t"
2507
                "psrlq $32, %%mm1                \n\t"
2508
                "movd %%mm1, 116(%3)                \n\t"
2509

2510

2511
        :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
2512
        : "%eax", "%ebx"
2513
        );
2514
}
2515

    
2516
/**
 * transposes the given 8x8 block
 *
 * Reads 8 lines of 8 bytes from src at byte offsets 0, 16, 32, ..., 112 (a pitch
 * of 16 bytes) and stores the columns of that block as the rows of dst, which
 * are dstStride bytes apart.
 * The first half of the asm handles source lines 0-3 (bytes 0-3 of every dst
 * row), the second half source lines 4-7 (bytes 4-7 of every dst row).
 *
 * Operands: %0 = dst, %1 = dstStride, %2 = src.
 * Uses eax, ebx (declared as clobbered) and mm0-mm4.
 * NOTE(review): the statement is plain asm (not volatile) and does not list
 * "memory" as clobbered although it stores through %0 - confirm that the
 * callers do not depend on the compiler knowing about these stores.
 */
2519
static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src)
2520
{
2521
        asm(
2522
                "leal (%0, %1), %%eax                                \n\t"
2523
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
2524
//        0        1        2        3        4        5        6        7        8        9
2525
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
// source lines 0 and 1: interleave their bytes
2526
                "movq (%2), %%mm0                \n\t" // 12345678
2527
                "movq 16(%2), %%mm1                \n\t" // abcdefgh
2528
                "movq %%mm0, %%mm2                \n\t" // 12345678
2529
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
2530
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
2531

// source lines 2 and 3: interleave their bytes
2532
                "movq 32(%2), %%mm1                \n\t"
2533
                "movq 48(%2), %%mm3                \n\t"
2534
                "movq %%mm1, %%mm4                \n\t"
2535
                "punpcklbw %%mm3, %%mm1                \n\t"
2536
                "punpckhbw %%mm3, %%mm4                \n\t"
2537

// combine the byte pairs: mm0 = columns 0,1  mm3 = columns 2,3  mm2 = columns 4,5  mm1 = columns 6,7
2538
                "movq %%mm0, %%mm3                \n\t"
2539
                "punpcklwd %%mm1, %%mm0                \n\t"
2540
                "punpckhwd %%mm1, %%mm3                \n\t"
2541
                "movq %%mm2, %%mm1                \n\t"
2542
                "punpcklwd %%mm4, %%mm2                \n\t"
2543
                "punpckhwd %%mm4, %%mm1                \n\t"
2544

// column n becomes bytes 0-3 of dst row n
2545
                "movd %%mm0, (%0)                \n\t"
2546
                "psrlq $32, %%mm0                \n\t"
2547
                "movd %%mm0, (%%eax)                \n\t"
2548
                "movd %%mm3, (%%eax, %1)        \n\t"
2549
                "psrlq $32, %%mm3                \n\t"
2550
                "movd %%mm3, (%%eax, %1, 2)        \n\t"
2551
                "movd %%mm2, (%0, %1, 4)        \n\t"
2552
                "psrlq $32, %%mm2                \n\t"
2553
                "movd %%mm2, (%%ebx)                \n\t"
2554
                "movd %%mm1, (%%ebx, %1)        \n\t"
2555
                "psrlq $32, %%mm1                \n\t"
2556
                "movd %%mm1, (%%ebx, %1, 2)        \n\t"
2557

2558

// same again for source lines 4-7, stored to bytes 4-7 of every dst row
2559
                "movq 64(%2), %%mm0                \n\t" // 12345678
2560
                "movq 80(%2), %%mm1                \n\t" // abcdefgh
2561
                "movq %%mm0, %%mm2                \n\t" // 12345678
2562
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
2563
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
2564

2565
                "movq 96(%2), %%mm1                \n\t"
2566
                "movq 112(%2), %%mm3                \n\t"
2567
                "movq %%mm1, %%mm4                \n\t"
2568
                "punpcklbw %%mm3, %%mm1                \n\t"
2569
                "punpckhbw %%mm3, %%mm4                \n\t"
2570

2571
                "movq %%mm0, %%mm3                \n\t"
2572
                "punpcklwd %%mm1, %%mm0                \n\t"
2573
                "punpckhwd %%mm1, %%mm3                \n\t"
2574
                "movq %%mm2, %%mm1                \n\t"
2575
                "punpcklwd %%mm4, %%mm2                \n\t"
2576
                "punpckhwd %%mm4, %%mm1                \n\t"
2577

2578
                "movd %%mm0, 4(%0)                \n\t"
2579
                "psrlq $32, %%mm0                \n\t"
2580
                "movd %%mm0, 4(%%eax)                \n\t"
2581
                "movd %%mm3, 4(%%eax, %1)        \n\t"
2582
                "psrlq $32, %%mm3                \n\t"
2583
                "movd %%mm3, 4(%%eax, %1, 2)        \n\t"
2584
                "movd %%mm2, 4(%0, %1, 4)        \n\t"
2585
                "psrlq $32, %%mm2                \n\t"
2586
                "movd %%mm2, 4(%%ebx)                \n\t"
2587
                "movd %%mm1, 4(%%ebx, %1)        \n\t"
2588
                "psrlq $32, %%mm1                \n\t"
2589
                "movd %%mm1, 4(%%ebx, %1, 2)        \n\t"
2590

2591
        :: "r" (dst), "r" (dstStride), "r" (src)
2592
        : "%eax", "%ebx"
2593
        );
2594
}
2595
#endif
2596

    
2597
#ifdef HAVE_ODIVX_POSTPROCESS
2598
#include "../opendivx/postprocess.h"
2599
/* if non zero, postprocess() passes the whole frame to odivx_postprocess()
   and returns without running the filters of this file */
int use_old_pp=0;
2600
#endif
2601

2602
/* forward declaration: postprocess() calls this once per plane, passing
   0, 1 and 2 as isColor for src[0], src[1] and src[2] */
static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2603
        QP_STORE_T QPs[], int QPStride, int isColor, int mode);
2604

    
2605
/* -pp Command line Help
2606
NOTE/FIXME: put this at an appropriate place (--help, html docs, man mplayer)?
2607

2608
-pp <filterName>[:<option>[:<option>...]][,[-]<filterName>[:<option>...]]...
2609

2610
long form example:
2611
-pp vdeblock:autoq,hdeblock:autoq,linblenddeint                -pp default,-vdeblock
2612
short form example:
2613
-pp vb:a,hb:a,lb                                        -pp de,-vb
2614

2615
Filters                        Options
2616
short        long name        short        long option        Description
2617
*        *                a        autoq                cpu power dependent enabler
2618
                        c        chrom                chrominance filtering enabled
2619
                        y        nochrom                chrominance filtering disabled
2620
hb        hdeblock                                horizontal deblocking filter
2621
vb        vdeblock                                vertical deblocking filter
2622
vr        rkvdeblock
2623
h1        x1hdeblock                                Experimental horizontal deblock filter 1
2624
v1        x1vdeblock                                Experimental vertical deblock filter 1
2625
dr        dering                                        not implemented yet
2626
al        autolevels                                automatic brightness / contrast fixer
2627
                        f        fullyrange        stretch luminance range to (0..255)
2628
lb        linblenddeint                                linear blend deinterlacer
2629
li        linipoldeint                                linear interpolating deinterlacer
2630
ci        cubicipoldeint                                cubic interpolating deinterlacer
2631
md        mediandeint                                median deinterlacer
2632
de        default                                        hdeblock:a,vdeblock:a,dering:a,autolevels
2633
fa        fast                                        x1hdeblock:a,x1vdeblock:a,dering:a,autolevels
2634
*/
2635

    
2636
/**
2637
 * returns a PPMode struct which will have a non 0 error variable if an error occured
2638
 * name is the string after "-pp" on the command line
2639
 * quality is a number from 0 to GET_PP_QUALITY_MAX
2640
 */
2641
struct PPMode getPPModeByNameAndQuality(char *name, int quality)
2642
{
2643
        char temp[GET_MODE_BUFFER_SIZE];
2644
        char *p= temp;
2645
        char *filterDelimiters= ",";
2646
        char *optionDelimiters= ":";
2647
        struct PPMode ppMode= {0,0,0,0,0,0};
2648
        char *filterToken;
2649

    
2650
        strncpy(temp, name, GET_MODE_BUFFER_SIZE);
2651

    
2652
        for(;;){
2653
                char *filterName;
2654
                int q= GET_PP_QUALITY_MAX;
2655
                int chrom=-1;
2656
                char *option;
2657
                char *options[OPTIONS_ARRAY_SIZE];
2658
                int i;
2659
                int filterNameOk=0;
2660
                int numOfUnknownOptions=0;
2661
                int enable=1; //does the user want us to enabled or disabled the filter
2662

    
2663
                filterToken= strtok(p, filterDelimiters);
2664
                if(filterToken == NULL) break;
2665
                p+= strlen(filterToken) + 1;
2666
                filterName= strtok(filterToken, optionDelimiters);
2667
                printf("%s::%s\n", filterToken, filterName);
2668

    
2669
                if(*filterName == '-')
2670
                {
2671
                        enable=0;
2672
                        filterName++;
2673
                }
2674
                for(;;){ //for all options
2675
                        option= strtok(NULL, optionDelimiters);
2676
                        if(option == NULL) break;
2677

    
2678
                        printf("%s\n", option);
2679
                        if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
2680
                        else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
2681
                        else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
2682
                        else
2683
                        {
2684
                                options[numOfUnknownOptions] = option;
2685
                                numOfUnknownOptions++;
2686
                                options[numOfUnknownOptions] = NULL;
2687
                        }
2688
                        if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
2689
                }
2690

    
2691
                /* replace stuff from the replace Table */
2692
                for(i=0; replaceTable[2*i]!=NULL; i++)
2693
                {
2694
                        if(!strcmp(replaceTable[2*i], filterName))
2695
                        {
2696
                                int newlen= strlen(replaceTable[2*i + 1]);
2697
                                int plen;
2698
                                int spaceLeft;
2699

    
2700
                                if(p==NULL) p= temp, *p=0;         //last filter
2701
                                else p--, *p=',';                //not last filter
2702

    
2703
                                plen= strlen(p);
2704
                                spaceLeft= (int)p - (int)temp + plen;
2705
                                if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE)
2706
                                {
2707
                                        ppMode.error++;
2708
                                        break;
2709
                                }
2710
                                memmove(p + newlen, p, plen+1);
2711
                                memcpy(p, replaceTable[2*i + 1], newlen);
2712
                                filterNameOk=1;
2713
                        }
2714
                }
2715

    
2716
                for(i=0; filters[i].shortName!=NULL; i++)
2717
                {
2718
                        if(   !strcmp(filters[i].longName, filterName)
2719
                           || !strcmp(filters[i].shortName, filterName))
2720
                        {
2721
                                ppMode.lumMode &= ~filters[i].mask;
2722
                                ppMode.chromMode &= ~filters[i].mask;
2723

    
2724
                                filterNameOk=1;
2725
                                if(!enable) break; // user wants to disable it
2726

    
2727
                                if(q >= filters[i].minLumQuality)
2728
                                        ppMode.lumMode|= filters[i].mask;
2729
                                if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
2730
                                        if(q >= filters[i].minChromQuality)
2731
                                                ppMode.chromMode|= filters[i].mask;
2732

    
2733
                                if(filters[i].mask == LEVEL_FIX)
2734
                                {
2735
                                        int o;
2736
                                        ppMode.minAllowedY= 16;
2737
                                        ppMode.maxAllowedY= 234;
2738
                                        for(o=0; options[o]!=NULL; o++)
2739
                                                if(  !strcmp(options[o],"fullyrange")
2740
                                                   ||!strcmp(options[o],"f"))
2741
                                                {
2742
                                                        ppMode.minAllowedY= 0;
2743
                                                        ppMode.maxAllowedY= 255;
2744
                                                        numOfUnknownOptions--;
2745
                                                }
2746
                                }
2747
                        }
2748
                }
2749
                if(!filterNameOk) ppMode.error++;
2750
                ppMode.error += numOfUnknownOptions;
2751
        }
2752

    
2753
        if(ppMode.lumMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_H;
2754
        if(ppMode.lumMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_V;
2755
        if(ppMode.chromMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_H;
2756
        if(ppMode.chromMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_V;
2757
        if(ppMode.lumMode & DERING) ppMode.oldMode |= PP_DERING_Y;
2758
        if(ppMode.chromMode & DERING) ppMode.oldMode |= PP_DERING_C;
2759

    
2760
        return ppMode;
2761
}
2762

    
2763
/**
2764
 * ...
2765
 */
2766
void  postprocess(unsigned char * src[], int src_stride,
2767
                 unsigned char * dst[], int dst_stride,
2768
                 int horizontal_size,   int vertical_size,
2769
                 QP_STORE_T *QP_store,  int QP_stride,
2770
                                          int mode)
2771
{
2772
/*
2773
        static int qual=0;
2774

2775
        struct PPMode ppMode= getPPModeByNameAndQuality("fast,default,-hdeblock,-vdeblock", qual);
2776
        qual++;
2777
        qual%=7;
2778
        printf("\n%d %d %d %d\n", ppMode.lumMode, ppMode.chromMode, ppMode.oldMode, ppMode.error);
2779
        postprocess2(src, src_stride, dst, dst_stride,
2780
                 horizontal_size, vertical_size, QP_store, QP_stride, &ppMode);
2781

2782
        return;
2783
*/
2784

    
2785
#ifdef HAVE_ODIVX_POSTPROCESS
2786
// Note: I could make this shit outside of this file, but it would mean one
2787
// more function call...
2788
        if(use_old_pp){
2789
            odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,mode);
2790
            return;
2791
        }
2792
#endif
2793

    
2794
        postProcess(src[0], src_stride, dst[0], dst_stride,
2795
                horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
2796

    
2797
        horizontal_size >>= 1;
2798
        vertical_size   >>= 1;
2799
        src_stride      >>= 1;
2800
        dst_stride      >>= 1;
2801
        mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
2802
//        mode&= ~(LINEAR_IPOL_DEINT_FILTER | LINEAR_BLEND_DEINT_FILTER |
2803
//                 MEDIAN_DEINT_FILTER | CUBIC_IPOL_DEINT_FILTER);
2804

    
2805
        if(1)
2806
        {
2807
                postProcess(src[1], src_stride, dst[1], dst_stride,
2808
                        horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
2809
                postProcess(src[2], src_stride, dst[2], dst_stride,
2810
                        horizontal_size, vertical_size, QP_store, QP_stride, 2, mode);
2811
        }
2812
        else
2813
        {
2814
                memcpy(dst[1], src[1], src_stride*horizontal_size);
2815
                memcpy(dst[2], src[2], src_stride*horizontal_size);
2816
        }
2817
}
2818

    
2819
void  postprocess2(unsigned char * src[], int src_stride,
2820
                 unsigned char * dst[], int dst_stride,
2821
                 int horizontal_size,   int vertical_size,
2822
                 QP_STORE_T *QP_store,  int QP_stride,
2823
                 struct PPMode *mode)
2824
{
2825

    
2826
#ifdef HAVE_ODIVX_POSTPROCESS
2827
// Note: I could make this shit outside of this file, but it would mean one
2828
// more function call...
2829
        if(use_old_pp){
2830
            odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,
2831
            mode->oldMode);
2832
            return;
2833
        }
2834
#endif
2835

    
2836
        postProcess(src[0], src_stride, dst[0], dst_stride,
2837
                horizontal_size, vertical_size, QP_store, QP_stride, 0, mode->lumMode);
2838

    
2839
        horizontal_size >>= 1;
2840
        vertical_size   >>= 1;
2841
        src_stride      >>= 1;
2842
        dst_stride      >>= 1;
2843

    
2844
        postProcess(src[1], src_stride, dst[1], dst_stride,
2845
                horizontal_size, vertical_size, QP_store, QP_stride, 1, mode->chromMode);
2846
        postProcess(src[2], src_stride, dst[2], dst_stride,
2847
                horizontal_size, vertical_size, QP_store, QP_stride, 2, mode->chromMode);
2848
}
2849

    
2850

    
2851
/**
2852
 * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
2853
 * 0 <= quality <= 6
2854
 */
2855
int getPpModeForQuality(int quality){
2856
        int modes[1+GET_PP_QUALITY_MAX]= {
2857
                0,
2858
#if 1
2859
                // horizontal filters first
2860
                LUM_H_DEBLOCK,
2861
                LUM_H_DEBLOCK | LUM_V_DEBLOCK,
2862
                LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK,
2863
                LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK,
2864
                LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING,
2865
                LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING | CHROM_DERING
2866
#else
2867
                // vertical filters first
2868
                LUM_V_DEBLOCK,
2869
                LUM_V_DEBLOCK | LUM_H_DEBLOCK,
2870
                LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
2871
                LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
2872
                LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
2873
                LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
2874
#endif
2875
        };
2876

    
2877
#ifdef HAVE_ODIVX_POSTPROCESS
2878
        int odivx_modes[1+GET_PP_QUALITY_MAX]= {
2879
                0,
2880
                PP_DEBLOCK_Y_H,
2881
                PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V,
2882
                PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H,
2883
                PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V,
2884
                PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y,
2885
                PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y|PP_DERING_C
2886
        };
2887
        if(use_old_pp) return odivx_modes[quality];
2888
#endif
2889
        return modes[quality];
2890
}
2891

    
2892
/**
2893
 * Copies a block from src to dst and fixes the blacklevel
2894
 * numLines must be a multiple of 4
2895
 * levelFix == 0 -> dont touch the brighness & contrast
2896
 */
2897
static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
2898
        int numLines, int levelFix)
2899
{
2900
#ifndef HAVE_MMX
2901
        int i;
2902
#endif
2903
        if(levelFix)
2904
        {
2905
#ifdef HAVE_MMX
2906
                                        asm volatile(
2907
                                                "leal (%2,%2), %%eax        \n\t"
2908
                                                "leal (%3,%3), %%ebx        \n\t"
2909
                                                "movq packedYOffset, %%mm2        \n\t"
2910
                                                "movq packedYScale, %%mm3        \n\t"
2911
                                                "pxor %%mm4, %%mm4        \n\t"
2912

    
2913
#define SCALED_CPY                                        \
2914
                                                "movq (%0), %%mm0        \n\t"\
2915
                                                "movq (%0), %%mm5        \n\t"\
2916
                                                "punpcklbw %%mm4, %%mm0 \n\t"\
2917
                                                "punpckhbw %%mm4, %%mm5 \n\t"\
2918
                                                "psubw %%mm2, %%mm0        \n\t"\
2919
                                                "psubw %%mm2, %%mm5        \n\t"\
2920
                                                "movq (%0,%2), %%mm1        \n\t"\
2921
                                                "psllw $6, %%mm0        \n\t"\
2922
                                                "psllw $6, %%mm5        \n\t"\
2923
                                                "pmulhw %%mm3, %%mm0        \n\t"\
2924
                                                "movq (%0,%2), %%mm6        \n\t"\
2925
                                                "pmulhw %%mm3, %%mm5        \n\t"\
2926
                                                "punpcklbw %%mm4, %%mm1 \n\t"\
2927
                                                "punpckhbw %%mm4, %%mm6 \n\t"\
2928
                                                "psubw %%mm2, %%mm1        \n\t"\
2929
                                                "psubw %%mm2, %%mm6        \n\t"\
2930
                                                "psllw $6, %%mm1        \n\t"\
2931
                                                "psllw $6, %%mm6        \n\t"\
2932
                                                "pmulhw %%mm3, %%mm1        \n\t"\
2933
                                                "pmulhw %%mm3, %%mm6        \n\t"\
2934
                                                "addl %%eax, %0                \n\t"\
2935
                                                "packuswb %%mm5, %%mm0        \n\t"\
2936
                                                "packuswb %%mm6, %%mm1        \n\t"\
2937
                                                "movq %%mm0, (%1)        \n\t"\
2938
                                                "movq %%mm1, (%1, %3)        \n\t"\
2939

    
2940
SCALED_CPY
2941
                                                "addl %%ebx, %1                \n\t"
2942
SCALED_CPY
2943
                                                "addl %%ebx, %1                \n\t"
2944
SCALED_CPY
2945
                                                "addl %%ebx, %1                \n\t"
2946
SCALED_CPY
2947

    
2948
                                                : "+r"(src),
2949
                                                "+r"(dst)
2950
                                                :"r" (srcStride),
2951
                                                "r" (dstStride)
2952
                                                : "%eax", "%ebx"
2953
                                        );
2954
#else
2955
                                for(i=0; i<numLines; i++)
2956
                                        memcpy(        &(dst[dstStride*i]),
2957
                                                &(src[srcStride*i]), BLOCK_SIZE);
2958
#endif
2959
        }
2960
        else
2961
        {
2962
#ifdef HAVE_MMX
2963
                                        asm volatile(
2964
                                                "movl %4, %%eax \n\t"
2965
                                                "movl %%eax, temp0\n\t"
2966
                                                "pushl %0 \n\t"
2967
                                                "pushl %1 \n\t"
2968
                                                "leal (%2,%2), %%eax        \n\t"
2969
                                                "leal (%3,%3), %%ebx        \n\t"
2970
                                                "movq packedYOffset, %%mm2        \n\t"
2971
                                                "movq packedYScale, %%mm3        \n\t"
2972

    
2973
#define SIMPLE_CPY                                        \
2974
                                                "movq (%0), %%mm0        \n\t"\
2975
                                                "movq (%0,%2), %%mm1        \n\t"\
2976
                                                "movq %%mm0, (%1)        \n\t"\
2977
                                                "movq %%mm1, (%1, %3)        \n\t"\
2978

    
2979
                                                "1:                        \n\t"
2980
SIMPLE_CPY
2981
                                                "addl %%eax, %0                \n\t"
2982
                                                "addl %%ebx, %1                \n\t"
2983
SIMPLE_CPY
2984
                                                "addl %%eax, %0                \n\t"
2985
                                                "addl %%ebx, %1                \n\t"
2986
                                                "decl temp0                \n\t"
2987
                                                "jnz 1b                        \n\t"
2988

    
2989
                                                "popl %1 \n\t"
2990
                                                "popl %0 \n\t"
2991
                                                : : "r" (src),
2992
                                                "r" (dst),
2993
                                                "r" (srcStride),
2994
                                                "r" (dstStride),
2995
                                                "m" (numLines>>2)
2996
                                                : "%eax", "%ebx"
2997
                                        );
2998
#else
2999
                                for(i=0; i<numLines; i++)
3000
                                        memcpy(        &(dst[dstStride*i]),
3001
                                                &(src[srcStride*i]), BLOCK_SIZE);
3002
#endif
3003
        }
3004
}
3005

    
3006

    
3007
/**
3008
 * Filters array of bytes (Y or U or V values)
3009
 */
3010
static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3011
        QP_STORE_T QPs[], int QPStride, int isColor, int mode)
3012
{
3013
        int x,y;
3014
        /* we need 64bit here, otherwise we are going to have a problem
3015
           after watching a black picture for 5 hours*/
3016
        static uint64_t *yHistogram= NULL;
3017
        int black=0, white=255; // blackest black and whitest white in the picture
3018
        int QPCorrecture= 256;
3019

    
3020
        /* Temporary buffers for handling the last row(s) */
3021
        static uint8_t *tempDst= NULL;
3022
        static uint8_t *tempSrc= NULL;
3023

    
3024
        /* Temporary buffers for handling the last block */
3025
        static uint8_t *tempDstBlock= NULL;
3026
        static uint8_t *tempSrcBlock= NULL;
3027

    
3028
#ifdef PP_FUNNY_STRIDE
3029
        uint8_t *dstBlockPtrBackup;
3030
        uint8_t *srcBlockPtrBackup;
3031
#endif
3032

    
3033
#ifdef MORE_TIMING
3034
        long long T0, T1, diffTime=0;
3035
#endif
3036
#ifdef TIMING
3037
        long long memcpyTime=0, vertTime=0, horizTime=0, sumTime;
3038
        sumTime= rdtsc();
3039
#endif
3040
//mode= 0x7F;
3041

    
3042
        if(tempDst==NULL)
3043
        {
3044
                tempDst= (uint8_t*)memalign(8, 1024*24);
3045
                tempSrc= (uint8_t*)memalign(8, 1024*24);
3046
                tempDstBlock= (uint8_t*)memalign(8, 1024*24);
3047
                tempSrcBlock= (uint8_t*)memalign(8, 1024*24);
3048
        }
3049

    
3050
        if(!yHistogram)
3051
        {
3052
                int i;
3053
                yHistogram= (uint64_t*)malloc(8*256);
3054
                for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
3055

    
3056
                if(mode & FULL_Y_RANGE)
3057
                {
3058
                        maxAllowedY=255;
3059
                        minAllowedY=0;
3060
                }
3061
        }
3062

    
3063
        if(!isColor)
3064
        {
3065
                uint64_t sum= 0;
3066
                int i;
3067
                static int framenum= -1;
3068
                uint64_t maxClipped;
3069
                uint64_t clipped;
3070
                double scale;
3071

    
3072
                framenum++;
3073
                if(framenum == 1) yHistogram[0]= width*height/64*15/256;
3074

    
3075
                for(i=0; i<256; i++)
3076
                {
3077
                        sum+= yHistogram[i];
3078
//                        printf("%d ", yHistogram[i]);
3079
                }
3080
//                printf("\n\n");
3081

    
3082
                /* we always get a completely black picture first */
3083
                maxClipped= (uint64_t)(sum * maxClippedThreshold);
3084

    
3085
                clipped= sum;
3086
                for(black=255; black>0; black--)
3087
                {
3088
                        if(clipped < maxClipped) break;
3089
                        clipped-= yHistogram[black];
3090
                }
3091

    
3092
                clipped= sum;
3093
                for(white=0; white<256; white++)
3094
                {
3095
                        if(clipped < maxClipped) break;
3096
                        clipped-= yHistogram[white];
3097
                }
3098

    
3099
                packedYOffset= (black - minAllowedY) & 0xFFFF;
3100
                packedYOffset|= packedYOffset<<32;
3101
                packedYOffset|= packedYOffset<<16;
3102

    
3103
                scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
3104

    
3105
                packedYScale= (uint16_t)(scale*1024.0 + 0.5);
3106
                packedYScale|= packedYScale<<32;
3107
                packedYScale|= packedYScale<<16;
3108
        }
3109
        else
3110
        {
3111
                packedYScale= 0x0100010001000100LL;
3112
                packedYOffset= 0;
3113
        }
3114

    
3115
        if(mode & LEVEL_FIX)        QPCorrecture= packedYScale &0xFFFF;
3116
        else                        QPCorrecture= 256;
3117

    
3118
        /* copy & deinterlace first row of blocks */
3119
        y=-BLOCK_SIZE;
3120
        {
3121
                //1% speedup if these are here instead of the inner loop
3122
                uint8_t *srcBlock= &(src[y*srcStride]);
3123
                uint8_t *dstBlock= &(dst[y*dstStride]);
3124

    
3125
                dstBlock= tempDst + dstStride;
3126

    
3127
                // From this point on it is guaranteed that we can read and write 16 lines downward
3128
                // finish 1 block before the next, otherwise we might have a problem
3129
                // with the L1 Cache of the P4 ... or only a few blocks at a time or something
3130
                for(x=0; x<width; x+=BLOCK_SIZE)
3131
                {
3132

    
3133
#ifdef HAVE_MMX2
3134
/*
3135
                        prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3136
                        prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3137
                        prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3138
                        prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3139
*/
3140
/*
3141
                        prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3142
                        prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3143
                        prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3144
                        prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3145
*/
3146

    
3147
                        asm(
3148
                                "movl %4, %%eax                        \n\t"
3149
                                "shrl $2, %%eax                        \n\t"
3150
                                "andl $6, %%eax                        \n\t"
3151
                                "addl $8, %%eax                        \n\t"
3152
                                "movl %%eax, %%ebx                \n\t"
3153
                                "imul %1, %%eax                        \n\t"
3154
                                "imul %3, %%ebx                        \n\t"
3155
                                "prefetchnta 32(%%eax, %0)        \n\t"
3156
                                "prefetcht0 32(%%ebx, %2)        \n\t"
3157
                                "addl %1, %%eax                        \n\t"
3158
                                "addl %3, %%ebx                        \n\t"
3159
                                "prefetchnta 32(%%eax, %0)        \n\t"
3160
                                "prefetcht0 32(%%ebx, %2)        \n\t"
3161
                        :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
3162
                        "m" (x)
3163
                        : "%eax", "%ebx"
3164
                        );
3165

    
3166
#elif defined(HAVE_3DNOW)
3167
//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
3168
/*                        prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3169
                        prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3170
                        prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3171
                        prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3172
*/
3173
#endif
3174

    
3175
                        blockCopy(dstBlock + dstStride*8, dstStride,
3176
                                srcBlock + srcStride*8, srcStride, 8, mode & LEVEL_FIX);
3177

    
3178
                        if(mode & LINEAR_IPOL_DEINT_FILTER)
3179
                                deInterlaceInterpolateLinear(dstBlock, dstStride);
3180
                        else if(mode & LINEAR_BLEND_DEINT_FILTER)
3181
                                deInterlaceBlendLinear(dstBlock, dstStride);
3182
                        else if(mode & MEDIAN_DEINT_FILTER)
3183
                                deInterlaceMedian(dstBlock, dstStride);
3184
                        else if(mode & CUBIC_IPOL_DEINT_FILTER)
3185
                                deInterlaceInterpolateCubic(dstBlock, dstStride);
3186
/*                        else if(mode & CUBIC_BLEND_DEINT_FILTER)
3187
                                deInterlaceBlendCubic(dstBlock, dstStride);
3188
*/
3189
                        dstBlock+=8;
3190
                        srcBlock+=8;
3191
                }
3192
                memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, 8*dstStride );
3193
        }
3194

    
3195
        for(y=0; y<height; y+=BLOCK_SIZE)
3196
        {
3197
                //1% speedup if these are here instead of the inner loop
3198
                uint8_t *srcBlock= &(src[y*srcStride]);
3199
                uint8_t *dstBlock= &(dst[y*dstStride]);
3200
#ifdef ARCH_X86
3201
                int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
3202
                int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4);
3203
                int QPFrac= QPDelta;
3204
                uint8_t *tempBlock1= tempBlocks;
3205
                uint8_t *tempBlock2= tempBlocks + 8;
3206
#endif
3207
                /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3208
                   if not, then use a temporary buffer */
3209
                if(y+15 >= height)
3210
                {
3211
                        /* copy from line 8 to 15 of src, these will be copied with
3212
                           blockcopy to dst later */
3213
                        memcpy(tempSrc + srcStride*8, srcBlock + srcStride*8,
3214
                                srcStride*MAX(height-y-8, 0) );
3215

    
3216
                        /* duplicate last line to fill the void up to line 15 */
3217
                        if(y+15 >= height)
3218
                        {
3219
                                int i;
3220
                                for(i=height-y; i<=15; i++)
3221
                                        memcpy(tempSrc + srcStride*i,
3222
                                                src + srcStride*(height-1), srcStride);
3223
                        }
3224

    
3225
                        /* copy up to 9 lines of dst */
3226
                        memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, 9) );
3227
                        dstBlock= tempDst + dstStride;
3228
                        srcBlock= tempSrc;
3229
                }
3230

    
3231
                // From this point on it is guaranteed that we can read and write 16 lines downward
3232
                // finish 1 block before the next, otherwise we might have a problem
3233
                // with the L1 Cache of the P4 ... or only a few blocks at a time or something
3234
                for(x=0; x<width; x+=BLOCK_SIZE)
3235
                {
3236
                        const int stride= dstStride;
3237
                        uint8_t *tmpXchg;
3238
#ifdef ARCH_X86
3239
                        int QP= *QPptr;
3240
                        asm volatile(
3241
                                "addl %2, %1                \n\t"
3242
                                "sbbl %%eax, %%eax        \n\t"
3243
                                "shll $2, %%eax                \n\t"
3244
                                "subl %%eax, %0                \n\t"
3245
                                : "+r" (QPptr), "+m" (QPFrac)
3246
                                : "r" (QPDelta)
3247
                                : "%eax"
3248
                        );
3249
#else
3250
                        int QP= isColor ?
3251
                                QPs[(y>>3)*QPStride + (x>>3)]:
3252
                                QPs[(y>>4)*QPStride + (x>>4)];
3253
#endif
3254
                        if(!isColor)
3255
                        {
3256
                                QP= (QP* QPCorrecture)>>8;
3257
                                yHistogram[ srcBlock[srcStride*12 + 4] ]++;
3258
                        }
3259
#ifdef HAVE_MMX
3260
                        asm volatile(
3261
                                "movd %0, %%mm7                                        \n\t"
3262
                                "packuswb %%mm7, %%mm7                                \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
3263
                                "packuswb %%mm7, %%mm7                                \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
3264
                                "packuswb %%mm7, %%mm7                                \n\t" // QP,..., QP
3265
                                "movq %%mm7, pQPb                                \n\t"
3266
                                : : "r" (QP)
3267
                        );
3268
#endif
3269

    
3270
#ifdef MORE_TIMING
3271
                        T0= rdtsc();
3272
#endif
3273

    
3274
#ifdef HAVE_MMX2
3275
/*
3276
                        prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3277
                        prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3278
                        prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3279
                        prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3280
*/
3281
/*
3282
                        prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3283
                        prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3284
                        prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3285
                        prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3286
*/
3287

    
3288
                        asm(
3289
                                "movl %4, %%eax                        \n\t"
3290
                                "shrl $2, %%eax                        \n\t"
3291
                                "andl $6, %%eax                        \n\t"
3292
                                "addl $8, %%eax                        \n\t"
3293
                                "movl %%eax, %%ebx                \n\t"
3294
                                "imul %1, %%eax                        \n\t"
3295
                                "imul %3, %%ebx                        \n\t"
3296
                                "prefetchnta 32(%%eax, %0)        \n\t"
3297
                                "prefetcht0 32(%%ebx, %2)        \n\t"
3298
                                "addl %1, %%eax                        \n\t"
3299
                                "addl %3, %%ebx                        \n\t"
3300
                                "prefetchnta 32(%%eax, %0)        \n\t"
3301
                                "prefetcht0 32(%%ebx, %2)        \n\t"
3302
                        :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
3303
                        "m" (x)
3304
                        : "%eax", "%ebx"
3305
                        );
3306

    
3307
#elif defined(HAVE_3DNOW)
3308
//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
3309
/*                        prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3310
                        prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3311
                        prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3312
                        prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3313
*/
3314
#endif
3315

    
3316
#ifdef PP_FUNNY_STRIDE
3317
                        //can we mess with a 8x16 block, if not use a temp buffer, yes again
3318
                        if(x+7 >= width)
3319
                        {
3320
                                int i;
3321
                                dstBlockPtrBackup= dstBlock;
3322
                                srcBlockPtrBackup= srcBlock;
3323

    
3324
                                for(i=0;i<BLOCK_SIZE*2; i++)
3325
                                {
3326
                                        memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x);
3327
                                        memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x);
3328
                                }
3329

    
3330
                                dstBlock= tempDstBlock;
3331
                                srcBlock= tempSrcBlock;
3332
                        }
3333
#endif
3334

    
3335
                        blockCopy(dstBlock + dstStride*8, dstStride,
3336
                                srcBlock + srcStride*8, srcStride, 8, mode & LEVEL_FIX);
3337

    
3338
                        if(mode & LINEAR_IPOL_DEINT_FILTER)
3339
                                deInterlaceInterpolateLinear(dstBlock, dstStride);
3340
                        else if(mode & LINEAR_BLEND_DEINT_FILTER)
3341
                                deInterlaceBlendLinear(dstBlock, dstStride);
3342
                        else if(mode & MEDIAN_DEINT_FILTER)
3343
                                deInterlaceMedian(dstBlock, dstStride);
3344
                        else if(mode & CUBIC_IPOL_DEINT_FILTER)
3345
                                deInterlaceInterpolateCubic(dstBlock, dstStride);
3346
/*                        else if(mode & CUBIC_BLEND_DEINT_FILTER)
3347
                                deInterlaceBlendCubic(dstBlock, dstStride);
3348
*/
3349

    
3350
                        /* only deblock if we have 2 blocks */
3351
                        if(y + 8 < height)
3352
                        {
3353
#ifdef MORE_TIMING
3354
                                T1= rdtsc();
3355
                                memcpyTime+= T1-T0;
3356
                                T0=T1;
3357
#endif
3358
                                if(mode & V_RK1_FILTER)
3359
                                        vertRK1Filter(dstBlock, stride, QP);
3360
                                else if(mode & V_X1_FILTER)
3361
                                        vertX1Filter(dstBlock, stride, QP);
3362
                                else if(mode & V_DEBLOCK)
3363
                                {
3364
                                        if( isVertDC(dstBlock, stride))
3365
                                        {
3366
                                                if(isVertMinMaxOk(dstBlock, stride, QP))
3367
                                                        doVertLowPass(dstBlock, stride, QP);
3368
                                        }
3369
                                        else
3370
                                                doVertDefFilter(dstBlock, stride, QP);
3371
                                }
3372
#ifdef MORE_TIMING
3373
                                T1= rdtsc();
3374
                                vertTime+= T1-T0;
3375
                                T0=T1;
3376
#endif
3377
                        }
3378

    
3379
#ifdef HAVE_MMX
3380
                        transpose1(tempBlock1, tempBlock2, dstBlock, dstStride);
3381
#endif
3382
                        /* check if we have a previous block to deblock it with dstBlock */
3383
                        if(x - 8 >= 0)
3384
                        {
3385
#ifdef MORE_TIMING
3386
                                T0= rdtsc();
3387
#endif
3388
#ifdef HAVE_MMX
3389
                                if(mode & H_RK1_FILTER)
3390
                                        vertRK1Filter(tempBlock1, 16, QP);
3391
                                else if(mode & H_X1_FILTER)
3392
                                        vertX1Filter(tempBlock1, 16, QP);
3393
                                else if(mode & H_DEBLOCK)
3394
                                {
3395
                                        if( isVertDC(tempBlock1, 16))
3396
                                        {
3397
                                                if(isVertMinMaxOk(tempBlock1, 16, QP))
3398
                                                        doVertLowPass(tempBlock1, 16, QP);
3399
                                        }
3400
                                        else
3401
                                                doVertDefFilter(tempBlock1, 16, QP);
3402
                                }
3403

    
3404
                                transpose2(dstBlock-4, dstStride, tempBlock1 + 4*16);
3405

    
3406
#else
3407
                                if(mode & H_X1_FILTER)
3408
                                        horizX1Filter(dstBlock-4, stride, QP);
3409
                                else if(mode & H_DEBLOCK)
3410
                                {
3411
                                        if( isHorizDC(dstBlock-4, stride))
3412
                                        {
3413
                                                if(isHorizMinMaxOk(dstBlock-4, stride, QP))
3414
                                                        doHorizLowPass(dstBlock-4, stride, QP);
3415
                                        }
3416
                                        else
3417
                                                doHorizDefFilter(dstBlock-4, stride, QP);
3418
                                }
3419
#endif
3420
#ifdef MORE_TIMING
3421
                                T1= rdtsc();
3422
                                horizTime+= T1-T0;
3423
                                T0=T1;
3424
#endif
3425
                                if(mode & DERING)
3426
                                {
3427
                                //FIXME filter first line
3428
                                        if(y>0) dering(dstBlock - stride - 8, stride, QP);
3429
                                }
3430
                        }
3431
                        else if(mode & DERING)
3432
                        {
3433
                         //FIXME y+15 is required because of the tempBuffer thing -> bottom right block isn't filtered
3434
                                        if(y > 8 && y+15 < height) dering(dstBlock - stride*9 + width - 8, stride, QP);
3435
                        }
3436

    
3437

    
3438
#ifdef PP_FUNNY_STRIDE
3439
                        /* did we use a tmp-block buffer? */
3440
                        if(x+7 >= width)
3441
                        {
3442
                                int i;
3443
                                dstBlock= dstBlockPtrBackup;
3444
                                srcBlock= srcBlockPtrBackup;
3445

    
3446
                                for(i=0;i<BLOCK_SIZE*2; i++)
3447
                                {
3448
                                        memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x);
3449
                                }
3450
                        }
3451
#endif
3452

    
3453
                        dstBlock+=8;
3454
                        srcBlock+=8;
3455

    
3456
#ifdef HAVE_MMX
3457
                        tmpXchg= tempBlock1;
3458
                        tempBlock1= tempBlock2;
3459
                        tempBlock2 = tmpXchg;
3460
#endif
3461
                }
3462

    
3463
                /* did we use a tmp buffer for the last lines? */
3464
                if(y+15 >= height)
3465
                {
3466
                        uint8_t *dstBlock= &(dst[y*dstStride]);
3467
                        memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
3468
                }
3469
        }
3470
#ifdef HAVE_3DNOW
3471
        asm volatile("femms");
3472
#elif defined (HAVE_MMX)
3473
        asm volatile("emms");
3474
#endif
3475

    
3476
#ifdef TIMING
3477
        // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
3478
        sumTime= rdtsc() - sumTime;
3479
        if(!isColor)
3480
                printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d    \r",
3481
                        (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
3482
                        (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
3483
                        , black, white);
3484
#endif
3485
}