Statistics
| Branch: | Revision:

ffmpeg / postproc / postprocess_template.c @ 07f8991b

History | View | Annotate | Download (91.1 KB)

1
/*
2
    Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
3

4
    This program is free software; you can redistribute it and/or modify
5
    it under the terms of the GNU General Public License as published by
6
    the Free Software Foundation; either version 2 of the License, or
7
    (at your option) any later version.
8

9
    This program is distributed in the hope that it will be useful,
10
    but WITHOUT ANY WARRANTY; without even the implied warranty of
11
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
    GNU General Public License for more details.
13

14
    You should have received a copy of the GNU General Public License
15
    along with this program; if not, write to the Free Software
16
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17
*/
18

    
19
/*
 * Instruction-selection macros. Each one expands to a string literal holding
 * the assembler text for one byte-wise operation, so that it can be dropped
 * straight into an asm() statement. In every variant the LAST register
 * argument is the destination.
 *
 * Any earlier definitions are removed first (presumably because this template
 * is included more than once with different HAVE_* settings - TODO confirm).
 */
#undef PAVGB
#undef PMINUB
#undef PMAXUB

#if defined (HAVE_MMX2)

/* MMX2: one native instruction per operation (tmp is unused by PMINUB). */
#define PAVGB(src,dst) "pavgb " #src ", " #dst " \n\t"
#define PMINUB(src,dst,tmp) "pminub " #src ", " #dst " \n\t"
#define PMAXUB(src,dst) "pmaxub " #src ", " #dst " \n\t"

#else /* !HAVE_MMX2 */

#if defined (HAVE_3DNOW)
/* 3DNow: byte average through pavgusb. */
#define PAVGB(src,dst) "pavgusb " #src ", " #dst " \n\t"
#endif

#if defined (HAVE_MMX)
/* Plain MMX: dst = dst - max(dst - src, 0), i.e. min(src, dst); tmp is clobbered. */
#define PMINUB(src,dst,tmp) \
        "movq " #dst ", " #tmp " \n\t"\
        "psubusb " #src ", " #tmp " \n\t"\
        "psubb " #tmp ", " #dst " \n\t"

/* Plain MMX: dst = max(dst - src, 0) + src, i.e. max(src, dst). */
#define PMAXUB(src,dst) \
        "psubusb " #src ", " #dst " \n\t"\
        "paddb " #src ", " #dst " \n\t"
#endif

#endif /* HAVE_MMX2 */
45

    
46

    
47
//FIXME? |255-0| = 1 (shouldnt be a problem ...)
48
/**
49
 * Check if the middle 8x8 Block in the given 8x16 block is flat
50
 */
51
static inline int RENAME(isVertDC)(uint8_t src[], int stride){
52
        int numEq= 0;
53
#ifndef HAVE_MMX
54
        int y;
55
#endif
56
        src+= stride*4; // src points to begin of the 8x8 Block
57
#ifdef HAVE_MMX
58
asm volatile(
59
                "leal (%1, %2), %%eax                                \n\t"
60
                "leal (%%eax, %2, 4), %%ebx                        \n\t"
61
//        0        1        2        3        4        5        6        7        8        9
62
//        %1        eax        eax+%2        eax+2%2        %1+4%2        ebx        ebx+%2        ebx+2%2        %1+8%2        ebx+4%2
63
                "movq mmxDCOffset, %%mm7                        \n\t" // mm7 = 0x7F
64
                "movq mmxDCThreshold, %%mm6                        \n\t" // mm6 = 0x7D
65
                "movq (%1), %%mm0                                \n\t"
66
                "movq (%%eax), %%mm1                                \n\t"
67
                "psubb %%mm1, %%mm0                                \n\t" // mm0 = differnece
68
                "paddb %%mm7, %%mm0                                \n\t"
69
                "pcmpgtb %%mm6, %%mm0                                \n\t"
70

    
71
                "movq (%%eax,%2), %%mm2                                \n\t"
72
                "psubb %%mm2, %%mm1                                \n\t"
73
                "paddb %%mm7, %%mm1                                \n\t"
74
                "pcmpgtb %%mm6, %%mm1                                \n\t"
75
                "paddb %%mm1, %%mm0                                \n\t"
76

    
77
                "movq (%%eax, %2, 2), %%mm1                        \n\t"
78
                "psubb %%mm1, %%mm2                                \n\t"
79
                "paddb %%mm7, %%mm2                                \n\t"
80
                "pcmpgtb %%mm6, %%mm2                                \n\t"
81
                "paddb %%mm2, %%mm0                                \n\t"
82

    
83
                "movq (%1, %2, 4), %%mm2                        \n\t"
84
                "psubb %%mm2, %%mm1                                \n\t"
85
                "paddb %%mm7, %%mm1                                \n\t"
86
                "pcmpgtb %%mm6, %%mm1                                \n\t"
87
                "paddb %%mm1, %%mm0                                \n\t"
88

    
89
                "movq (%%ebx), %%mm1                                \n\t"
90
                "psubb %%mm1, %%mm2                                \n\t"
91
                "paddb %%mm7, %%mm2                                \n\t"
92
                "pcmpgtb %%mm6, %%mm2                                \n\t"
93
                "paddb %%mm2, %%mm0                                \n\t"
94

    
95
                "movq (%%ebx, %2), %%mm2                        \n\t"
96
                "psubb %%mm2, %%mm1                                \n\t"
97
                "paddb %%mm7, %%mm1                                \n\t"
98
                "pcmpgtb %%mm6, %%mm1                                \n\t"
99
                "paddb %%mm1, %%mm0                                \n\t"
100

    
101
                "movq (%%ebx, %2, 2), %%mm1                        \n\t"
102
                "psubb %%mm1, %%mm2                                \n\t"
103
                "paddb %%mm7, %%mm2                                \n\t"
104
                "pcmpgtb %%mm6, %%mm2                                \n\t"
105
                "paddb %%mm2, %%mm0                                \n\t"
106

    
107
                "                                                \n\t"
108
#ifdef HAVE_MMX2
109
                "pxor %%mm7, %%mm7                                \n\t"
110
                "psadbw %%mm7, %%mm0                                \n\t"
111
#else
112
                "movq %%mm0, %%mm1                                \n\t"
113
                "psrlw $8, %%mm0                                \n\t"
114
                "paddb %%mm1, %%mm0                                \n\t"
115
                "movq %%mm0, %%mm1                                \n\t"
116
                "psrlq $16, %%mm0                                \n\t"
117
                "paddb %%mm1, %%mm0                                \n\t"
118
                "movq %%mm0, %%mm1                                \n\t"
119
                "psrlq $32, %%mm0                                \n\t"
120
                "paddb %%mm1, %%mm0                                \n\t"
121
#endif
122
                "movd %%mm0, %0                                        \n\t"
123
                : "=r" (numEq)
124
                : "r" (src), "r" (stride)
125
                : "%ebx"
126
                );
127
        numEq= (-numEq) &0xFF;
128

    
129
#else
130
        for(y=0; y<BLOCK_SIZE-1; y++)
131
        {
132
                if(((src[0] - src[0+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++;
133
                if(((src[1] - src[1+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++;
134
                if(((src[2] - src[2+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++;
135
                if(((src[3] - src[3+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++;
136
                if(((src[4] - src[4+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++;
137
                if(((src[5] - src[5+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++;
138
                if(((src[6] - src[6+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++;
139
                if(((src[7] - src[7+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++;
140
                src+= stride;
141
        }
142
#endif
143
/*        if(abs(numEq - asmEq) > 0)
144
        {
145
                printf("\nasm:%d  c:%d\n", asmEq, numEq);
146
                for(int y=0; y<8; y++)
147
                {
148
                        for(int x=0; x<8; x++)
149
                        {
150
                                printf("%d ", temp[x + y*stride]);
151
                        }
152
                        printf("\n");
153
                }
154
        }
155
*/
156
//        for(int i=0; i<numEq/8; i++) src[i]=255;
157
        return (numEq > vFlatnessThreshold) ? 1 : 0;
158
}
159

    
160
static inline int RENAME(isVertMinMaxOk)(uint8_t src[], int stride, int QP)
161
{
162
#ifdef HAVE_MMX
163
        int isOk;
164
        src+= stride*3;
165
        asm volatile(
166
//                "int $3 \n\t"
167
                "movq (%1, %2), %%mm0                                \n\t"
168
                "movq (%1, %2, 8), %%mm1                        \n\t"
169
                "movq %%mm0, %%mm2                                \n\t"
170
                "psubusb %%mm1, %%mm0                                \n\t"
171
                "psubusb %%mm2, %%mm1                                \n\t"
172
                "por %%mm1, %%mm0                                \n\t" // ABS Diff
173

    
174
                "movq pQPb, %%mm7                                \n\t" // QP,..., QP
175
                "paddusb %%mm7, %%mm7                                \n\t" // 2QP ... 2QP
176
                "psubusb %%mm7, %%mm0                                \n\t" // Diff <= 2QP -> 0
177
                "pcmpeqd b00, %%mm0                                \n\t"
178
                "psrlq $16, %%mm0                                \n\t"
179
                "pcmpeqd bFF, %%mm0                                \n\t"
180
//                "movd %%mm0, (%1, %2, 4)\n\t"
181
                "movd %%mm0, %0                                        \n\t"
182
                : "=r" (isOk)
183
                : "r" (src), "r" (stride)
184
                );
185
        return isOk;
186
#else
187

    
188
        int isOk2= 1;
189
        int x;
190
        src+= stride*3;
191
        for(x=0; x<BLOCK_SIZE; x++)
192
        {
193
                if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
194
        }
195
/*        if(isOk && !isOk2 || !isOk && isOk2)
196
        {
197
                printf("\nasm:%d  c:%d QP:%d\n", isOk, isOk2, QP);
198
                for(int y=0; y<9; y++)
199
                {
200
                        for(int x=0; x<8; x++)
201
                        {
202
                                printf("%d ", src[x + y*stride]);
203
                        }
204
                        printf("\n");
205
                }
206
        } */
207

    
208
        return isOk2;
209
#endif
210

    
211
}
212

    
213
/**
214
 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
215
 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
216
 */
217
static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, int QP)
218
{
219
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
220
        src+= stride*3;
221
        asm volatile(        //"movv %0 %1 %2\n\t"
222
                "movq pQPb, %%mm0                                \n\t"  // QP,..., QP
223

    
224
                "movq (%0), %%mm6                                \n\t"
225
                "movq (%0, %1), %%mm5                                \n\t"
226
                "movq %%mm5, %%mm1                                \n\t"
227
                "movq %%mm6, %%mm2                                \n\t"
228
                "psubusb %%mm6, %%mm5                                \n\t"
229
                "psubusb %%mm1, %%mm2                                \n\t"
230
                "por %%mm5, %%mm2                                \n\t" // ABS Diff of lines
231
                "psubusb %%mm0, %%mm2                                \n\t" // diff <= QP -> 0
232
                "pcmpeqb b00, %%mm2                                \n\t" // diff <= QP -> FF
233

    
234
                "pand %%mm2, %%mm6                                \n\t"
235
                "pandn %%mm1, %%mm2                                \n\t"
236
                "por %%mm2, %%mm6                                \n\t"// First Line to Filter
237

    
238
                "movq (%0, %1, 8), %%mm5                        \n\t"
239
                "leal (%0, %1, 4), %%eax                        \n\t"
240
                "leal (%0, %1, 8), %%ebx                        \n\t"
241
                "subl %1, %%ebx                                        \n\t"
242
                "addl %1, %0                                        \n\t" // %0 points to line 1 not 0
243
                "movq (%0, %1, 8), %%mm7                        \n\t"
244
                "movq %%mm5, %%mm1                                \n\t"
245
                "movq %%mm7, %%mm2                                \n\t"
246
                "psubusb %%mm7, %%mm5                                \n\t"
247
                "psubusb %%mm1, %%mm2                                \n\t"
248
                "por %%mm5, %%mm2                                \n\t" // ABS Diff of lines
249
                "psubusb %%mm0, %%mm2                                \n\t" // diff <= QP -> 0
250
                "pcmpeqb b00, %%mm2                                \n\t" // diff <= QP -> FF
251

    
252
                "pand %%mm2, %%mm7                                \n\t"
253
                "pandn %%mm1, %%mm2                                \n\t"
254
                "por %%mm2, %%mm7                                \n\t" // First Line to Filter
255

    
256

    
257
                //         1        2        3        4        5        6        7        8
258
                //        %0        %0+%1        %0+2%1        eax        %0+4%1        eax+2%1        ebx        eax+4%1
259
                // 6 4 2 2 1 1
260
                // 6 4 4 2
261
                // 6 8 2
262

    
263
                "movq (%0, %1), %%mm0                                \n\t" //  1
264
                "movq %%mm0, %%mm1                                \n\t" //  1
265
                PAVGB(%%mm6, %%mm0)                                      //1 1        /2
266
                PAVGB(%%mm6, %%mm0)                                      //3 1        /4
267

    
268
                "movq (%0, %1, 4), %%mm2                        \n\t" //     1
269
                "movq %%mm2, %%mm5                                \n\t" //     1
270
                PAVGB((%%eax), %%mm2)                                      //    11        /2
271
                PAVGB((%0, %1, 2), %%mm2)                              //   211        /4
272
                "movq %%mm2, %%mm3                                \n\t" //   211        /4
273
                "movq (%0), %%mm4                                \n\t" // 1
274
                PAVGB(%%mm4, %%mm3)                                      // 4 211        /8
275
                PAVGB(%%mm0, %%mm3)                                      //642211        /16
276
                "movq %%mm3, (%0)                                \n\t" // X
277
                // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
278
                "movq %%mm1, %%mm0                                \n\t" //  1
279
                PAVGB(%%mm6, %%mm0)                                      //1 1        /2
280
                "movq %%mm4, %%mm3                                \n\t" // 1
281
                PAVGB((%0,%1,2), %%mm3)                                      // 1 1        /2
282
                PAVGB((%%eax,%1,2), %%mm5)                              //     11        /2
283
                PAVGB((%%eax), %%mm5)                                      //    211 /4
284
                PAVGB(%%mm5, %%mm3)                                      // 2 2211 /8
285
                PAVGB(%%mm0, %%mm3)                                      //4242211 /16
286
                "movq %%mm3, (%0,%1)                                \n\t" //  X
287
                // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
288
                PAVGB(%%mm4, %%mm6)                                      //11        /2
289
                "movq (%%ebx), %%mm0                                \n\t" //       1
290
                PAVGB((%%eax, %1, 2), %%mm0)                              //      11/2
291
                "movq %%mm0, %%mm3                                \n\t" //      11/2
292
                PAVGB(%%mm1, %%mm0)                                      //  2   11/4
293
                PAVGB(%%mm6, %%mm0)                                      //222   11/8
294
                PAVGB(%%mm2, %%mm0)                                      //22242211/16
295
                "movq (%0, %1, 2), %%mm2                        \n\t" //   1
296
                "movq %%mm0, (%0, %1, 2)                        \n\t" //   X
297
                // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
298
                "movq (%%eax, %1, 4), %%mm0                        \n\t" //        1
299
                PAVGB((%%ebx), %%mm0)                                      //       11        /2
300
                PAVGB(%%mm0, %%mm6)                                      //11     11        /4
301
                PAVGB(%%mm1, %%mm4)                                      // 11                /2
302
                PAVGB(%%mm2, %%mm1)                                      //  11                /2
303
                PAVGB(%%mm1, %%mm6)                                      //1122   11        /8
304
                PAVGB(%%mm5, %%mm6)                                      //112242211        /16
305
                "movq (%%eax), %%mm5                                \n\t" //    1
306
                "movq %%mm6, (%%eax)                                \n\t" //    X
307
                // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
308
                "movq (%%eax, %1, 4), %%mm6                        \n\t" //        1
309
                PAVGB(%%mm7, %%mm6)                                      //        11        /2
310
                PAVGB(%%mm4, %%mm6)                                      // 11     11        /4
311
                PAVGB(%%mm3, %%mm6)                                      // 11   2211        /8
312
                PAVGB(%%mm5, %%mm2)                                      //   11                /2
313
                "movq (%0, %1, 4), %%mm4                        \n\t" //     1
314
                PAVGB(%%mm4, %%mm2)                                      //   112                /4
315
                PAVGB(%%mm2, %%mm6)                                      // 112242211        /16
316
                "movq %%mm6, (%0, %1, 4)                        \n\t" //     X
317
                // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
318
                PAVGB(%%mm7, %%mm1)                                      //  11     2        /4
319
                PAVGB(%%mm4, %%mm5)                                      //    11                /2
320
                PAVGB(%%mm5, %%mm0)                                      //    11 11        /4
321
                "movq (%%eax, %1, 2), %%mm6                        \n\t" //      1
322
                PAVGB(%%mm6, %%mm1)                                      //  11  4  2        /8
323
                PAVGB(%%mm0, %%mm1)                                      //  11224222        /16
324
                "movq %%mm1, (%%eax, %1, 2)                        \n\t" //      X
325
                // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
326
                PAVGB((%%ebx), %%mm2)                                      //   112 4        /8
327
                "movq (%%eax, %1, 4), %%mm0                        \n\t" //        1
328
                PAVGB(%%mm0, %%mm6)                                      //      1 1        /2
329
                PAVGB(%%mm7, %%mm6)                                      //      1 12        /4
330
                PAVGB(%%mm2, %%mm6)                                      //   1122424        /4
331
                "movq %%mm6, (%%ebx)                                \n\t" //       X
332
                // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
333
                PAVGB(%%mm7, %%mm5)                                      //    11   2        /4
334
                PAVGB(%%mm7, %%mm5)                                      //    11   6        /8
335

    
336
                PAVGB(%%mm3, %%mm0)                                      //      112        /4
337
                PAVGB(%%mm0, %%mm5)                                      //    112246        /16
338
                "movq %%mm5, (%%eax, %1, 4)                        \n\t" //        X
339
                "subl %1, %0                                        \n\t"
340

    
341
                :
342
                : "r" (src), "r" (stride)
343
                : "%eax", "%ebx"
344
        );
345
#else
346
        const int l1= stride;
347
        const int l2= stride + l1;
348
        const int l3= stride + l2;
349
        const int l4= stride + l3;
350
        const int l5= stride + l4;
351
        const int l6= stride + l5;
352
        const int l7= stride + l6;
353
        const int l8= stride + l7;
354
        const int l9= stride + l8;
355
        int x;
356
        src+= stride*3;
357
        for(x=0; x<BLOCK_SIZE; x++)
358
        {
359
                const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
360
                const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
361

    
362
                int sums[9];
363
                sums[0] = first + src[l1];
364
                sums[1] = src[l1] + src[l2];
365
                sums[2] = src[l2] + src[l3];
366
                sums[3] = src[l3] + src[l4];
367
                sums[4] = src[l4] + src[l5];
368
                sums[5] = src[l5] + src[l6];
369
                sums[6] = src[l6] + src[l7];
370
                sums[7] = src[l7] + src[l8];
371
                sums[8] = src[l8] + last;
372

    
373
                src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
374
                src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
375
                src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
376
                src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
377
                src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
378
                src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
379
                src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
380
                src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
381

    
382
                src++;
383
        }
384

    
385
#endif
386
}
387

    
388
/**
389
 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
390
 * values are correctly clipped (MMX2)
391
 * values are wraparound (C)
392
 * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient
393
        0 8 16 24
394
        x = 8
395
        x/2 = 4
396
        x/8 = 1
397
        1 12 12 23
398
 */
399
static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
400
{
401
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
402
        src+= stride*3;
403
// FIXME rounding
404
        asm volatile(
405
                "pxor %%mm7, %%mm7                                \n\t" // 0
406
                "movq b80, %%mm6                                \n\t" // MIN_SIGNED_BYTE
407
                "leal (%0, %1), %%eax                                \n\t"
408
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
409
//        0        1        2        3        4        5        6        7        8        9
410
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
411
                "movq pQPb, %%mm0                                \n\t" // QP,..., QP
412
                "movq %%mm0, %%mm1                                \n\t" // QP,..., QP
413
                "paddusb b02, %%mm0                                \n\t"
414
                "psrlw $2, %%mm0                                \n\t"
415
                "pand b3F, %%mm0                                \n\t" // QP/4,..., QP/4
416
                "paddusb %%mm1, %%mm0                                \n\t" // QP*1.25 ...
417
                "movq (%0, %1, 4), %%mm2                        \n\t" // line 4
418
                "movq (%%ebx), %%mm3                                \n\t" // line 5
419
                "movq %%mm2, %%mm4                                \n\t" // line 4
420
                "pcmpeqb %%mm5, %%mm5                                \n\t" // -1
421
                "pxor %%mm2, %%mm5                                \n\t" // -line 4 - 1
422
                PAVGB(%%mm3, %%mm5)
423
                "paddb %%mm6, %%mm5                                \n\t" // (l5-l4)/2
424
                "psubusb %%mm3, %%mm4                                \n\t"
425
                "psubusb %%mm2, %%mm3                                \n\t"
426
                "por %%mm3, %%mm4                                \n\t" // |l4 - l5|
427
                "psubusb %%mm0, %%mm4                                \n\t"
428
                "pcmpeqb %%mm7, %%mm4                                \n\t"
429
                "pand %%mm4, %%mm5                                \n\t" // d/2
430

    
431
//                "paddb %%mm6, %%mm2                                \n\t" // line 4 + 0x80
432
                "paddb %%mm5, %%mm2                                \n\t"
433
//                "psubb %%mm6, %%mm2                                \n\t"
434
                "movq %%mm2, (%0,%1, 4)                                \n\t"
435

    
436
                "movq (%%ebx), %%mm2                                \n\t"
437
//                "paddb %%mm6, %%mm2                                \n\t" // line 5 + 0x80
438
                "psubb %%mm5, %%mm2                                \n\t"
439
//                "psubb %%mm6, %%mm2                                \n\t"
440
                "movq %%mm2, (%%ebx)                                \n\t"
441

    
442
                "paddb %%mm6, %%mm5                                \n\t"
443
                "psrlw $2, %%mm5                                \n\t"
444
                "pand b3F, %%mm5                                \n\t"
445
                "psubb b20, %%mm5                                \n\t" // (l5-l4)/8
446

    
447
                "movq (%%eax, %1, 2), %%mm2                        \n\t"
448
                "paddb %%mm6, %%mm2                                \n\t" // line 3 + 0x80
449
                "paddsb %%mm5, %%mm2                                \n\t"
450
                "psubb %%mm6, %%mm2                                \n\t"
451
                "movq %%mm2, (%%eax, %1, 2)                        \n\t"
452

    
453
                "movq (%%ebx, %1), %%mm2                        \n\t"
454
                "paddb %%mm6, %%mm2                                \n\t" // line 6 + 0x80
455
                "psubsb %%mm5, %%mm2                                \n\t"
456
                "psubb %%mm6, %%mm2                                \n\t"
457
                "movq %%mm2, (%%ebx, %1)                        \n\t"
458

    
459
                :
460
                : "r" (src), "r" (stride)
461
                : "%eax", "%ebx"
462
        );
463
#else
464
         const int l1= stride;
465
        const int l2= stride + l1;
466
        const int l3= stride + l2;
467
        const int l4= stride + l3;
468
        const int l5= stride + l4;
469
        const int l6= stride + l5;
470
//        const int l7= stride + l6;
471
//        const int l8= stride + l7;
472
//        const int l9= stride + l8;
473
        int x;
474
        const int QP15= QP + (QP>>2);
475
        src+= stride*3;
476
        for(x=0; x<BLOCK_SIZE; x++)
477
        {
478
                const int v = (src[x+l5] - src[x+l4]);
479
                if(ABS(v) < QP15)
480
                {
481
                        src[x+l3] +=v>>3;
482
                        src[x+l4] +=v>>1;
483
                        src[x+l5] -=v>>1;
484
                        src[x+l6] -=v>>3;
485

    
486
                }
487
        }
488

    
489
#endif
490
}
491

    
492
/**
493
 * Experimental Filter 1
494
 * will not damage linear gradients
495
 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
496
 * can only smooth blocks at the expected locations (it can't smooth them if they moved)
497
 * The MMX2 version does correct clipping, the C version doesn't
498
 */
499
static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, int QP)
500
{
501
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
502
        src+= stride*3;
503

    
504
        asm volatile(
505
                "pxor %%mm7, %%mm7                                \n\t" // 0
506
//                "movq b80, %%mm6                                \n\t" // MIN_SIGNED_BYTE
507
                "leal (%0, %1), %%eax                                \n\t"
508
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
509
//        0        1        2        3        4        5        6        7        8        9
510
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
511
                "movq (%%eax, %1, 2), %%mm0                        \n\t" // line 3
512
                "movq (%0, %1, 4), %%mm1                        \n\t" // line 4
513
                "movq %%mm1, %%mm2                                \n\t" // line 4
514
                "psubusb %%mm0, %%mm1                                \n\t"
515
                "psubusb %%mm2, %%mm0                                \n\t"
516
                "por %%mm1, %%mm0                                \n\t" // |l2 - l3|
517
                "movq (%%ebx), %%mm3                                \n\t" // line 5
518
                "movq (%%ebx, %1), %%mm4                                \n\t" // line 6
519
                "movq %%mm3, %%mm5                                \n\t" // line 5
520
                "psubusb %%mm4, %%mm3                                \n\t"
521
                "psubusb %%mm5, %%mm4                                \n\t"
522
                "por %%mm4, %%mm3                                \n\t" // |l5 - l6|
523
                PAVGB(%%mm3, %%mm0)                                      // (|l2 - l3| + |l5 - l6|)/2
524
                "movq %%mm2, %%mm1                                \n\t" // line 4
525
                "psubusb %%mm5, %%mm2                                \n\t"
526
                "movq %%mm2, %%mm4                                \n\t"
527
                "pcmpeqb %%mm7, %%mm2                                \n\t" // (l4 - l5) <= 0 ? -1 : 0
528
                "psubusb %%mm1, %%mm5                                \n\t"
529
                "por %%mm5, %%mm4                                \n\t" // |l4 - l5|
530
                "psubusb %%mm0, %%mm4                \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
531
                "movq %%mm4, %%mm3                                \n\t" // d
532
                "psubusb pQPb, %%mm4                                \n\t"
533
                "pcmpeqb %%mm7, %%mm4                                \n\t" // d <= QP ? -1 : 0
534
                "psubusb b01, %%mm3                                \n\t"
535
                "pand %%mm4, %%mm3                                \n\t" // d <= QP ? d : 0
536

    
537
                PAVGB(%%mm7, %%mm3)                                      // d/2
538
                "movq %%mm3, %%mm1                                \n\t" // d/2
539
                PAVGB(%%mm7, %%mm3)                                      // d/4
540
                PAVGB(%%mm1, %%mm3)                                      // 3*d/8
541

    
542
                "movq (%0, %1, 4), %%mm0                        \n\t" // line 4
543
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
544
                "psubusb %%mm3, %%mm0                                \n\t"
545
                "pxor %%mm2, %%mm0                                \n\t"
546
                "movq %%mm0, (%0, %1, 4)                        \n\t" // line 4
547

    
548
                "movq (%%ebx), %%mm0                                \n\t" // line 5
549
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
550
                "paddusb %%mm3, %%mm0                                \n\t"
551
                "pxor %%mm2, %%mm0                                \n\t"
552
                "movq %%mm0, (%%ebx)                                \n\t" // line 5
553

    
554
                PAVGB(%%mm7, %%mm1)                                      // d/4
555

    
556
                "movq (%%eax, %1, 2), %%mm0                        \n\t" // line 3
557
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
558
                "psubusb %%mm1, %%mm0                                \n\t"
559
                "pxor %%mm2, %%mm0                                \n\t"
560
                "movq %%mm0, (%%eax, %1, 2)                        \n\t" // line 3
561

    
562
                "movq (%%ebx, %1), %%mm0                        \n\t" // line 6
563
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
564
                "paddusb %%mm1, %%mm0                                \n\t"
565
                "pxor %%mm2, %%mm0                                \n\t"
566
                "movq %%mm0, (%%ebx, %1)                        \n\t" // line 6
567

    
568
                PAVGB(%%mm7, %%mm1)                                      // d/8
569

    
570
                "movq (%%eax, %1), %%mm0                        \n\t" // line 2
571
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
572
                "psubusb %%mm1, %%mm0                                \n\t"
573
                "pxor %%mm2, %%mm0                                \n\t"
574
                "movq %%mm0, (%%eax, %1)                        \n\t" // line 2
575

    
576
                "movq (%%ebx, %1, 2), %%mm0                        \n\t" // line 7
577
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
578
                "paddusb %%mm1, %%mm0                                \n\t"
579
                "pxor %%mm2, %%mm0                                \n\t"
580
                "movq %%mm0, (%%ebx, %1, 2)                        \n\t" // line 7
581

    
582
                :
583
                : "r" (src), "r" (stride)
584
                : "%eax", "%ebx"
585
        );
586
#else
587

    
588
         const int l1= stride;
589
        const int l2= stride + l1;
590
        const int l3= stride + l2;
591
        const int l4= stride + l3;
592
        const int l5= stride + l4;
593
        const int l6= stride + l5;
594
        const int l7= stride + l6;
595
//        const int l8= stride + l7;
596
//        const int l9= stride + l8;
597
        int x;
598

    
599
        src+= stride*3;
600
        for(x=0; x<BLOCK_SIZE; x++)
601
        {
602
                int a= src[l3] - src[l4];
603
                int b= src[l4] - src[l5];
604
                int c= src[l5] - src[l6];
605

    
606
                int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
607
                d= MAX(d, 0);
608

    
609
                if(d < QP)
610
                {
611
                        int v = d * SIGN(-b);
612

    
613
                        src[l2] +=v>>3;
614
                        src[l3] +=v>>2;
615
                        src[l4] +=(3*v)>>3;
616
                        src[l5] -=(3*v)>>3;
617
                        src[l6] -=v>>2;
618
                        src[l7] -=v>>3;
619

    
620
                }
621
                src++;
622
        }
623
        /*
624
         const int l1= stride;
625
        const int l2= stride + l1;
626
        const int l3= stride + l2;
627
        const int l4= stride + l3;
628
        const int l5= stride + l4;
629
        const int l6= stride + l5;
630
        const int l7= stride + l6;
631
        const int l8= stride + l7;
632
        const int l9= stride + l8;
633
        for(int x=0; x<BLOCK_SIZE; x++)
634
        {
635
                int v2= src[l2];
636
                int v3= src[l3];
637
                int v4= src[l4];
638
                int v5= src[l5];
639
                int v6= src[l6];
640
                int v7= src[l7];
641

642
                if(ABS(v4-v5)<QP &&  ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
643
                {
644
                        src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6         )/16;
645
                        src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7  )/16;
646
                        src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
647
                        src[l6] = (       1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
648
                }
649
                src++;
650
        }
651
*/
652
#endif
653
}
654

    
655
static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, int QP)
656
{
657
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
658
/*
659
        uint8_t tmp[16];
660
        const int l1= stride;
661
        const int l2= stride + l1;
662
        const int l3= stride + l2;
663
        const int l4= (int)tmp - (int)src - stride*3;
664
        const int l5= (int)tmp - (int)src - stride*3 + 8;
665
        const int l6= stride*3 + l3;
666
        const int l7= stride + l6;
667
        const int l8= stride + l7;
668

669
        memcpy(tmp, src+stride*7, 8);
670
        memcpy(tmp+8, src+stride*8, 8);
671
*/
672
        src+= stride*4;
673
        asm volatile(
674

    
675
#if 0 //sligtly more accurate and slightly slower
676
                "pxor %%mm7, %%mm7                                \n\t" // 0
677
                "leal (%0, %1), %%eax                                \n\t"
678
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
679
//        0        1        2        3        4        5        6        7
680
//        %0        %0+%1        %0+2%1        eax+2%1        %0+4%1        eax+4%1        ebx+%1        ebx+2%1
681
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1
682

683

684
                "movq (%0, %1, 2), %%mm0                        \n\t" // l2
685
                "movq (%0), %%mm1                                \n\t" // l0
686
                "movq %%mm0, %%mm2                                \n\t" // l2
687
                PAVGB(%%mm7, %%mm0)                                      // ~l2/2
688
                PAVGB(%%mm1, %%mm0)                                      // ~(l2 + 2l0)/4
689
                PAVGB(%%mm2, %%mm0)                                      // ~(5l2 + 2l0)/8
690

691
                "movq (%%eax), %%mm1                                \n\t" // l1
692
                "movq (%%eax, %1, 2), %%mm3                        \n\t" // l3
693
                "movq %%mm1, %%mm4                                \n\t" // l1
694
                PAVGB(%%mm7, %%mm1)                                      // ~l1/2
695
                PAVGB(%%mm3, %%mm1)                                      // ~(l1 + 2l3)/4
696
                PAVGB(%%mm4, %%mm1)                                      // ~(5l1 + 2l3)/8
697

698
                "movq %%mm0, %%mm4                                \n\t" // ~(5l2 + 2l0)/8
699
                "psubusb %%mm1, %%mm0                                \n\t"
700
                "psubusb %%mm4, %%mm1                                \n\t"
701
                "por %%mm0, %%mm1                                \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
702
// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
703

704
                "movq (%0, %1, 4), %%mm0                        \n\t" // l4
705
                "movq %%mm0, %%mm4                                \n\t" // l4
706
                PAVGB(%%mm7, %%mm0)                                      // ~l4/2
707
                PAVGB(%%mm2, %%mm0)                                      // ~(l4 + 2l2)/4
708
                PAVGB(%%mm4, %%mm0)                                      // ~(5l4 + 2l2)/8
709

710
                "movq (%%ebx), %%mm2                                \n\t" // l5
711
                "movq %%mm3, %%mm5                                \n\t" // l3
712
                PAVGB(%%mm7, %%mm3)                                      // ~l3/2
713
                PAVGB(%%mm2, %%mm3)                                      // ~(l3 + 2l5)/4
714
                PAVGB(%%mm5, %%mm3)                                      // ~(5l3 + 2l5)/8
715

716
                "movq %%mm0, %%mm6                                \n\t" // ~(5l4 + 2l2)/8
717
                "psubusb %%mm3, %%mm0                                \n\t"
718
                "psubusb %%mm6, %%mm3                                \n\t"
719
                "por %%mm0, %%mm3                                \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
720
                "pcmpeqb %%mm7, %%mm0                                \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
721
// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
722

723
                "movq (%%ebx, %1), %%mm6                        \n\t" // l6
724
                "movq %%mm6, %%mm5                                \n\t" // l6
725
                PAVGB(%%mm7, %%mm6)                                      // ~l6/2
726
                PAVGB(%%mm4, %%mm6)                                      // ~(l6 + 2l4)/4
727
                PAVGB(%%mm5, %%mm6)                                      // ~(5l6 + 2l4)/8
728

729
                "movq (%%ebx, %1, 2), %%mm5                        \n\t" // l7
730
                "movq %%mm2, %%mm4                                \n\t" // l5
731
                PAVGB(%%mm7, %%mm2)                                      // ~l5/2
732
                PAVGB(%%mm5, %%mm2)                                      // ~(l5 + 2l7)/4
733
                PAVGB(%%mm4, %%mm2)                                      // ~(5l5 + 2l7)/8
734

735
                "movq %%mm6, %%mm4                                \n\t" // ~(5l6 + 2l4)/8
736
                "psubusb %%mm2, %%mm6                                \n\t"
737
                "psubusb %%mm4, %%mm2                                \n\t"
738
                "por %%mm6, %%mm2                                \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
739
// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
740

741

742
                PMINUB(%%mm2, %%mm1, %%mm4)                              // MIN(|lenergy|,|renergy|)/8
743
                "movq pQPb, %%mm4                                \n\t" // QP //FIXME QP+1 ?
744
                "paddusb b01, %%mm4                                \n\t"
745
                "pcmpgtb %%mm3, %%mm4                                \n\t" // |menergy|/8 < QP
746
                "psubusb %%mm1, %%mm3                                \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
747
                "pand %%mm4, %%mm3                                \n\t"
748

749
                "movq %%mm3, %%mm1                                \n\t"
750
//                "psubusb b01, %%mm3                                \n\t"
751
                PAVGB(%%mm7, %%mm3)
752
                PAVGB(%%mm7, %%mm3)
753
                "paddusb %%mm1, %%mm3                                \n\t"
754
//                "paddusb b01, %%mm3                                \n\t"
755

756
                "movq (%%eax, %1, 2), %%mm6                        \n\t" //l3
757
                "movq (%0, %1, 4), %%mm5                        \n\t" //l4
758
                "movq (%0, %1, 4), %%mm4                        \n\t" //l4
759
                "psubusb %%mm6, %%mm5                                \n\t"
760
                "psubusb %%mm4, %%mm6                                \n\t"
761
                "por %%mm6, %%mm5                                \n\t" // |l3-l4|
762
                "pcmpeqb %%mm7, %%mm6                                \n\t" // SIGN(l3-l4)
763
                "pxor %%mm6, %%mm0                                \n\t"
764
                "pand %%mm0, %%mm3                                \n\t"
765
                PMINUB(%%mm5, %%mm3, %%mm0)
766

767
                "psubusb b01, %%mm3                                \n\t"
768
                PAVGB(%%mm7, %%mm3)
769

770
                "movq (%%eax, %1, 2), %%mm0                        \n\t"
771
                "movq (%0, %1, 4), %%mm2                        \n\t"
772
                "pxor %%mm6, %%mm0                                \n\t"
773
                "pxor %%mm6, %%mm2                                \n\t"
774
                "psubb %%mm3, %%mm0                                \n\t"
775
                "paddb %%mm3, %%mm2                                \n\t"
776
                "pxor %%mm6, %%mm0                                \n\t"
777
                "pxor %%mm6, %%mm2                                \n\t"
778
                "movq %%mm0, (%%eax, %1, 2)                        \n\t"
779
                "movq %%mm2, (%0, %1, 4)                        \n\t"
780
#endif
781

    
782
                "leal (%0, %1), %%eax                                \n\t"
783
                "pcmpeqb %%mm6, %%mm6                                \n\t" // -1
784
//        0        1        2        3        4        5        6        7
785
//        %0        %0+%1        %0+2%1        eax+2%1        %0+4%1        eax+4%1        ebx+%1        ebx+2%1
786
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1
787

    
788

    
789
                "movq (%%eax, %1, 2), %%mm1                        \n\t" // l3
790
                "movq (%0, %1, 4), %%mm0                        \n\t" // l4
791
                "pxor %%mm6, %%mm1                                \n\t" // -l3-1
792
                PAVGB(%%mm1, %%mm0)                                      // -q+128 = (l4-l3+256)/2
793
// mm1=-l3-1, mm0=128-q
794

    
795
                "movq (%%eax, %1, 4), %%mm2                        \n\t" // l5
796
                "movq (%%eax, %1), %%mm3                        \n\t" // l2
797
                "pxor %%mm6, %%mm2                                \n\t" // -l5-1
798
                "movq %%mm2, %%mm5                                \n\t" // -l5-1
799
                "movq b80, %%mm4                                \n\t" // 128
800
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
801
                PAVGB(%%mm3, %%mm2)                                      // (l2-l5+256)/2
802
                PAVGB(%%mm0, %%mm4)                                      // ~(l4-l3)/4 + 128
803
                PAVGB(%%mm2, %%mm4)                                      // ~(l2-l5)/4 +(l4-l3)/8 + 128
804
                PAVGB(%%mm0, %%mm4)                                      // ~(l2-l5)/8 +5(l4-l3)/16 + 128
805
// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
806

    
807
                "movq (%%eax), %%mm2                                \n\t" // l1
808
                "pxor %%mm6, %%mm2                                \n\t" // -l1-1
809
                PAVGB(%%mm3, %%mm2)                                      // (l2-l1+256)/2
810
                PAVGB((%0), %%mm1)                                      // (l0-l3+256)/2
811
                "movq b80, %%mm3                                \n\t" // 128
812
                PAVGB(%%mm2, %%mm3)                                      // ~(l2-l1)/4 + 128
813
                PAVGB(%%mm1, %%mm3)                                      // ~(l0-l3)/4 +(l2-l1)/8 + 128
814
                PAVGB(%%mm2, %%mm3)                                      // ~(l0-l3)/8 +5(l2-l1)/16 + 128
815
// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
816

    
817
                PAVGB((%%ebx, %1), %%mm5)                              // (l6-l5+256)/2
818
                "movq (%%ebx, %1, 2), %%mm1                        \n\t" // l7
819
                "pxor %%mm6, %%mm1                                \n\t" // -l7-1
820
                PAVGB((%0, %1, 4), %%mm1)                              // (l4-l7+256)/2
821
                "movq b80, %%mm2                                \n\t" // 128
822
                PAVGB(%%mm5, %%mm2)                                      // ~(l6-l5)/4 + 128
823
                PAVGB(%%mm1, %%mm2)                                      // ~(l4-l7)/4 +(l6-l5)/8 + 128
824
                PAVGB(%%mm5, %%mm2)                                      // ~(l4-l7)/8 +5(l6-l5)/16 + 128
825
// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
826

    
827
                "movq b00, %%mm1                                \n\t" // 0
828
                "movq b00, %%mm5                                \n\t" // 0
829
                "psubb %%mm2, %%mm1                                \n\t" // 128 - renergy/16
830
                "psubb %%mm3, %%mm5                                \n\t" // 128 - lenergy/16
831
                PMAXUB(%%mm1, %%mm2)                                      // 128 + |renergy/16|
832
                 PMAXUB(%%mm5, %%mm3)                                      // 128 + |lenergy/16|
833
                PMINUB(%%mm2, %%mm3, %%mm1)                              // 128 + MIN(|lenergy|,|renergy|)/16
834

    
835
// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
836

    
837
                "movq b00, %%mm7                                \n\t" // 0
838
                "movq pQPb, %%mm2                                \n\t" // QP
839
                PAVGB(%%mm6, %%mm2)                                      // 128 + QP/2
840
                "psubb %%mm6, %%mm2                                \n\t"
841

    
842
                "movq %%mm4, %%mm1                                \n\t"
843
                "pcmpgtb %%mm7, %%mm1                                \n\t" // SIGN(menergy)
844
                "pxor %%mm1, %%mm4                                \n\t"
845
                "psubb %%mm1, %%mm4                                \n\t" // 128 + |menergy|/16
846
                "pcmpgtb %%mm4, %%mm2                                \n\t" // |menergy|/16 < QP/2
847
                "psubusb %%mm3, %%mm4                                \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
848
// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
849

    
850
                "movq %%mm4, %%mm3                                \n\t" // d
851
                "psubusb b01, %%mm4                                \n\t"
852
                PAVGB(%%mm7, %%mm4)                                      // d/32
853
                PAVGB(%%mm7, %%mm4)                                      // (d + 32)/64
854
                "paddb %%mm3, %%mm4                                \n\t" // 5d/64
855
                "pand %%mm2, %%mm4                                \n\t"
856

    
857
                "movq b80, %%mm5                                \n\t" // 128
858
                "psubb %%mm0, %%mm5                                \n\t" // q
859
                "paddsb %%mm6, %%mm5                                \n\t" // fix bad rounding
860
                "pcmpgtb %%mm5, %%mm7                                \n\t" // SIGN(q)
861
                "pxor %%mm7, %%mm5                                \n\t"
862

    
863
                PMINUB(%%mm5, %%mm4, %%mm3)                              // MIN(|q|, 5d/64)
864
                "pxor %%mm1, %%mm7                                \n\t" // SIGN(d*q)
865

    
866
                "pand %%mm7, %%mm4                                \n\t"
867
                "movq (%%eax, %1, 2), %%mm0                        \n\t"
868
                "movq (%0, %1, 4), %%mm2                        \n\t"
869
                "pxor %%mm1, %%mm0                                \n\t"
870
                "pxor %%mm1, %%mm2                                \n\t"
871
                "paddb %%mm4, %%mm0                                \n\t"
872
                "psubb %%mm4, %%mm2                                \n\t"
873
                "pxor %%mm1, %%mm0                                \n\t"
874
                "pxor %%mm1, %%mm2                                \n\t"
875
                "movq %%mm0, (%%eax, %1, 2)                        \n\t"
876
                "movq %%mm2, (%0, %1, 4)                        \n\t"
877

    
878
                :
879
                : "r" (src), "r" (stride)
880
                : "%eax", "%ebx"
881
        );
882

    
883
/*
884
        {
885
        int x;
886
        src-= stride;
887
        for(x=0; x<BLOCK_SIZE; x++)
888
        {
889
                const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
890
                if(ABS(middleEnergy)< 8*QP)
891
                {
892
                        const int q=(src[l4] - src[l5])/2;
893
                        const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
894
                        const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
895

896
                        int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
897
                        d= MAX(d, 0);
898

899
                        d= (5*d + 32) >> 6;
900
                        d*= SIGN(-middleEnergy);
901

902
                        if(q>0)
903
                        {
904
                                d= d<0 ? 0 : d;
905
                                d= d>q ? q : d;
906
                        }
907
                        else
908
                        {
909
                                d= d>0 ? 0 : d;
910
                                d= d<q ? q : d;
911
                        }
912

913
                        src[l4]-= d;
914
                        src[l5]+= d;
915
                }
916
                src++;
917
        }
918
src-=8;
919
        for(x=0; x<8; x++)
920
        {
921
                int y;
922
                for(y=4; y<6; y++)
923
                {
924
                        int d= src[x+y*stride] - tmp[x+(y-4)*8];
925
                        int ad= ABS(d);
926
                        static int max=0;
927
                        static int sum=0;
928
                        static int num=0;
929
                        static int bias=0;
930

931
                        if(max<ad) max=ad;
932
                        sum+= ad>3 ? 1 : 0;
933
                        if(ad>3)
934
                        {
935
                                src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
936
                        }
937
                        if(y==4) bias+=d;
938
                        num++;
939
                        if(num%1000000 == 0)
940
                        {
941
                                printf(" %d %d %d %d\n", num, sum, max, bias);
942
                        }
943
                }
944
        }
945
}
946
*/
947
#elif defined (HAVE_MMX)
948
        src+= stride*4;
949

    
950
        asm volatile(
951
                "pxor %%mm7, %%mm7                                \n\t"
952
                "leal (%0, %1), %%eax                                \n\t"
953
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
954
//        0        1        2        3        4        5        6        7
955
//        %0        %0+%1        %0+2%1        eax+2%1        %0+4%1        eax+4%1        ebx+%1        ebx+2%1
956
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1
957

    
958
                "movq (%0), %%mm0                                \n\t"
959
                "movq %%mm0, %%mm1                                \n\t"
960
                "punpcklbw %%mm7, %%mm0                                \n\t" // low part of line 0
961
                "punpckhbw %%mm7, %%mm1                                \n\t" // high part of line 0
962

    
963
                "movq (%%eax), %%mm2                                \n\t"
964
                "movq %%mm2, %%mm3                                \n\t"
965
                "punpcklbw %%mm7, %%mm2                                \n\t" // low part of line 1
966
                "punpckhbw %%mm7, %%mm3                                \n\t" // high part of line 1
967

    
968
                "movq (%%eax, %1), %%mm4                        \n\t"
969
                "movq %%mm4, %%mm5                                \n\t"
970
                "punpcklbw %%mm7, %%mm4                                \n\t" // low part of line 2
971
                "punpckhbw %%mm7, %%mm5                                \n\t" // high part of line 2
972

    
973
                "paddw %%mm0, %%mm0                                \n\t" // 2L0
974
                "paddw %%mm1, %%mm1                                \n\t" // 2H0
975
                "psubw %%mm4, %%mm2                                \n\t" // L1 - L2
976
                "psubw %%mm5, %%mm3                                \n\t" // H1 - H2
977
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - L1 + L2
978
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - H1 + H2
979

    
980
                "psllw $2, %%mm2                                \n\t" // 4L1 - 4L2
981
                "psllw $2, %%mm3                                \n\t" // 4H1 - 4H2
982
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - 5L1 + 5L2
983
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - 5H1 + 5H2
984

    
985
                "movq (%%eax, %1, 2), %%mm2                        \n\t"
986
                "movq %%mm2, %%mm3                                \n\t"
987
                "punpcklbw %%mm7, %%mm2                                \n\t" // L3
988
                "punpckhbw %%mm7, %%mm3                                \n\t" // H3
989

    
990
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - 5L1 + 5L2 - L3
991
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - 5H1 + 5H2 - H3
992
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
993
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - 5H1 + 5H2 - 2H3
994
                "movq %%mm0, temp0                                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
995
                "movq %%mm1, temp1                                \n\t" // 2H0 - 5H1 + 5H2 - 2H3
996

    
997
                "movq (%0, %1, 4), %%mm0                        \n\t"
998
                "movq %%mm0, %%mm1                                \n\t"
999
                "punpcklbw %%mm7, %%mm0                                \n\t" // L4
1000
                "punpckhbw %%mm7, %%mm1                                \n\t" // H4
1001

    
1002
                "psubw %%mm0, %%mm2                                \n\t" // L3 - L4
1003
                "psubw %%mm1, %%mm3                                \n\t" // H3 - H4
1004
                "movq %%mm2, temp2                                \n\t" // L3 - L4
1005
                "movq %%mm3, temp3                                \n\t" // H3 - H4
1006
                "paddw %%mm4, %%mm4                                \n\t" // 2L2
1007
                "paddw %%mm5, %%mm5                                \n\t" // 2H2
1008
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - L3 + L4
1009
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - H3 + H4
1010

    
1011
                "psllw $2, %%mm2                                \n\t" // 4L3 - 4L4
1012
                "psllw $2, %%mm3                                \n\t" // 4H3 - 4H4
1013
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - 5L3 + 5L4
1014
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - 5H3 + 5H4
1015
//50 opcodes so far
1016
                "movq (%%ebx), %%mm2                                \n\t"
1017
                "movq %%mm2, %%mm3                                \n\t"
1018
                "punpcklbw %%mm7, %%mm2                                \n\t" // L5
1019
                "punpckhbw %%mm7, %%mm3                                \n\t" // H5
1020
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - 5L3 + 5L4 - L5
1021
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - 5H3 + 5H4 - H5
1022
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - 5L3 + 5L4 - 2L5
1023
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - 5H3 + 5H4 - 2H5
1024

    
1025
                "movq (%%ebx, %1), %%mm6                        \n\t"
1026
                "punpcklbw %%mm7, %%mm6                                \n\t" // L6
1027
                "psubw %%mm6, %%mm2                                \n\t" // L5 - L6
1028
                "movq (%%ebx, %1), %%mm6                        \n\t"
1029
                "punpckhbw %%mm7, %%mm6                                \n\t" // H6
1030
                "psubw %%mm6, %%mm3                                \n\t" // H5 - H6
1031

    
1032
                "paddw %%mm0, %%mm0                                \n\t" // 2L4
1033
                "paddw %%mm1, %%mm1                                \n\t" // 2H4
1034
                "psubw %%mm2, %%mm0                                \n\t" // 2L4 - L5 + L6
1035
                "psubw %%mm3, %%mm1                                \n\t" // 2H4 - H5 + H6
1036

    
1037
                "psllw $2, %%mm2                                \n\t" // 4L5 - 4L6
1038
                "psllw $2, %%mm3                                \n\t" // 4H5 - 4H6
1039
                "psubw %%mm2, %%mm0                                \n\t" // 2L4 - 5L5 + 5L6
1040
                "psubw %%mm3, %%mm1                                \n\t" // 2H4 - 5H5 + 5H6
1041

    
1042
                "movq (%%ebx, %1, 2), %%mm2                        \n\t"
1043
                "movq %%mm2, %%mm3                                \n\t"
1044
                "punpcklbw %%mm7, %%mm2                                \n\t" // L7
1045
                "punpckhbw %%mm7, %%mm3                                \n\t" // H7
1046

    
1047
                "paddw %%mm2, %%mm2                                \n\t" // 2L7
1048
                "paddw %%mm3, %%mm3                                \n\t" // 2H7
1049
                "psubw %%mm2, %%mm0                                \n\t" // 2L4 - 5L5 + 5L6 - 2L7
1050
                "psubw %%mm3, %%mm1                                \n\t" // 2H4 - 5H5 + 5H6 - 2H7
1051

    
1052
                "movq temp0, %%mm2                                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1053
                "movq temp1, %%mm3                                \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1054

    
1055
#ifdef HAVE_MMX2
1056
                "movq %%mm7, %%mm6                                \n\t" // 0
1057
                "psubw %%mm0, %%mm6                                \n\t"
1058
                "pmaxsw %%mm6, %%mm0                                \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1059
                "movq %%mm7, %%mm6                                \n\t" // 0
1060
                "psubw %%mm1, %%mm6                                \n\t"
1061
                "pmaxsw %%mm6, %%mm1                                \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1062
                "movq %%mm7, %%mm6                                \n\t" // 0
1063
                "psubw %%mm2, %%mm6                                \n\t"
1064
                "pmaxsw %%mm6, %%mm2                                \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1065
                "movq %%mm7, %%mm6                                \n\t" // 0
1066
                "psubw %%mm3, %%mm6                                \n\t"
1067
                "pmaxsw %%mm6, %%mm3                                \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1068
#else
1069
                "movq %%mm7, %%mm6                                \n\t" // 0
1070
                "pcmpgtw %%mm0, %%mm6                                \n\t"
1071
                "pxor %%mm6, %%mm0                                \n\t"
1072
                "psubw %%mm6, %%mm0                                \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1073
                "movq %%mm7, %%mm6                                \n\t" // 0
1074
                "pcmpgtw %%mm1, %%mm6                                \n\t"
1075
                "pxor %%mm6, %%mm1                                \n\t"
1076
                "psubw %%mm6, %%mm1                                \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1077
                "movq %%mm7, %%mm6                                \n\t" // 0
1078
                "pcmpgtw %%mm2, %%mm6                                \n\t"
1079
                "pxor %%mm6, %%mm2                                \n\t"
1080
                "psubw %%mm6, %%mm2                                \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1081
                "movq %%mm7, %%mm6                                \n\t" // 0
1082
                "pcmpgtw %%mm3, %%mm6                                \n\t"
1083
                "pxor %%mm6, %%mm3                                \n\t"
1084
                "psubw %%mm6, %%mm3                                \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1085
#endif
1086

    
1087
#ifdef HAVE_MMX2
1088
                "pminsw %%mm2, %%mm0                                \n\t"
1089
                "pminsw %%mm3, %%mm1                                \n\t"
1090
#else
1091
                "movq %%mm0, %%mm6                                \n\t"
1092
                "psubusw %%mm2, %%mm6                                \n\t"
1093
                "psubw %%mm6, %%mm0                                \n\t"
1094
                "movq %%mm1, %%mm6                                \n\t"
1095
                "psubusw %%mm3, %%mm6                                \n\t"
1096
                "psubw %%mm6, %%mm1                                \n\t"
1097
#endif
1098

    
1099
                "movq %%mm7, %%mm6                                \n\t" // 0
1100
                "pcmpgtw %%mm4, %%mm6                                \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1101
                "pxor %%mm6, %%mm4                                \n\t"
1102
                "psubw %%mm6, %%mm4                                \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1103
                "pcmpgtw %%mm5, %%mm7                                \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1104
                "pxor %%mm7, %%mm5                                \n\t"
1105
                "psubw %%mm7, %%mm5                                \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1106
// 100 opcodes
1107
                "movd %2, %%mm2                                        \n\t" // QP
1108
                "punpcklwd %%mm2, %%mm2                                \n\t"
1109
                "punpcklwd %%mm2, %%mm2                                \n\t"
1110
                "psllw $3, %%mm2                                \n\t" // 8QP
1111
                "movq %%mm2, %%mm3                                \n\t" // 8QP
1112
                "pcmpgtw %%mm4, %%mm2                                \n\t"
1113
                "pcmpgtw %%mm5, %%mm3                                \n\t"
1114
                "pand %%mm2, %%mm4                                \n\t"
1115
                "pand %%mm3, %%mm5                                \n\t"
1116

    
1117

    
1118
                "psubusw %%mm0, %%mm4                                \n\t" // hd
1119
                "psubusw %%mm1, %%mm5                                \n\t" // ld
1120

    
1121

    
1122
                "movq w05, %%mm2                                \n\t" // 5
1123
                "pmullw %%mm2, %%mm4                                \n\t"
1124
                "pmullw %%mm2, %%mm5                                \n\t"
1125
                "movq w20, %%mm2                                \n\t" // 32
1126
                "paddw %%mm2, %%mm4                                \n\t"
1127
                "paddw %%mm2, %%mm5                                \n\t"
1128
                "psrlw $6, %%mm4                                \n\t"
1129
                "psrlw $6, %%mm5                                \n\t"
1130

    
1131
/*
1132
                "movq w06, %%mm2                                \n\t" // 6
1133
                "paddw %%mm2, %%mm4                                \n\t"
1134
                "paddw %%mm2, %%mm5                                \n\t"
1135
                "movq w1400, %%mm2                                \n\t" // 1400h = 5120 = 5/64*2^16
1136
//FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
1137
                "pmulhw %%mm2, %%mm4                                \n\t" // hd/13
1138
                "pmulhw %%mm2, %%mm5                                \n\t" // ld/13
1139
*/
1140

    
1141
                "movq temp2, %%mm0                                \n\t" // L3 - L4
1142
                "movq temp3, %%mm1                                \n\t" // H3 - H4
1143

    
1144
                "pxor %%mm2, %%mm2                                \n\t"
1145
                "pxor %%mm3, %%mm3                                \n\t"
1146

    
1147
                "pcmpgtw %%mm0, %%mm2                                \n\t" // sign (L3-L4)
1148
                "pcmpgtw %%mm1, %%mm3                                \n\t" // sign (H3-H4)
1149
                "pxor %%mm2, %%mm0                                \n\t"
1150
                "pxor %%mm3, %%mm1                                \n\t"
1151
                "psubw %%mm2, %%mm0                                \n\t" // |L3-L4|
1152
                "psubw %%mm3, %%mm1                                \n\t" // |H3-H4|
1153
                "psrlw $1, %%mm0                                \n\t" // |L3 - L4|/2
1154
                "psrlw $1, %%mm1                                \n\t" // |H3 - H4|/2
1155

    
1156
                "pxor %%mm6, %%mm2                                \n\t"
1157
                "pxor %%mm7, %%mm3                                \n\t"
1158
                "pand %%mm2, %%mm4                                \n\t"
1159
                "pand %%mm3, %%mm5                                \n\t"
1160

    
1161
#ifdef HAVE_MMX2
1162
                "pminsw %%mm0, %%mm4                                \n\t"
1163
                "pminsw %%mm1, %%mm5                                \n\t"
1164
#else
1165
                "movq %%mm4, %%mm2                                \n\t"
1166
                "psubusw %%mm0, %%mm2                                \n\t"
1167
                "psubw %%mm2, %%mm4                                \n\t"
1168
                "movq %%mm5, %%mm2                                \n\t"
1169
                "psubusw %%mm1, %%mm2                                \n\t"
1170
                "psubw %%mm2, %%mm5                                \n\t"
1171
#endif
1172
                "pxor %%mm6, %%mm4                                \n\t"
1173
                "pxor %%mm7, %%mm5                                \n\t"
1174
                "psubw %%mm6, %%mm4                                \n\t"
1175
                "psubw %%mm7, %%mm5                                \n\t"
1176
                "packsswb %%mm5, %%mm4                                \n\t"
1177
                "movq (%%eax, %1, 2), %%mm0                        \n\t"
1178
                "paddb   %%mm4, %%mm0                                \n\t"
1179
                "movq %%mm0, (%%eax, %1, 2)                         \n\t"
1180
                "movq (%0, %1, 4), %%mm0                        \n\t"
1181
                "psubb %%mm4, %%mm0                                \n\t"
1182
                "movq %%mm0, (%0, %1, 4)                         \n\t"
1183

    
1184
                :
1185
                : "r" (src), "r" (stride), "r" (QP)
1186
                : "%eax", "%ebx"
1187
        );
1188
#else
1189
        const int l1= stride;
1190
        const int l2= stride + l1;
1191
        const int l3= stride + l2;
1192
        const int l4= stride + l3;
1193
        const int l5= stride + l4;
1194
        const int l6= stride + l5;
1195
        const int l7= stride + l6;
1196
        const int l8= stride + l7;
1197
//        const int l9= stride + l8;
1198
        int x;
1199
        src+= stride*3;
1200
        for(x=0; x<BLOCK_SIZE; x++)
1201
        {
1202
                const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1203
                if(ABS(middleEnergy) < 8*QP)
1204
                {
1205
                        const int q=(src[l4] - src[l5])/2;
1206
                        const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1207
                        const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1208

    
1209
                        int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1210
                        d= MAX(d, 0);
1211

    
1212
                        d= (5*d + 32) >> 6;
1213
                        d*= SIGN(-middleEnergy);
1214

    
1215
                        if(q>0)
1216
                        {
1217
                                d= d<0 ? 0 : d;
1218
                                d= d>q ? q : d;
1219
                        }
1220
                        else
1221
                        {
1222
                                d= d>0 ? 0 : d;
1223
                                d= d<q ? q : d;
1224
                        }
1225

    
1226
                        src[l4]-= d;
1227
                        src[l5]+= d;
1228
                }
1229
                src++;
1230
        }
1231
#endif
1232
}
1233

    
1234
static inline void RENAME(dering)(uint8_t src[], int stride, int QP)
1235
{
1236
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1237
        asm volatile(
1238
                "movq pQPb, %%mm0                                \n\t"
1239
                "paddusb %%mm0, %%mm0                                \n\t"
1240
                "movq %%mm0, pQPb2                                \n\t"
1241

    
1242
                "leal (%0, %1), %%eax                                \n\t"
1243
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
1244
//        0        1        2        3        4        5        6        7        8        9
1245
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
1246

    
1247
                "pcmpeqb %%mm7, %%mm7                                \n\t"
1248
                "pxor %%mm6, %%mm6                                \n\t"
1249
#undef FIND_MIN_MAX
1250
#ifdef HAVE_MMX2
1251
#define FIND_MIN_MAX(addr)\
1252
                "movq " #addr ", %%mm0                                \n\t"\
1253
                "pminub %%mm0, %%mm7                                \n\t"\
1254
                "pmaxub %%mm0, %%mm6                                \n\t"
1255
#else
1256
#define FIND_MIN_MAX(addr)\
1257
                "movq " #addr ", %%mm0                                \n\t"\
1258
                "movq %%mm7, %%mm1                                \n\t"\
1259
                "psubusb %%mm0, %%mm6                                \n\t"\
1260
                "paddb %%mm0, %%mm6                                \n\t"\
1261
                "psubusb %%mm0, %%mm1                                \n\t"\
1262
                "psubb %%mm1, %%mm7                                \n\t"
1263
#endif
1264

    
1265
FIND_MIN_MAX((%%eax))
1266
FIND_MIN_MAX((%%eax, %1))
1267
FIND_MIN_MAX((%%eax, %1, 2))
1268
FIND_MIN_MAX((%0, %1, 4))
1269
FIND_MIN_MAX((%%ebx))
1270
FIND_MIN_MAX((%%ebx, %1))
1271
FIND_MIN_MAX((%%ebx, %1, 2))
1272
FIND_MIN_MAX((%0, %1, 8))
1273

    
1274
                "movq %%mm7, %%mm4                                \n\t"
1275
                "psrlq $8, %%mm7                                \n\t"
1276
#ifdef HAVE_MMX2
1277
                "pminub %%mm4, %%mm7                                \n\t" // min of pixels
1278
                "pshufw $0xF9, %%mm7, %%mm4                        \n\t"
1279
                "pminub %%mm4, %%mm7                                \n\t" // min of pixels
1280
                "pshufw $0xFE, %%mm7, %%mm4                        \n\t"
1281
                "pminub %%mm4, %%mm7                                \n\t"
1282
#else
1283
                "movq %%mm7, %%mm1                                \n\t"
1284
                "psubusb %%mm4, %%mm1                                \n\t"
1285
                "psubb %%mm1, %%mm7                                \n\t"
1286
                "movq %%mm7, %%mm4                                \n\t"
1287
                "psrlq $16, %%mm7                                \n\t"
1288
                "movq %%mm7, %%mm1                                \n\t"
1289
                "psubusb %%mm4, %%mm1                                \n\t"
1290
                "psubb %%mm1, %%mm7                                \n\t"
1291
                "movq %%mm7, %%mm4                                \n\t"
1292
                "psrlq $32, %%mm7                                \n\t"
1293
                "movq %%mm7, %%mm1                                \n\t"
1294
                "psubusb %%mm4, %%mm1                                \n\t"
1295
                "psubb %%mm1, %%mm7                                \n\t"
1296
#endif
1297

    
1298

    
1299
                "movq %%mm6, %%mm4                                \n\t"
1300
                "psrlq $8, %%mm6                                \n\t"
1301
#ifdef HAVE_MMX2
1302
                "pmaxub %%mm4, %%mm6                                \n\t" // max of pixels
1303
                "pshufw $0xF9, %%mm6, %%mm4                        \n\t"
1304
                "pmaxub %%mm4, %%mm6                                \n\t"
1305
                "pshufw $0xFE, %%mm6, %%mm4                        \n\t"
1306
                "pmaxub %%mm4, %%mm6                                \n\t"
1307
#else
1308
                "psubusb %%mm4, %%mm6                                \n\t"
1309
                "paddb %%mm4, %%mm6                                \n\t"
1310
                "movq %%mm6, %%mm4                                \n\t"
1311
                "psrlq $16, %%mm6                                \n\t"
1312
                "psubusb %%mm4, %%mm6                                \n\t"
1313
                "paddb %%mm4, %%mm6                                \n\t"
1314
                "movq %%mm6, %%mm4                                \n\t"
1315
                "psrlq $32, %%mm6                                \n\t"
1316
                "psubusb %%mm4, %%mm6                                \n\t"
1317
                "paddb %%mm4, %%mm6                                \n\t"
1318
#endif
1319
                "movq %%mm6, %%mm0                                \n\t" // max
1320
                "psubb %%mm7, %%mm6                                \n\t" // max - min
1321
                "movd %%mm6, %%ecx                                \n\t"
1322
                "cmpb deringThreshold, %%cl                        \n\t"
1323
                " jb 1f                                                \n\t"
1324
                PAVGB(%%mm0, %%mm7)                                      // a=(max + min)/2
1325
                "punpcklbw %%mm7, %%mm7                                \n\t"
1326
                "punpcklbw %%mm7, %%mm7                                \n\t"
1327
                "punpcklbw %%mm7, %%mm7                                \n\t"
1328
                "movq %%mm7, temp0                                \n\t"
1329

    
1330
                "movq (%0), %%mm0                                \n\t" // L10
1331
                "movq %%mm0, %%mm1                                \n\t" // L10
1332
                "movq %%mm0, %%mm2                                \n\t" // L10
1333
                "psllq $8, %%mm1                                \n\t"
1334
                "psrlq $8, %%mm2                                \n\t"
1335
                "movd -4(%0), %%mm3                                \n\t"
1336
                "movd 8(%0), %%mm4                                \n\t"
1337
                "psrlq $24, %%mm3                                \n\t"
1338
                "psllq $56, %%mm4                                \n\t"
1339
                "por %%mm3, %%mm1                                \n\t" // L00
1340
                "por %%mm4, %%mm2                                \n\t" // L20
1341
                "movq %%mm1, %%mm3                                \n\t" // L00
1342
                PAVGB(%%mm2, %%mm1)                                      // (L20 + L00)/2
1343
                PAVGB(%%mm0, %%mm1)                                      // (L20 + L00 + 2L10)/4
1344
                "psubusb %%mm7, %%mm0                                \n\t"
1345
                "psubusb %%mm7, %%mm2                                \n\t"
1346
                "psubusb %%mm7, %%mm3                                \n\t"
1347
                "pcmpeqb b00, %%mm0                                \n\t" // L10 > a ? 0 : -1
1348
                "pcmpeqb b00, %%mm2                                \n\t" // L20 > a ? 0 : -1
1349
                "pcmpeqb b00, %%mm3                                \n\t" // L00 > a ? 0 : -1
1350
                "paddb %%mm2, %%mm0                                \n\t"
1351
                "paddb %%mm3, %%mm0                                \n\t"
1352

    
1353
                "movq (%%eax), %%mm2                                \n\t" // L11
1354
                "movq %%mm2, %%mm3                                \n\t" // L11
1355
                "movq %%mm2, %%mm4                                \n\t" // L11
1356
                "psllq $8, %%mm3                                \n\t"
1357
                "psrlq $8, %%mm4                                \n\t"
1358
                "movd -4(%%eax), %%mm5                                \n\t"
1359
                "movd 8(%%eax), %%mm6                                \n\t"
1360
                "psrlq $24, %%mm5                                \n\t"
1361
                "psllq $56, %%mm6                                \n\t"
1362
                "por %%mm5, %%mm3                                \n\t" // L01
1363
                "por %%mm6, %%mm4                                \n\t" // L21
1364
                "movq %%mm3, %%mm5                                \n\t" // L01
1365
                PAVGB(%%mm4, %%mm3)                                      // (L21 + L01)/2
1366
                PAVGB(%%mm2, %%mm3)                                      // (L21 + L01 + 2L11)/4
1367
                "psubusb %%mm7, %%mm2                                \n\t"
1368
                "psubusb %%mm7, %%mm4                                \n\t"
1369
                "psubusb %%mm7, %%mm5                                \n\t"
1370
                "pcmpeqb b00, %%mm2                                \n\t" // L11 > a ? 0 : -1
1371
                "pcmpeqb b00, %%mm4                                \n\t" // L21 > a ? 0 : -1
1372
                "pcmpeqb b00, %%mm5                                \n\t" // L01 > a ? 0 : -1
1373
                "paddb %%mm4, %%mm2                                \n\t"
1374
                "paddb %%mm5, %%mm2                                \n\t"
1375
// 0, 2, 3, 1
1376
#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1377
                "movq " #src ", " #sx "                                \n\t" /* src[0] */\
1378
                "movq " #sx ", " #lx "                                \n\t" /* src[0] */\
1379
                "movq " #sx ", " #t0 "                                \n\t" /* src[0] */\
1380
                "psllq $8, " #lx "                                \n\t"\
1381
                "psrlq $8, " #t0 "                                \n\t"\
1382
                "movd -4" #src ", " #t1 "                        \n\t"\
1383
                "psrlq $24, " #t1 "                                \n\t"\
1384
                "por " #t1 ", " #lx "                                \n\t" /* src[-1] */\
1385
                "movd 8" #src ", " #t1 "                        \n\t"\
1386
                "psllq $56, " #t1 "                                \n\t"\
1387
                "por " #t1 ", " #t0 "                                \n\t" /* src[+1] */\
1388
                "movq " #lx ", " #t1 "                                \n\t" /* src[-1] */\
1389
                PAVGB(t0, lx)                                              /* (src[-1] + src[+1])/2 */\
1390
                PAVGB(sx, lx)                                      /* (src[-1] + 2src[0] + src[+1])/4 */\
1391
                PAVGB(lx, pplx)                                             \
1392
                "movq " #lx ", temp1                                \n\t"\
1393
                "movq temp0, " #lx "                                \n\t"\
1394
                "psubusb " #lx ", " #t1 "                        \n\t"\
1395
                "psubusb " #lx ", " #t0 "                        \n\t"\
1396
                "psubusb " #lx ", " #sx "                        \n\t"\
1397
                "movq b00, " #lx "                                \n\t"\
1398
                "pcmpeqb " #lx ", " #t1 "                        \n\t" /* src[-1] > a ? 0 : -1*/\
1399
                "pcmpeqb " #lx ", " #t0 "                        \n\t" /* src[+1] > a ? 0 : -1*/\
1400
                "pcmpeqb " #lx ", " #sx "                        \n\t" /* src[0]  > a ? 0 : -1*/\
1401
                "paddb " #t1 ", " #t0 "                                \n\t"\
1402
                "paddb " #t0 ", " #sx "                                \n\t"\
1403
\
1404
                PAVGB(plx, pplx)                                      /* filtered */\
1405
                "movq " #dst ", " #t0 "                                \n\t" /* dst */\
1406
                "movq " #t0 ", " #t1 "                                \n\t" /* dst */\
1407
                "psubusb pQPb2, " #t0 "                                \n\t"\
1408
                "paddusb pQPb2, " #t1 "                                \n\t"\
1409
                PMAXUB(t0, pplx)\
1410
                PMINUB(t1, pplx, t0)\
1411
                "paddb " #sx ", " #ppsx "                        \n\t"\
1412
                "paddb " #psx ", " #ppsx "                        \n\t"\
1413
        "#paddb b02, " #ppsx "                                \n\t"\
1414
                "pand b08, " #ppsx "                                \n\t"\
1415
                "pcmpeqb " #lx ", " #ppsx "                        \n\t"\
1416
                "pand " #ppsx ", " #pplx "                        \n\t"\
1417
                "pandn " #dst ", " #ppsx "                        \n\t"\
1418
                "por " #pplx ", " #ppsx "                        \n\t"\
1419
                "movq " #ppsx ", " #dst "                        \n\t"\
1420
                "movq temp1, " #lx "                                \n\t"
1421

    
1422
/*
1423
0000000
1424
1111111
1425

1426
1111110
1427
1111101
1428
1111100
1429
1111011
1430
1111010
1431
1111001
1432

1433
1111000
1434
1110111
1435

1436
*/
1437
//DERING_CORE(dst,src                  ,ppsx ,psx  ,sx   ,pplx ,plx  ,lx   ,t0   ,t1)
1438
DERING_CORE((%%eax),(%%eax, %1)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1439
DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1440
DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1441
DERING_CORE((%0, %1, 4),(%%ebx)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1442
DERING_CORE((%%ebx),(%%ebx, %1)        ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1443
DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1444
DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1445
DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1446

    
1447
                "1:                        \n\t"
1448
                : : "r" (src), "r" (stride), "r" (QP)
1449
                : "%eax", "%ebx", "%ecx"
1450
        );
1451
#else
1452
        int y;
1453
        int min=255;
1454
        int max=0;
1455
        int avg;
1456
        uint8_t *p;
1457
        int s[10];
1458

    
1459
        for(y=1; y<9; y++)
1460
        {
1461
                int x;
1462
                p= src + stride*y;
1463
                for(x=1; x<9; x++)
1464
                {
1465
                        p++;
1466
                        if(*p > max) max= *p;
1467
                        if(*p < min) min= *p;
1468
                }
1469
        }
1470
        avg= (min + max + 1)/2;
1471

    
1472
        if(max - min <deringThreshold) return;
1473

    
1474
        for(y=0; y<10; y++)
1475
        {
1476
                int x;
1477
                int t = 0;
1478
                p= src + stride*y;
1479
                for(x=0; x<10; x++)
1480
                {
1481
                        if(*p > avg) t |= (1<<x);
1482
                        p++;
1483
                }
1484
                t |= (~t)<<16;
1485
                t &= (t<<1) & (t>>1);
1486
                s[y] = t;
1487
        }
1488

    
1489
        for(y=1; y<9; y++)
1490
        {
1491
                int x;
1492
                int t = s[y-1] & s[y] & s[y+1];
1493
                t|= t>>16;
1494

    
1495
                p= src + stride*y;
1496
                for(x=1; x<9; x++)
1497
                {
1498
                        p++;
1499
                        if(t & (1<<x))
1500
                        {
1501
                                int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1502
                                      +2*(*(p     -1)) + 4*(*p         ) + 2*(*(p     +1))
1503
                                      +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
1504
                                f= (f + 8)>>4;
1505

    
1506
#ifdef DEBUG_DERING_THRESHOLD
1507
                                asm volatile("emms\n\t":);
1508
                                {
1509
                                static long long numPixels=0;
1510
                                if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1511
//                                if((max-min)<20 || (max-min)*QP<200)
1512
//                                if((max-min)*QP < 500)
1513
//                                if(max-min<QP/2)
1514
                                if(max-min < 20)
1515
                                {
1516
                                        static int numSkiped=0;
1517
                                        static int errorSum=0;
1518
                                        static int worstQP=0;
1519
                                        static int worstRange=0;
1520
                                        static int worstDiff=0;
1521
                                        int diff= (f - *p);
1522
                                        int absDiff= ABS(diff);
1523
                                        int error= diff*diff;
1524

    
1525
                                        if(x==1 || x==8 || y==1 || y==8) continue;
1526

    
1527
                                        numSkiped++;
1528
                                        if(absDiff > worstDiff)
1529
                                        {
1530
                                                worstDiff= absDiff;
1531
                                                worstQP= QP;
1532
                                                worstRange= max-min;
1533
                                        }
1534
                                        errorSum+= error;
1535

    
1536
                                        if(1024LL*1024LL*1024LL % numSkiped == 0)
1537
                                        {
1538
                                                printf( "sum:%1.3f, skip:%d, wQP:%d, "
1539
                                                        "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1540
                                                        (float)errorSum/numSkiped, numSkiped, worstQP, worstRange,
1541
                                                        worstDiff, (float)numSkiped/numPixels);
1542
                                        }
1543
                                }
1544
                                }
1545
#endif
1546
                                if     (*p + 2*QP < f) *p= *p + 2*QP;
1547
                                else if(*p - 2*QP > f) *p= *p - 2*QP;
1548
                                else *p=f;
1549
                        }
1550
                }
1551
        }
1552
#ifdef DEBUG_DERING_THRESHOLD
1553
        if(max-min < 20)
1554
        {
1555
                for(y=1; y<9; y++)
1556
                {
1557
                        int x;
1558
                        int t = 0;
1559
                        p= src + stride*y;
1560
                        for(x=1; x<9; x++)
1561
                        {
1562
                                p++;
1563
                                *p = MIN(*p + 20, 255);
1564
                        }
1565
                }
1566
//                src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1567
        }
1568
#endif
1569
#endif
1570
}
1571

    
1572
/**
1573
 * Deinterlaces the given block
1574
 * will be called for every 8x8 block and can read & write from line 4-15
1575
 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1576
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1577
 */
1578
static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
1579
{
1580
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1581
        src+= 4*stride;
1582
        asm volatile(
1583
                "leal (%0, %1), %%eax                                \n\t"
1584
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
1585
//        0        1        2        3        4        5        6        7        8        9
1586
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
1587

    
1588
                "movq (%0), %%mm0                                \n\t"
1589
                "movq (%%eax, %1), %%mm1                        \n\t"
1590
                PAVGB(%%mm1, %%mm0)
1591
                "movq %%mm0, (%%eax)                                \n\t"
1592
                "movq (%0, %1, 4), %%mm0                        \n\t"
1593
                PAVGB(%%mm0, %%mm1)
1594
                "movq %%mm1, (%%eax, %1, 2)                        \n\t"
1595
                "movq (%%ebx, %1), %%mm1                        \n\t"
1596
                PAVGB(%%mm1, %%mm0)
1597
                "movq %%mm0, (%%ebx)                                \n\t"
1598
                "movq (%0, %1, 8), %%mm0                        \n\t"
1599
                PAVGB(%%mm0, %%mm1)
1600
                "movq %%mm1, (%%ebx, %1, 2)                        \n\t"
1601

    
1602
                : : "r" (src), "r" (stride)
1603
                : "%eax", "%ebx"
1604
        );
1605
#else
1606
        int x;
1607
        src+= 4*stride;
1608
        for(x=0; x<8; x++)
1609
        {
1610
                src[stride]   = (src[0]        + src[stride*2])>>1;
1611
                src[stride*3] = (src[stride*2] + src[stride*4])>>1;
1612
                src[stride*5] = (src[stride*4] + src[stride*6])>>1;
1613
                src[stride*7] = (src[stride*6] + src[stride*8])>>1;
1614
                src++;
1615
        }
1616
#endif
1617
}
1618

    
1619
/**
1620
 * Deinterlaces the given block
1621
 * will be called for every 8x8 block and can read & write from line 4-15
1622
 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1623
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1624
 * this filter will read lines 3-15 and write 7-13
1625
 * no cliping in C version
1626
 */
1627
static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
1628
{
1629
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1630
        src+= stride*3;
1631
        asm volatile(
1632
                "leal (%0, %1), %%eax                                \n\t"
1633
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
1634
                "leal (%%ebx, %1, 4), %%ecx                        \n\t"
1635
                "addl %1, %%ecx                                        \n\t"
1636
                "pxor %%mm7, %%mm7                                \n\t"
1637
//        0        1        2        3        4        5        6        7        8        9        10
1638
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1 ecx
1639

    
1640
#define DEINT_CUBIC(a,b,c,d,e)\
1641
                "movq " #a ", %%mm0                                \n\t"\
1642
                "movq " #b ", %%mm1                                \n\t"\
1643
                "movq " #d ", %%mm2                                \n\t"\
1644
                "movq " #e ", %%mm3                                \n\t"\
1645
                PAVGB(%%mm2, %%mm1)                                        /* (b+d) /2 */\
1646
                PAVGB(%%mm3, %%mm0)                                        /* a(a+e) /2 */\
1647
                "movq %%mm0, %%mm2                                \n\t"\
1648
                "punpcklbw %%mm7, %%mm0                                \n\t"\
1649
                "punpckhbw %%mm7, %%mm2                                \n\t"\
1650
                "movq %%mm1, %%mm3                                \n\t"\
1651
                "punpcklbw %%mm7, %%mm1                                \n\t"\
1652
                "punpckhbw %%mm7, %%mm3                                \n\t"\
1653
                "psubw %%mm1, %%mm0                                \n\t"        /* L(a+e - (b+d))/2 */\
1654
                "psubw %%mm3, %%mm2                                \n\t"        /* H(a+e - (b+d))/2 */\
1655
                "psraw $3, %%mm0                                \n\t"        /* L(a+e - (b+d))/16 */\
1656
                "psraw $3, %%mm2                                \n\t"        /* H(a+e - (b+d))/16 */\
1657
                "psubw %%mm0, %%mm1                                \n\t"        /* L(9b + 9d - a - e)/16 */\
1658
                "psubw %%mm2, %%mm3                                \n\t"        /* H(9b + 9d - a - e)/16 */\
1659
                "packuswb %%mm3, %%mm1                                \n\t"\
1660
                "movq %%mm1, " #c "                                \n\t"
1661

    
1662
DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
1663
DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
1664
DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
1665
DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))
1666

    
1667
                : : "r" (src), "r" (stride)
1668
                : "%eax", "%ebx", "ecx"
1669
        );
1670
#else
1671
        int x;
1672
        src+= stride*3;
1673
        for(x=0; x<8; x++)
1674
        {
1675
                src[stride*3] = (-src[0]        + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4;
1676
                src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4;
1677
                src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4;
1678
                src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4;
1679
                src++;
1680
        }
1681
#endif
1682
}
1683

    
1684
/**
1685
 * Deinterlaces the given block
1686
 * will be called for every 8x8 block and can read & write from line 4-15
1687
 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1688
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1689
 * will shift the image up by 1 line (FIXME if this is a problem)
1690
 * this filter will read lines 4-13 and write 4-11
1691
 */
1692
static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride)
1693
{
1694
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1695
        src+= 4*stride;
1696
        asm volatile(
1697
                "leal (%0, %1), %%eax                                \n\t"
1698
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
1699
//        0        1        2        3        4        5        6        7        8        9
1700
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
1701

    
1702
                "movq (%0), %%mm0                                \n\t" // L0
1703
                "movq (%%eax, %1), %%mm1                        \n\t" // L2
1704
                PAVGB(%%mm1, %%mm0)                                      // L0+L2
1705
                "movq (%%eax), %%mm2                                \n\t" // L1
1706
                PAVGB(%%mm2, %%mm0)
1707
                "movq %%mm0, (%0)                                \n\t"
1708
                "movq (%%eax, %1, 2), %%mm0                        \n\t" // L3
1709
                PAVGB(%%mm0, %%mm2)                                      // L1+L3
1710
                PAVGB(%%mm1, %%mm2)                                      // 2L2 + L1 + L3
1711
                "movq %%mm2, (%%eax)                                \n\t"
1712
                "movq (%0, %1, 4), %%mm2                        \n\t" // L4
1713
                PAVGB(%%mm2, %%mm1)                                      // L2+L4
1714
                PAVGB(%%mm0, %%mm1)                                      // 2L3 + L2 + L4
1715
                "movq %%mm1, (%%eax, %1)                        \n\t"
1716
                "movq (%%ebx), %%mm1                                \n\t" // L5
1717
                PAVGB(%%mm1, %%mm0)                                      // L3+L5
1718
                PAVGB(%%mm2, %%mm0)                                      // 2L4 + L3 + L5
1719
                "movq %%mm0, (%%eax, %1, 2)                        \n\t"
1720
                "movq (%%ebx, %1), %%mm0                        \n\t" // L6
1721
                PAVGB(%%mm0, %%mm2)                                      // L4+L6
1722
                PAVGB(%%mm1, %%mm2)                                      // 2L5 + L4 + L6
1723
                "movq %%mm2, (%0, %1, 4)                        \n\t"
1724
                "movq (%%ebx, %1, 2), %%mm2                        \n\t" // L7
1725
                PAVGB(%%mm2, %%mm1)                                      // L5+L7
1726
                PAVGB(%%mm0, %%mm1)                                      // 2L6 + L5 + L7
1727
                "movq %%mm1, (%%ebx)                                \n\t"
1728
                "movq (%0, %1, 8), %%mm1                        \n\t" // L8
1729
                PAVGB(%%mm1, %%mm0)                                      // L6+L8
1730
                PAVGB(%%mm2, %%mm0)                                      // 2L7 + L6 + L8
1731
                "movq %%mm0, (%%ebx, %1)                        \n\t"
1732
                "movq (%%ebx, %1, 4), %%mm0                        \n\t" // L9
1733
                PAVGB(%%mm0, %%mm2)                                      // L7+L9
1734
                PAVGB(%%mm1, %%mm2)                                      // 2L8 + L7 + L9
1735
                "movq %%mm2, (%%ebx, %1, 2)                        \n\t"
1736

    
1737

    
1738
                : : "r" (src), "r" (stride)
1739
                : "%eax", "%ebx"
1740
        );
1741
#else
1742
        int x;
1743
        src+= 4*stride;
1744
        for(x=0; x<8; x++)
1745
        {
1746
                src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2;
1747
                src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
1748
                src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
1749
                src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
1750
                src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
1751
                src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
1752
                src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
1753
                src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
1754
                src++;
1755
        }
1756
#endif
1757
}
1758

    
1759
/**
1760
 * Deinterlaces the given block
1761
 * will be called for every 8x8 block and can read & write from line 4-15,
1762
 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1763
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1764
 */
1765
static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
1766
{
1767
#ifdef HAVE_MMX
1768
        src+= 4*stride;
1769
#ifdef HAVE_MMX2
1770
        asm volatile(
1771
                "leal (%0, %1), %%eax                                \n\t"
1772
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
1773
//        0        1        2        3        4        5        6        7        8        9
1774
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
1775

    
1776
                "movq (%0), %%mm0                                \n\t" //
1777
                "movq (%%eax, %1), %%mm2                        \n\t" //
1778
                "movq (%%eax), %%mm1                                \n\t" //
1779
                "movq %%mm0, %%mm3                                \n\t"
1780
                "pmaxub %%mm1, %%mm0                                \n\t" //
1781
                "pminub %%mm3, %%mm1                                \n\t" //
1782
                "pmaxub %%mm2, %%mm1                                \n\t" //
1783
                "pminub %%mm1, %%mm0                                \n\t"
1784
                "movq %%mm0, (%%eax)                                \n\t"
1785

    
1786
                "movq (%0, %1, 4), %%mm0                        \n\t" //
1787
                "movq (%%eax, %1, 2), %%mm1                        \n\t" //
1788
                "movq %%mm2, %%mm3                                \n\t"
1789
                "pmaxub %%mm1, %%mm2                                \n\t" //
1790
                "pminub %%mm3, %%mm1                                \n\t" //
1791
                "pmaxub %%mm0, %%mm1                                \n\t" //
1792
                "pminub %%mm1, %%mm2                                \n\t"
1793
                "movq %%mm2, (%%eax, %1, 2)                        \n\t"
1794

    
1795
                "movq (%%ebx), %%mm2                                \n\t" //
1796
                "movq (%%ebx, %1), %%mm1                        \n\t" //
1797
                "movq %%mm2, %%mm3                                \n\t"
1798
                "pmaxub %%mm0, %%mm2                                \n\t" //
1799
                "pminub %%mm3, %%mm0                                \n\t" //
1800
                "pmaxub %%mm1, %%mm0                                \n\t" //
1801
                "pminub %%mm0, %%mm2                                \n\t"
1802
                "movq %%mm2, (%%ebx)                                \n\t"
1803

    
1804
                "movq (%%ebx, %1, 2), %%mm2                        \n\t" //
1805
                "movq (%0, %1, 8), %%mm0                        \n\t" //
1806
                "movq %%mm2, %%mm3                                \n\t"
1807
                "pmaxub %%mm0, %%mm2                                \n\t" //
1808
                "pminub %%mm3, %%mm0                                \n\t" //
1809
                "pmaxub %%mm1, %%mm0                                \n\t" //
1810
                "pminub %%mm0, %%mm2                                \n\t"
1811
                "movq %%mm2, (%%ebx, %1, 2)                        \n\t"
1812

    
1813

    
1814
                : : "r" (src), "r" (stride)
1815
                : "%eax", "%ebx"
1816
        );
1817

    
1818
#else // MMX without MMX2
1819
        asm volatile(
1820
                "leal (%0, %1), %%eax                                \n\t"
1821
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
1822
//        0        1        2        3        4        5        6        7        8        9
1823
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
1824
                "pxor %%mm7, %%mm7                                \n\t"
1825

    
1826
#define MEDIAN(a,b,c)\
1827
                "movq " #a ", %%mm0                                \n\t"\
1828
                "movq " #b ", %%mm2                                \n\t"\
1829
                "movq " #c ", %%mm1                                \n\t"\
1830
                "movq %%mm0, %%mm3                                \n\t"\
1831
                "movq %%mm1, %%mm4                                \n\t"\
1832
                "movq %%mm2, %%mm5                                \n\t"\
1833
                "psubusb %%mm1, %%mm3                                \n\t"\
1834
                "psubusb %%mm2, %%mm4                                \n\t"\
1835
                "psubusb %%mm0, %%mm5                                \n\t"\
1836
                "pcmpeqb %%mm7, %%mm3                                \n\t"\
1837
                "pcmpeqb %%mm7, %%mm4                                \n\t"\
1838
                "pcmpeqb %%mm7, %%mm5                                \n\t"\
1839
                "movq %%mm3, %%mm6                                \n\t"\
1840
                "pxor %%mm4, %%mm3                                \n\t"\
1841
                "pxor %%mm5, %%mm4                                \n\t"\
1842
                "pxor %%mm6, %%mm5                                \n\t"\
1843
                "por %%mm3, %%mm1                                \n\t"\
1844
                "por %%mm4, %%mm2                                \n\t"\
1845
                "por %%mm5, %%mm0                                \n\t"\
1846
                "pand %%mm2, %%mm0                                \n\t"\
1847
                "pand %%mm1, %%mm0                                \n\t"\
1848
                "movq %%mm0, " #b "                                \n\t"
1849

    
1850
MEDIAN((%0), (%%eax), (%%eax, %1))
1851
MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
1852
MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
1853
MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8))
1854

    
1855
                : : "r" (src), "r" (stride)
1856
                : "%eax", "%ebx"
1857
        );
1858
#endif // MMX
1859
#else
1860
        //FIXME
1861
        int x;
1862
        src+= 4*stride;
1863
        for(x=0; x<8; x++)
1864
        {
1865
                src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2;
1866
                src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
1867
                src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
1868
                src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
1869
                src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
1870
                src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
1871
                src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
1872
                src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
1873
                src++;
1874
        }
1875
#endif
1876
}
1877

    
1878
#ifdef HAVE_MMX
1879
/**
1880
 * transposes and shift the given 8x8 Block into dst1 and dst2
1881
 */
1882
static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
1883
{
1884
        asm(
1885
                "leal (%0, %1), %%eax                                \n\t"
1886
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
1887
//        0        1        2        3        4        5        6        7        8        9
1888
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
1889
                "movq (%0), %%mm0                \n\t" // 12345678
1890
                "movq (%%eax), %%mm1                \n\t" // abcdefgh
1891
                "movq %%mm0, %%mm2                \n\t" // 12345678
1892
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
1893
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
1894

    
1895
                "movq (%%eax, %1), %%mm1        \n\t"
1896
                "movq (%%eax, %1, 2), %%mm3        \n\t"
1897
                "movq %%mm1, %%mm4                \n\t"
1898
                "punpcklbw %%mm3, %%mm1                \n\t"
1899
                "punpckhbw %%mm3, %%mm4                \n\t"
1900

    
1901
                "movq %%mm0, %%mm3                \n\t"
1902
                "punpcklwd %%mm1, %%mm0                \n\t"
1903
                "punpckhwd %%mm1, %%mm3                \n\t"
1904
                "movq %%mm2, %%mm1                \n\t"
1905
                "punpcklwd %%mm4, %%mm2                \n\t"
1906
                "punpckhwd %%mm4, %%mm1                \n\t"
1907

    
1908
                "movd %%mm0, 128(%2)                \n\t"
1909
                "psrlq $32, %%mm0                \n\t"
1910
                "movd %%mm0, 144(%2)                \n\t"
1911
                "movd %%mm3, 160(%2)                \n\t"
1912
                "psrlq $32, %%mm3                \n\t"
1913
                "movd %%mm3, 176(%2)                \n\t"
1914
                "movd %%mm3, 48(%3)                \n\t"
1915
                "movd %%mm2, 192(%2)                \n\t"
1916
                "movd %%mm2, 64(%3)                \n\t"
1917
                "psrlq $32, %%mm2                \n\t"
1918
                "movd %%mm2, 80(%3)                \n\t"
1919
                "movd %%mm1, 96(%3)                \n\t"
1920
                "psrlq $32, %%mm1                \n\t"
1921
                "movd %%mm1, 112(%3)                \n\t"
1922

    
1923
                "movq (%0, %1, 4), %%mm0        \n\t" // 12345678
1924
                "movq (%%ebx), %%mm1                \n\t" // abcdefgh
1925
                "movq %%mm0, %%mm2                \n\t" // 12345678
1926
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
1927
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
1928

    
1929
                "movq (%%ebx, %1), %%mm1        \n\t"
1930
                "movq (%%ebx, %1, 2), %%mm3        \n\t"
1931
                "movq %%mm1, %%mm4                \n\t"
1932
                "punpcklbw %%mm3, %%mm1                \n\t"
1933
                "punpckhbw %%mm3, %%mm4                \n\t"
1934

    
1935
                "movq %%mm0, %%mm3                \n\t"
1936
                "punpcklwd %%mm1, %%mm0                \n\t"
1937
                "punpckhwd %%mm1, %%mm3                \n\t"
1938
                "movq %%mm2, %%mm1                \n\t"
1939
                "punpcklwd %%mm4, %%mm2                \n\t"
1940
                "punpckhwd %%mm4, %%mm1                \n\t"
1941

    
1942
                "movd %%mm0, 132(%2)                \n\t"
1943
                "psrlq $32, %%mm0                \n\t"
1944
                "movd %%mm0, 148(%2)                \n\t"
1945
                "movd %%mm3, 164(%2)                \n\t"
1946
                "psrlq $32, %%mm3                \n\t"
1947
                "movd %%mm3, 180(%2)                \n\t"
1948
                "movd %%mm3, 52(%3)                \n\t"
1949
                "movd %%mm2, 196(%2)                \n\t"
1950
                "movd %%mm2, 68(%3)                \n\t"
1951
                "psrlq $32, %%mm2                \n\t"
1952
                "movd %%mm2, 84(%3)                \n\t"
1953
                "movd %%mm1, 100(%3)                \n\t"
1954
                "psrlq $32, %%mm1                \n\t"
1955
                "movd %%mm1, 116(%3)                \n\t"
1956

    
1957

    
1958
        :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
1959
        : "%eax", "%ebx"
1960
        );
1961
}
1962

    
1963
/**
1964
 * transposes the given 8x8 block
1965
 */
1966
static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
1967
{
1968
        asm(
1969
                "leal (%0, %1), %%eax                                \n\t"
1970
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
1971
//        0        1        2        3        4        5        6        7        8        9
1972
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
1973
                "movq (%2), %%mm0                \n\t" // 12345678
1974
                "movq 16(%2), %%mm1                \n\t" // abcdefgh
1975
                "movq %%mm0, %%mm2                \n\t" // 12345678
1976
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
1977
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
1978

    
1979
                "movq 32(%2), %%mm1                \n\t"
1980
                "movq 48(%2), %%mm3                \n\t"
1981
                "movq %%mm1, %%mm4                \n\t"
1982
                "punpcklbw %%mm3, %%mm1                \n\t"
1983
                "punpckhbw %%mm3, %%mm4                \n\t"
1984

    
1985
                "movq %%mm0, %%mm3                \n\t"
1986
                "punpcklwd %%mm1, %%mm0                \n\t"
1987
                "punpckhwd %%mm1, %%mm3                \n\t"
1988
                "movq %%mm2, %%mm1                \n\t"
1989
                "punpcklwd %%mm4, %%mm2                \n\t"
1990
                "punpckhwd %%mm4, %%mm1                \n\t"
1991

    
1992
                "movd %%mm0, (%0)                \n\t"
1993
                "psrlq $32, %%mm0                \n\t"
1994
                "movd %%mm0, (%%eax)                \n\t"
1995
                "movd %%mm3, (%%eax, %1)        \n\t"
1996
                "psrlq $32, %%mm3                \n\t"
1997
                "movd %%mm3, (%%eax, %1, 2)        \n\t"
1998
                "movd %%mm2, (%0, %1, 4)        \n\t"
1999
                "psrlq $32, %%mm2                \n\t"
2000
                "movd %%mm2, (%%ebx)                \n\t"
2001
                "movd %%mm1, (%%ebx, %1)        \n\t"
2002
                "psrlq $32, %%mm1                \n\t"
2003
                "movd %%mm1, (%%ebx, %1, 2)        \n\t"
2004

    
2005

    
2006
                "movq 64(%2), %%mm0                \n\t" // 12345678
2007
                "movq 80(%2), %%mm1                \n\t" // abcdefgh
2008
                "movq %%mm0, %%mm2                \n\t" // 12345678
2009
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
2010
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
2011

    
2012
                "movq 96(%2), %%mm1                \n\t"
2013
                "movq 112(%2), %%mm3                \n\t"
2014
                "movq %%mm1, %%mm4                \n\t"
2015
                "punpcklbw %%mm3, %%mm1                \n\t"
2016
                "punpckhbw %%mm3, %%mm4                \n\t"
2017

    
2018
                "movq %%mm0, %%mm3                \n\t"
2019
                "punpcklwd %%mm1, %%mm0                \n\t"
2020
                "punpckhwd %%mm1, %%mm3                \n\t"
2021
                "movq %%mm2, %%mm1                \n\t"
2022
                "punpcklwd %%mm4, %%mm2                \n\t"
2023
                "punpckhwd %%mm4, %%mm1                \n\t"
2024

    
2025
                "movd %%mm0, 4(%0)                \n\t"
2026
                "psrlq $32, %%mm0                \n\t"
2027
                "movd %%mm0, 4(%%eax)                \n\t"
2028
                "movd %%mm3, 4(%%eax, %1)        \n\t"
2029
                "psrlq $32, %%mm3                \n\t"
2030
                "movd %%mm3, 4(%%eax, %1, 2)        \n\t"
2031
                "movd %%mm2, 4(%0, %1, 4)        \n\t"
2032
                "psrlq $32, %%mm2                \n\t"
2033
                "movd %%mm2, 4(%%ebx)                \n\t"
2034
                "movd %%mm1, 4(%%ebx, %1)        \n\t"
2035
                "psrlq $32, %%mm1                \n\t"
2036
                "movd %%mm1, 4(%%ebx, %1, 2)        \n\t"
2037

    
2038
        :: "r" (dst), "r" (dstStride), "r" (src)
2039
        : "%eax", "%ebx"
2040
        );
2041
}
2042
#endif
2043
//static int test=0;
2044

    
2045
static void inline RENAME(tempNoiseReducer)(uint8_t *src, int stride,
2046
                                    uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
2047
{
2048
#define FAST_L2_DIFF
2049
//#define L1_DIFF //u should change the thresholds too if u try that one
2050
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2051
        asm volatile(
2052
                "leal (%2, %2, 2), %%eax                        \n\t" // 3*stride
2053
                "leal (%2, %2, 4), %%ebx                        \n\t" // 5*stride
2054
                "leal (%%ebx, %2, 2), %%ecx                        \n\t" // 7*stride
2055
//        0        1        2        3        4        5        6        7        8        9
2056
//        %x        %x+%2        %x+2%2        %x+eax        %x+4%2        %x+ebx        %x+2eax        %x+ecx        %x+8%2
2057
//FIXME reorder?
2058
#ifdef L1_DIFF //needs mmx2
2059
                "movq (%0), %%mm0                                \n\t" // L0
2060
                "psadbw (%1), %%mm0                                \n\t" // |L0-R0|
2061
                "movq (%0, %2), %%mm1                                \n\t" // L1
2062
                "psadbw (%1, %2), %%mm1                                \n\t" // |L1-R1|
2063
                "movq (%0, %2, 2), %%mm2                        \n\t" // L2
2064
                "psadbw (%1, %2, 2), %%mm2                        \n\t" // |L2-R2|
2065
                "movq (%0, %%eax), %%mm3                        \n\t" // L3
2066
                "psadbw (%1, %%eax), %%mm3                        \n\t" // |L3-R3|
2067

    
2068
                "movq (%0, %2, 4), %%mm4                        \n\t" // L4
2069
                "paddw %%mm1, %%mm0                                \n\t"
2070
                "psadbw (%1, %2, 4), %%mm4                        \n\t" // |L4-R4|
2071
                "movq (%0, %%ebx), %%mm5                        \n\t" // L5
2072
                "paddw %%mm2, %%mm0                                \n\t"
2073
                "psadbw (%1, %%ebx), %%mm5                        \n\t" // |L5-R5|
2074
                "movq (%0, %%eax, 2), %%mm6                        \n\t" // L6
2075
                "paddw %%mm3, %%mm0                                \n\t"
2076
                "psadbw (%1, %%eax, 2), %%mm6                        \n\t" // |L6-R6|
2077
                "movq (%0, %%ecx), %%mm7                        \n\t" // L7
2078
                "paddw %%mm4, %%mm0                                \n\t"
2079
                "psadbw (%1, %%ecx), %%mm7                        \n\t" // |L7-R7|
2080
                "paddw %%mm5, %%mm6                                \n\t"
2081
                "paddw %%mm7, %%mm6                                \n\t"
2082
                "paddw %%mm6, %%mm0                                \n\t"
2083
#elif defined (FAST_L2_DIFF)
2084
                "pcmpeqb %%mm7, %%mm7                                \n\t"
2085
                "movq b80, %%mm6                                \n\t"
2086
                "pxor %%mm0, %%mm0                                \n\t"
2087
#define L2_DIFF_CORE(a, b)\
2088
                "movq " #a ", %%mm5                                \n\t"\
2089
                "movq " #b ", %%mm2                                \n\t"\
2090
                "pxor %%mm7, %%mm2                                \n\t"\
2091
                PAVGB(%%mm2, %%mm5)\
2092
                "paddb %%mm6, %%mm5                                \n\t"\
2093
                "movq %%mm5, %%mm2                                \n\t"\
2094
                "psllw $8, %%mm5                                \n\t"\
2095
                "pmaddwd %%mm5, %%mm5                                \n\t"\
2096
                "pmaddwd %%mm2, %%mm2                                \n\t"\
2097
                "paddd %%mm2, %%mm5                                \n\t"\
2098
                "psrld $14, %%mm5                                \n\t"\
2099
                "paddd %%mm5, %%mm0                                \n\t"
2100

    
2101
L2_DIFF_CORE((%0), (%1))
2102
L2_DIFF_CORE((%0, %2), (%1, %2))
2103
L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
2104
L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
2105
L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
2106
L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx))
2107
L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
2108
L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
2109

    
2110
#else
2111
                "pxor %%mm7, %%mm7                                \n\t"
2112
                "pxor %%mm0, %%mm0                                \n\t"
2113
#define L2_DIFF_CORE(a, b)\
2114
                "movq " #a ", %%mm5                                \n\t"\
2115
                "movq " #b ", %%mm2                                \n\t"\
2116
                "movq %%mm5, %%mm1                                \n\t"\
2117
                "movq %%mm2, %%mm3                                \n\t"\
2118
                "punpcklbw %%mm7, %%mm5                                \n\t"\
2119
                "punpckhbw %%mm7, %%mm1                                \n\t"\
2120
                "punpcklbw %%mm7, %%mm2                                \n\t"\
2121
                "punpckhbw %%mm7, %%mm3                                \n\t"\
2122
                "psubw %%mm2, %%mm5                                \n\t"\
2123
                "psubw %%mm3, %%mm1                                \n\t"\
2124
                "pmaddwd %%mm5, %%mm5                                \n\t"\
2125
                "pmaddwd %%mm1, %%mm1                                \n\t"\
2126
                "paddd %%mm1, %%mm5                                \n\t"\
2127
                "paddd %%mm5, %%mm0                                \n\t"
2128

    
2129
L2_DIFF_CORE((%0), (%1))
2130
L2_DIFF_CORE((%0, %2), (%1, %2))
2131
L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
2132
L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
2133
L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
2134
L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx))
2135
L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
2136
L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
2137

    
2138
#endif
2139

    
2140
                "movq %%mm0, %%mm4                                \n\t"
2141
                "psrlq $32, %%mm0                                \n\t"
2142
                "paddd %%mm0, %%mm4                                \n\t"
2143
                "movd %%mm4, %%ecx                                \n\t"
2144
                "shll $2, %%ecx                                        \n\t"
2145
                "movl %3, %%ebx                                        \n\t"
2146
                "addl -4(%%ebx), %%ecx                                \n\t"
2147
                "addl 4(%%ebx), %%ecx                                \n\t"
2148
                "addl -1024(%%ebx), %%ecx                        \n\t"
2149
                "addl $4, %%ecx                                        \n\t"
2150
                "addl 1024(%%ebx), %%ecx                        \n\t"
2151
                "shrl $3, %%ecx                                        \n\t"
2152
                "movl %%ecx, (%%ebx)                                \n\t"
2153
                "leal (%%eax, %2, 2), %%ebx                        \n\t" // 5*stride
2154

    
2155
//                "movl %3, %%ecx                                \n\t"
2156
//                "movl %%ecx, test                                \n\t"
2157
//                "jmp 4f \n\t"
2158
                "cmpl 4+maxTmpNoise, %%ecx                        \n\t"
2159
                " jb 2f                                                \n\t"
2160
                "cmpl 8+maxTmpNoise, %%ecx                        \n\t"
2161
                " jb 1f                                                \n\t"
2162

    
2163
                "leal (%%ebx, %2, 2), %%ecx                        \n\t" // 7*stride
2164
                "movq (%0), %%mm0                                \n\t" // L0
2165
                "movq (%0, %2), %%mm1                                \n\t" // L1
2166
                "movq (%0, %2, 2), %%mm2                        \n\t" // L2
2167
                "movq (%0, %%eax), %%mm3                        \n\t" // L3
2168
                "movq (%0, %2, 4), %%mm4                        \n\t" // L4
2169
                "movq (%0, %%ebx), %%mm5                        \n\t" // L5
2170
                "movq (%0, %%eax, 2), %%mm6                        \n\t" // L6
2171
                "movq (%0, %%ecx), %%mm7                        \n\t" // L7
2172
                "movq %%mm0, (%1)                                \n\t" // L0
2173
                "movq %%mm1, (%1, %2)                                \n\t" // L1
2174
                "movq %%mm2, (%1, %2, 2)                        \n\t" // L2
2175
                "movq %%mm3, (%1, %%eax)                        \n\t" // L3
2176
                "movq %%mm4, (%1, %2, 4)                        \n\t" // L4
2177
                "movq %%mm5, (%1, %%ebx)                        \n\t" // L5
2178
                "movq %%mm6, (%1, %%eax, 2)                        \n\t" // L6
2179
                "movq %%mm7, (%1, %%ecx)                        \n\t" // L7
2180
                "jmp 4f                                                \n\t"
2181

    
2182
                "1:                                                \n\t"
2183
                "leal (%%ebx, %2, 2), %%ecx                        \n\t" // 7*stride
2184
                "movq (%0), %%mm0                                \n\t" // L0
2185
                "pavgb (%1), %%mm0                                \n\t" // L0
2186
                "movq (%0, %2), %%mm1                                \n\t" // L1
2187
                "pavgb (%1, %2), %%mm1                                \n\t" // L1
2188
                "movq (%0, %2, 2), %%mm2                        \n\t" // L2
2189
                "pavgb (%1, %2, 2), %%mm2                        \n\t" // L2
2190
                "movq (%0, %%eax), %%mm3                        \n\t" // L3
2191
                "pavgb (%1, %%eax), %%mm3                        \n\t" // L3
2192
                "movq (%0, %2, 4), %%mm4                        \n\t" // L4
2193
                "pavgb (%1, %2, 4), %%mm4                        \n\t" // L4
2194
                "movq (%0, %%ebx), %%mm5                        \n\t" // L5
2195
                "pavgb (%1, %%ebx), %%mm5                        \n\t" // L5
2196
                "movq (%0, %%eax, 2), %%mm6                        \n\t" // L6
2197
                "pavgb (%1, %%eax, 2), %%mm6                        \n\t" // L6
2198
                "movq (%0, %%ecx), %%mm7                        \n\t" // L7
2199
                "pavgb (%1, %%ecx), %%mm7                        \n\t" // L7
2200
                "movq %%mm0, (%1)                                \n\t" // R0
2201
                "movq %%mm1, (%1, %2)                                \n\t" // R1
2202
                "movq %%mm2, (%1, %2, 2)                        \n\t" // R2
2203
                "movq %%mm3, (%1, %%eax)                        \n\t" // R3
2204
                "movq %%mm4, (%1, %2, 4)                        \n\t" // R4
2205
                "movq %%mm5, (%1, %%ebx)                        \n\t" // R5
2206
                "movq %%mm6, (%1, %%eax, 2)                        \n\t" // R6
2207
                "movq %%mm7, (%1, %%ecx)                        \n\t" // R7
2208
                "movq %%mm0, (%0)                                \n\t" // L0
2209
                "movq %%mm1, (%0, %2)                                \n\t" // L1
2210
                "movq %%mm2, (%0, %2, 2)                        \n\t" // L2
2211
                "movq %%mm3, (%0, %%eax)                        \n\t" // L3
2212
                "movq %%mm4, (%0, %2, 4)                        \n\t" // L4
2213
                "movq %%mm5, (%0, %%ebx)                        \n\t" // L5
2214
                "movq %%mm6, (%0, %%eax, 2)                        \n\t" // L6
2215
                "movq %%mm7, (%0, %%ecx)                        \n\t" // L7
2216
                "jmp 4f                                                \n\t"
2217

    
2218
                "2:                                                \n\t"
2219
                "cmpl maxTmpNoise, %%ecx                        \n\t"
2220
                " jb 3f                                                \n\t"
2221

    
2222
                "leal (%%ebx, %2, 2), %%ecx                        \n\t" // 7*stride
2223
                "movq (%0), %%mm0                                \n\t" // L0
2224
                "movq (%0, %2), %%mm1                                \n\t" // L1
2225
                "movq (%0, %2, 2), %%mm2                        \n\t" // L2
2226
                "movq (%0, %%eax), %%mm3                        \n\t" // L3
2227
                "movq (%1), %%mm4                                \n\t" // R0
2228
                "movq (%1, %2), %%mm5                                \n\t" // R1
2229
                "movq (%1, %2, 2), %%mm6                        \n\t" // R2
2230
                "movq (%1, %%eax), %%mm7                        \n\t" // R3
2231
                PAVGB(%%mm4, %%mm0)
2232
                PAVGB(%%mm5, %%mm1)
2233
                PAVGB(%%mm6, %%mm2)
2234
                PAVGB(%%mm7, %%mm3)
2235
                PAVGB(%%mm4, %%mm0)
2236
                PAVGB(%%mm5, %%mm1)
2237
                PAVGB(%%mm6, %%mm2)
2238
                PAVGB(%%mm7, %%mm3)
2239
                "movq %%mm0, (%1)                                \n\t" // R0
2240
                "movq %%mm1, (%1, %2)                                \n\t" // R1
2241
                "movq %%mm2, (%1, %2, 2)                        \n\t" // R2
2242
                "movq %%mm3, (%1, %%eax)                        \n\t" // R3
2243
                "movq %%mm0, (%0)                                \n\t" // L0
2244
                "movq %%mm1, (%0, %2)                                \n\t" // L1
2245
                "movq %%mm2, (%0, %2, 2)                        \n\t" // L2
2246
                "movq %%mm3, (%0, %%eax)                        \n\t" // L3
2247

    
2248
                "movq (%0, %2, 4), %%mm0                        \n\t" // L4
2249
                "movq (%0, %%ebx), %%mm1                        \n\t" // L5
2250
                "movq (%0, %%eax, 2), %%mm2                        \n\t" // L6
2251
                "movq (%0, %%ecx), %%mm3                        \n\t" // L7
2252
                "movq (%1, %2, 4), %%mm4                        \n\t" // R4
2253
                "movq (%1, %%ebx), %%mm5                        \n\t" // R5
2254
                "movq (%1, %%eax, 2), %%mm6                        \n\t" // R6
2255
                "movq (%1, %%ecx), %%mm7                        \n\t" // R7
2256
                PAVGB(%%mm4, %%mm0)
2257
                PAVGB(%%mm5, %%mm1)
2258
                PAVGB(%%mm6, %%mm2)
2259
                PAVGB(%%mm7, %%mm3)
2260
                PAVGB(%%mm4, %%mm0)
2261
                PAVGB(%%mm5, %%mm1)
2262
                PAVGB(%%mm6, %%mm2)
2263
                PAVGB(%%mm7, %%mm3)
2264
                "movq %%mm0, (%1, %2, 4)                        \n\t" // R4
2265
                "movq %%mm1, (%1, %%ebx)                        \n\t" // R5
2266
                "movq %%mm2, (%1, %%eax, 2)                        \n\t" // R6
2267
                "movq %%mm3, (%1, %%ecx)                        \n\t" // R7
2268
                "movq %%mm0, (%0, %2, 4)                        \n\t" // L4
2269
                "movq %%mm1, (%0, %%ebx)                        \n\t" // L5
2270
                "movq %%mm2, (%0, %%eax, 2)                        \n\t" // L6
2271
                "movq %%mm3, (%0, %%ecx)                        \n\t" // L7
2272
                "jmp 4f                                                \n\t"
2273

    
2274
                "3:                                                \n\t"
2275
                "leal (%%ebx, %2, 2), %%ecx                        \n\t" // 7*stride
2276
                "movq (%0), %%mm0                                \n\t" // L0
2277
                "movq (%0, %2), %%mm1                                \n\t" // L1
2278
                "movq (%0, %2, 2), %%mm2                        \n\t" // L2
2279
                "movq (%0, %%eax), %%mm3                        \n\t" // L3
2280
                "movq (%1), %%mm4                                \n\t" // R0
2281
                "movq (%1, %2), %%mm5                                \n\t" // R1
2282
                "movq (%1, %2, 2), %%mm6                        \n\t" // R2
2283
                "movq (%1, %%eax), %%mm7                        \n\t" // R3
2284
                PAVGB(%%mm4, %%mm0)
2285
                PAVGB(%%mm5, %%mm1)
2286
                PAVGB(%%mm6, %%mm2)
2287
                PAVGB(%%mm7, %%mm3)
2288
                PAVGB(%%mm4, %%mm0)
2289
                PAVGB(%%mm5, %%mm1)
2290
                PAVGB(%%mm6, %%mm2)
2291
                PAVGB(%%mm7, %%mm3)
2292
                PAVGB(%%mm4, %%mm0)
2293
                PAVGB(%%mm5, %%mm1)
2294
                PAVGB(%%mm6, %%mm2)
2295
                PAVGB(%%mm7, %%mm3)
2296
                "movq %%mm0, (%1)                                \n\t" // R0
2297
                "movq %%mm1, (%1, %2)                                \n\t" // R1
2298
                "movq %%mm2, (%1, %2, 2)                        \n\t" // R2
2299
                "movq %%mm3, (%1, %%eax)                        \n\t" // R3
2300
                "movq %%mm0, (%0)                                \n\t" // L0
2301
                "movq %%mm1, (%0, %2)                                \n\t" // L1
2302
                "movq %%mm2, (%0, %2, 2)                        \n\t" // L2
2303
                "movq %%mm3, (%0, %%eax)                        \n\t" // L3
2304

    
2305
                "movq (%0, %2, 4), %%mm0                        \n\t" // L4
2306
                "movq (%0, %%ebx), %%mm1                        \n\t" // L5
2307
                "movq (%0, %%eax, 2), %%mm2                        \n\t" // L6
2308
                "movq (%0, %%ecx), %%mm3                        \n\t" // L7
2309
                "movq (%1, %2, 4), %%mm4                        \n\t" // R4
2310
                "movq (%1, %%ebx), %%mm5                        \n\t" // R5
2311
                "movq (%1, %%eax, 2), %%mm6                        \n\t" // R6
2312
                "movq (%1, %%ecx), %%mm7                        \n\t" // R7
2313
                PAVGB(%%mm4, %%mm0)
2314
                PAVGB(%%mm5, %%mm1)
2315
                PAVGB(%%mm6, %%mm2)
2316
                PAVGB(%%mm7, %%mm3)
2317
                PAVGB(%%mm4, %%mm0)
2318
                PAVGB(%%mm5, %%mm1)
2319
                PAVGB(%%mm6, %%mm2)
2320
                PAVGB(%%mm7, %%mm3)
2321
                PAVGB(%%mm4, %%mm0)
2322
                PAVGB(%%mm5, %%mm1)
2323
                PAVGB(%%mm6, %%mm2)
2324
                PAVGB(%%mm7, %%mm3)
2325
                "movq %%mm0, (%1, %2, 4)                        \n\t" // R4
2326
                "movq %%mm1, (%1, %%ebx)                        \n\t" // R5
2327
                "movq %%mm2, (%1, %%eax, 2)                        \n\t" // R6
2328
                "movq %%mm3, (%1, %%ecx)                        \n\t" // R7
2329
                "movq %%mm0, (%0, %2, 4)                        \n\t" // L4
2330
                "movq %%mm1, (%0, %%ebx)                        \n\t" // L5
2331
                "movq %%mm2, (%0, %%eax, 2)                        \n\t" // L6
2332
                "movq %%mm3, (%0, %%ecx)                        \n\t" // L7
2333

    
2334
                "4:                                                \n\t"
2335

    
2336
                :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast)
2337
                : "%eax", "%ebx", "%ecx", "memory"
2338
                );
2339
//printf("%d\n", test);
2340
#else
2341
        int y;
2342
        int d=0;
2343
        int sysd=0;
2344
        int i;
2345

    
2346
        for(y=0; y<8; y++)
2347
        {
2348
                int x;
2349
                for(x=0; x<8; x++)
2350
                {
2351
                        int ref= tempBlured[ x + y*stride ];
2352
                        int cur= src[ x + y*stride ];
2353
                        int d1=ref - cur;
2354
//                        if(x==0 || x==7) d1+= d1>>1;
2355
//                        if(y==0 || y==7) d1+= d1>>1;
2356
//                        d+= ABS(d1);
2357
                        d+= d1*d1;
2358
                        sysd+= d1;
2359
                }
2360
        }
2361
        i=d;
2362
        d=         (
2363
                4*d
2364
                +(*(tempBluredPast-256))
2365
                +(*(tempBluredPast-1))+ (*(tempBluredPast+1))
2366
                +(*(tempBluredPast+256))
2367
                +4)>>3;
2368
        *tempBluredPast=i;
2369
//        ((*tempBluredPast)*3 + d + 2)>>2;
2370

    
2371
//printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]);
2372
/*
2373
Switch between
2374
 1  0  0  0  0  0  0  (0)
2375
64 32 16  8  4  2  1  (1)
2376
64 48 36 27 20 15 11 (33) (approx)
2377
64 56 49 43 37 33 29 (200) (approx)
2378
*/
2379
        if(d > maxNoise[1])
2380
        {
2381
                if(d < maxNoise[2])
2382
                {
2383
                        for(y=0; y<8; y++)
2384
                        {
2385
                                int x;
2386
                                for(x=0; x<8; x++)
2387
                                {
2388
                                        int ref= tempBlured[ x + y*stride ];
2389
                                        int cur= src[ x + y*stride ];
2390
                                        tempBlured[ x + y*stride ]=
2391
                                        src[ x + y*stride ]=
2392
                                                (ref + cur + 1)>>1;
2393
                                }
2394
                        }
2395
                }
2396
                else
2397
                {
2398
                        for(y=0; y<8; y++)
2399
                        {
2400
                                int x;
2401
                                for(x=0; x<8; x++)
2402
                                {
2403
                                        tempBlured[ x + y*stride ]= src[ x + y*stride ];
2404
                                }
2405
                        }
2406
                }
2407
        }
2408
        else
2409
        {
2410
                if(d < maxNoise[0])
2411
                {
2412
                        for(y=0; y<8; y++)
2413
                        {
2414
                                int x;
2415
                                for(x=0; x<8; x++)
2416
                                {
2417
                                        int ref= tempBlured[ x + y*stride ];
2418
                                        int cur= src[ x + y*stride ];
2419
                                        tempBlured[ x + y*stride ]=
2420
                                        src[ x + y*stride ]=
2421
                                                (ref*7 + cur + 4)>>3;
2422
                                }
2423
                        }
2424
                }
2425
                else
2426
                {
2427
                        for(y=0; y<8; y++)
2428
                        {
2429
                                int x;
2430
                                for(x=0; x<8; x++)
2431
                                {
2432
                                        int ref= tempBlured[ x + y*stride ];
2433
                                        int cur= src[ x + y*stride ];
2434
                                        tempBlured[ x + y*stride ]=
2435
                                        src[ x + y*stride ]=
2436
                                                (ref*3 + cur + 2)>>2;
2437
                                }
2438
                        }
2439
                }
2440
        }
2441
#endif
2442
}
2443

    
2444
/* Forward declaration of postProcess; its definition appears further down in this file. */
static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2445
        QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
2446

    
2447
/**
2448
 * Copies a block from src to dst and fixes the blacklevel
2449
 * numLines must be a multiple of 4
2450
 * levelFix == 0 -> dont touch the brighness & contrast
2451
 */
2452
static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
2453
        int levelFix)
2454
{
2455
#ifndef HAVE_MMX
2456
        int i;
2457
#endif
2458
        if(levelFix)
2459
        {
2460
#ifdef HAVE_MMX
2461
                                        asm volatile(
2462
                                                "leal (%0,%2), %%eax        \n\t"
2463
                                                "leal (%1,%3), %%ebx        \n\t"
2464
                                                "movq packedYOffset, %%mm2        \n\t"
2465
                                                "movq packedYScale, %%mm3        \n\t"
2466
                                                "pxor %%mm4, %%mm4        \n\t"
2467
#ifdef HAVE_MMX2
2468
#define SCALED_CPY(src1, src2, dst1, dst2)                                        \
2469
                                                "movq " #src1 ", %%mm0        \n\t"\
2470
                                                "movq " #src1 ", %%mm5        \n\t"\
2471
                                                "movq " #src2 ", %%mm1        \n\t"\
2472
                                                "movq " #src2 ", %%mm6        \n\t"\
2473
                                                "punpcklbw %%mm0, %%mm0 \n\t"\
2474
                                                "punpckhbw %%mm5, %%mm5 \n\t"\
2475
                                                "punpcklbw %%mm1, %%mm1 \n\t"\
2476
                                                "punpckhbw %%mm6, %%mm6 \n\t"\
2477
                                                "pmulhuw %%mm3, %%mm0        \n\t"\
2478
                                                "pmulhuw %%mm3, %%mm5        \n\t"\
2479
                                                "pmulhuw %%mm3, %%mm1        \n\t"\
2480
                                                "pmulhuw %%mm3, %%mm6        \n\t"\
2481
                                                "psubw %%mm2, %%mm0        \n\t"\
2482
                                                "psubw %%mm2, %%mm5        \n\t"\
2483
                                                "psubw %%mm2, %%mm1        \n\t"\
2484
                                                "psubw %%mm2, %%mm6        \n\t"\
2485
                                                "packuswb %%mm5, %%mm0        \n\t"\
2486
                                                "packuswb %%mm6, %%mm1        \n\t"\
2487
                                                "movq %%mm0, " #dst1 "        \n\t"\
2488
                                                "movq %%mm1, " #dst2 "        \n\t"\
2489

    
2490
#else //HAVE_MMX2
2491
#define SCALED_CPY(src1, src2, dst1, dst2)                                        \
2492
                                                "movq " #src1 ", %%mm0        \n\t"\
2493
                                                "movq " #src1 ", %%mm5        \n\t"\
2494
                                                "punpcklbw %%mm4, %%mm0 \n\t"\
2495
                                                "punpckhbw %%mm4, %%mm5 \n\t"\
2496
                                                "psubw %%mm2, %%mm0        \n\t"\
2497
                                                "psubw %%mm2, %%mm5        \n\t"\
2498
                                                "movq " #src2 ", %%mm1        \n\t"\
2499
                                                "psllw $6, %%mm0        \n\t"\
2500
                                                "psllw $6, %%mm5        \n\t"\
2501
                                                "pmulhw %%mm3, %%mm0        \n\t"\
2502
                                                "movq " #src2 ", %%mm6        \n\t"\
2503
                                                "pmulhw %%mm3, %%mm5        \n\t"\
2504
                                                "punpcklbw %%mm4, %%mm1 \n\t"\
2505
                                                "punpckhbw %%mm4, %%mm6 \n\t"\
2506
                                                "psubw %%mm2, %%mm1        \n\t"\
2507
                                                "psubw %%mm2, %%mm6        \n\t"\
2508
                                                "psllw $6, %%mm1        \n\t"\
2509
                                                "psllw $6, %%mm6        \n\t"\
2510
                                                "pmulhw %%mm3, %%mm1        \n\t"\
2511
                                                "pmulhw %%mm3, %%mm6        \n\t"\
2512
                                                "packuswb %%mm5, %%mm0        \n\t"\
2513
                                                "packuswb %%mm6, %%mm1        \n\t"\
2514
                                                "movq %%mm0, " #dst1 "        \n\t"\
2515
                                                "movq %%mm1, " #dst2 "        \n\t"\
2516

    
2517
#endif //!HAVE_MMX2
2518

    
2519
SCALED_CPY((%0)       , (%0, %2)      , (%1)       , (%1, %3))
2520
SCALED_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%ebx, %3, 2))
2521
SCALED_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%ebx, %3, 4))
2522
                                                "leal (%%eax,%2,4), %%eax        \n\t"
2523
                                                "leal (%%ebx,%3,4), %%ebx        \n\t"
2524
SCALED_CPY((%%eax, %2), (%%eax, %2, 2), (%%ebx, %3), (%%ebx, %3, 2))
2525

    
2526

    
2527
                                                : : "r"(src),
2528
                                                "r"(dst),
2529
                                                "r" (srcStride),
2530
                                                "r" (dstStride)
2531
                                                : "%eax", "%ebx"
2532
                                        );
2533
#else
2534
                                for(i=0; i<8; i++)
2535
                                        memcpy(        &(dst[dstStride*i]),
2536
                                                &(src[srcStride*i]), BLOCK_SIZE);
2537
#endif
2538
        }
2539
        else
2540
        {
2541
#ifdef HAVE_MMX
2542
                                        asm volatile(
2543
                                                "leal (%0,%2), %%eax        \n\t"
2544
                                                "leal (%1,%3), %%ebx        \n\t"
2545

    
2546
#define SIMPLE_CPY(src1, src2, dst1, dst2)                                \
2547
                                                "movq " #src1 ", %%mm0        \n\t"\
2548
                                                "movq " #src2 ", %%mm1        \n\t"\
2549
                                                "movq %%mm0, " #dst1 "        \n\t"\
2550
                                                "movq %%mm1, " #dst2 "        \n\t"\
2551

    
2552
SIMPLE_CPY((%0)       , (%0, %2)      , (%1)       , (%1, %3))
2553
SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%ebx, %3, 2))
2554
SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%ebx, %3, 4))
2555
                                                "leal (%%eax,%2,4), %%eax        \n\t"
2556
                                                "leal (%%ebx,%3,4), %%ebx        \n\t"
2557
SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%ebx, %3), (%%ebx, %3, 2))
2558

    
2559
                                                : : "r" (src),
2560
                                                "r" (dst),
2561
                                                "r" (srcStride),
2562
                                                "r" (dstStride)
2563
                                                : "%eax", "%ebx"
2564
                                        );
2565
#else
2566
                                for(i=0; i<8; i++)
2567
                                        memcpy(        &(dst[dstStride*i]),
2568
                                                &(src[srcStride*i]), BLOCK_SIZE);
2569
#endif
2570
        }
2571
}
2572

    
2573

    
2574
/**
2575
 * Filters array of bytes (Y or U or V values)
2576
 */
2577
static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2578
        QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode)
2579
{
2580
        int x,y;
2581
#ifdef COMPILE_TIME_MODE
2582
        const int mode= COMPILE_TIME_MODE;
2583
#else
2584
        const int mode= isColor ? ppMode->chromMode : ppMode->lumMode;
2585
#endif
2586
        /* we need 64 bits here, otherwise we are going to have a problem
2587
           after watching a black picture for 5 hours*/
2588
        static uint64_t *yHistogram= NULL;
2589
        int black=0, white=255; // blackest black and whitest white in the picture
2590
        int QPCorrecture= 256;
2591

    
2592
        /* Temporary buffers for handling the last row(s) */
2593
        static uint8_t *tempDst= NULL;
2594
        static uint8_t *tempSrc= NULL;
2595

    
2596
        /* Temporary buffers for handling the last block */
2597
        static uint8_t *tempDstBlock= NULL;
2598
        static uint8_t *tempSrcBlock= NULL;
2599

    
2600
        /* Temporal noise reducing buffers */
2601
        static uint8_t *tempBlured[3]= {NULL,NULL,NULL};
2602
        static uint32_t *tempBluredPast[3]= {NULL,NULL,NULL};
2603

    
2604
        int copyAhead;
2605

    
2606
#ifdef PP_FUNNY_STRIDE
2607
        uint8_t *dstBlockPtrBackup;
2608
        uint8_t *srcBlockPtrBackup;
2609
#endif
2610

    
2611
#ifdef MORE_TIMING
2612
        long long T0, T1, diffTime=0;
2613
#endif
2614
#ifdef TIMING
2615
        long long memcpyTime=0, vertTime=0, horizTime=0, sumTime;
2616
        sumTime= rdtsc();
2617
#endif
2618

    
2619
        dcOffset= ppMode->maxDcDiff;
2620
        dcThreshold= ppMode->maxDcDiff*2 + 1;
2621

    
2622
#ifdef HAVE_MMX
2623
        maxTmpNoise[0]= ppMode->maxTmpNoise[0];
2624
        maxTmpNoise[1]= ppMode->maxTmpNoise[1];
2625
        maxTmpNoise[2]= ppMode->maxTmpNoise[2];
2626
        
2627
        mmxDCOffset= 0x7F - dcOffset;
2628
        mmxDCThreshold= 0x7F - dcThreshold;
2629

    
2630
        mmxDCOffset*= 0x0101010101010101LL;
2631
        mmxDCThreshold*= 0x0101010101010101LL;
2632
#endif
2633

    
2634
        if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
2635
        else if(mode & LINEAR_BLEND_DEINT_FILTER) copyAhead=14;
2636
        else if(   (mode & V_DEBLOCK)
2637
                || (mode & LINEAR_IPOL_DEINT_FILTER)
2638
                || (mode & MEDIAN_DEINT_FILTER)) copyAhead=13;
2639
        else if(mode & V_X1_FILTER) copyAhead=11;
2640
        else if(mode & V_RK1_FILTER) copyAhead=10;
2641
        else if(mode & DERING) copyAhead=9;
2642
        else copyAhead=8;
2643

    
2644
        copyAhead-= 8;
2645

    
2646
        if(tempDst==NULL)
2647
        {
2648
                tempDst= (uint8_t*)memalign(8, 1024*24);
2649
                tempSrc= (uint8_t*)memalign(8, 1024*24);
2650
                tempDstBlock= (uint8_t*)memalign(8, 1024*24);
2651
                tempSrcBlock= (uint8_t*)memalign(8, 1024*24);
2652
        }
2653

    
2654
        if(tempBlured[isColor]==NULL && (mode & TEMP_NOISE_FILTER))
2655
        {
2656
//                printf("%d %d %d\n", isColor, dstStride, height);
2657
                //FIXME works only as long as the size doesn't increase
2658
                //Note: the +17*1024 is just there so I don't have to worry about reads/writes past the end
2659
                tempBlured[isColor]= (uint8_t*)memalign(8, dstStride*((height+7)&(~7)) + 17*1024);
2660
                tempBluredPast[isColor]= (uint32_t*)memalign(8, 256*((height+7)&(~7))/2 + 17*1024);
2661

    
2662
                memset(tempBlured[isColor], 0, dstStride*((height+7)&(~7)) + 17*1024);
2663
                memset(tempBluredPast[isColor], 0, 256*((height+7)&(~7))/2 + 17*1024);
2664
        }
2665

    
2666
        if(!yHistogram)
2667
        {
2668
                int i;
2669
                yHistogram= (uint64_t*)malloc(8*256);
2670
                for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
2671

    
2672
                if(mode & FULL_Y_RANGE)
2673
                {
2674
                        ppMode->maxAllowedY=255;
2675
                        ppMode->minAllowedY=0;
2676
                }
2677
        }
2678

    
2679
        if(!isColor)
2680
        {
2681
                uint64_t sum= 0;
2682
                int i;
2683
                static int framenum= -1;
2684
                uint64_t maxClipped;
2685
                uint64_t clipped;
2686
                double scale;
2687

    
2688
                framenum++;
2689
                if(framenum == 1) yHistogram[0]= width*height/64*15/256;
2690

    
2691
                for(i=0; i<256; i++)
2692
                {
2693
                        sum+= yHistogram[i];
2694
//                        printf("%d ", yHistogram[i]);
2695
                }
2696
//                printf("\n\n");
2697

    
2698
                /* we always get a completely black picture first */
2699
                maxClipped= (uint64_t)(sum * maxClippedThreshold);
2700

    
2701
                clipped= sum;
2702
                for(black=255; black>0; black--)
2703
                {
2704
                        if(clipped < maxClipped) break;
2705
                        clipped-= yHistogram[black];
2706
                }
2707

    
2708
                clipped= sum;
2709
                for(white=0; white<256; white++)
2710
                {
2711
                        if(clipped < maxClipped) break;
2712
                        clipped-= yHistogram[white];
2713
                }
2714

    
2715
                scale= (double)(ppMode->maxAllowedY - ppMode->minAllowedY) / (double)(white-black);
2716

    
2717
#ifdef HAVE_MMX2
2718
                packedYScale= (uint16_t)(scale*256.0 + 0.5);
2719
                packedYOffset= (((black*packedYScale)>>8) - ppMode->minAllowedY) & 0xFFFF;
2720
#else
2721
                packedYScale= (uint16_t)(scale*1024.0 + 0.5);
2722
                packedYOffset= (black - ppMode->minAllowedY) & 0xFFFF;
2723
#endif
2724

    
2725
                packedYOffset|= packedYOffset<<32;
2726
                packedYOffset|= packedYOffset<<16;
2727

    
2728
                packedYScale|= packedYScale<<32;
2729
                packedYScale|= packedYScale<<16;
2730
        }
2731
        else
2732
        {
2733
                packedYScale= 0x0100010001000100LL;
2734
                packedYOffset= 0;
2735
        }
2736

    
2737
        if(mode & LEVEL_FIX)        QPCorrecture= packedYScale &0xFFFF;
2738
        else                        QPCorrecture= 256;
2739

    
2740
        /* copy & deinterlace first row of blocks */
2741
        y=-BLOCK_SIZE;
2742
        {
2743
                //1% speedup if these are here instead of the inner loop
2744
                uint8_t *srcBlock= &(src[y*srcStride]);
2745
                uint8_t *dstBlock= &(dst[y*dstStride]);
2746

    
2747
                dstBlock= tempDst + dstStride;
2748

    
2749
                // From this point on it is guaranteed that we can read and write 16 lines downward
2750
                // finish 1 block before the next, otherwise we might have a problem
2751
                // with the L1 Cache of the P4 ... or only a few blocks at a time or something
2752
                for(x=0; x<width; x+=BLOCK_SIZE)
2753
                {
2754

    
2755
#ifdef HAVE_MMX2
2756
/*
2757
                        prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
2758
                        prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
2759
                        prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
2760
                        prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
2761
*/
2762

    
2763
                        asm(
2764
                                "movl %4, %%eax                        \n\t"
2765
                                "shrl $2, %%eax                        \n\t"
2766
                                "andl $6, %%eax                        \n\t"
2767
                                "addl %5, %%eax                        \n\t"
2768
                                "movl %%eax, %%ebx                \n\t"
2769
                                "imul %1, %%eax                        \n\t"
2770
                                "imul %3, %%ebx                        \n\t"
2771
                                "prefetchnta 32(%%eax, %0)        \n\t"
2772
                                "prefetcht0 32(%%ebx, %2)        \n\t"
2773
                                "addl %1, %%eax                        \n\t"
2774
                                "addl %3, %%ebx                        \n\t"
2775
                                "prefetchnta 32(%%eax, %0)        \n\t"
2776
                                "prefetcht0 32(%%ebx, %2)        \n\t"
2777
                        :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
2778
                        "m" (x), "m" (copyAhead)
2779
                        : "%eax", "%ebx"
2780
                        );
2781

    
2782
#elif defined(HAVE_3DNOW)
2783
//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
2784
/*                        prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
2785
                        prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
2786
                        prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
2787
                        prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
2788
*/
2789
#endif
2790

    
2791
                        RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
2792
                                srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX);
2793

    
2794
                        if(mode & LINEAR_IPOL_DEINT_FILTER)
2795
                                RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
2796
                        else if(mode & LINEAR_BLEND_DEINT_FILTER)
2797
                                RENAME(deInterlaceBlendLinear)(dstBlock, dstStride);
2798
                        else if(mode & MEDIAN_DEINT_FILTER)
2799
                                RENAME(deInterlaceMedian)(dstBlock, dstStride);
2800
                        else if(mode & CUBIC_IPOL_DEINT_FILTER)
2801
                                RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
2802
/*                        else if(mode & CUBIC_BLEND_DEINT_FILTER)
2803
                                RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
2804
*/
2805
                        dstBlock+=8;
2806
                        srcBlock+=8;
2807
                }
2808
                memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, copyAhead*dstStride );
2809
        }
2810

    
2811
        for(y=0; y<height; y+=BLOCK_SIZE)
2812
        {
2813
                //1% speedup if these are here instead of the inner loop
2814
                uint8_t *srcBlock= &(src[y*srcStride]);
2815
                uint8_t *dstBlock= &(dst[y*dstStride]);
2816
#ifdef HAVE_MMX
2817
                uint8_t *tempBlock1= tempBlocks;
2818
                uint8_t *tempBlock2= tempBlocks + 8;
2819
#endif
2820
#ifdef ARCH_X86
2821
                int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
2822
                int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4);
2823
                int QPFrac= QPDelta;
2824
#endif
2825
                int QP=0;
2826
                /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
2827
                   if not, then use a temporary buffer */
2828
                if(y+15 >= height)
2829
                {
2830
                        int i;
2831
                        /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
2832
                           blockcopy to dst later */
2833
                        memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
2834
                                srcStride*MAX(height-y-copyAhead, 0) );
2835

    
2836
                        /* duplicate last line of src to fill the void up to line (copyAhead+7) */
2837
                        for(i=MAX(height-y, 8); i<copyAhead+8; i++)
2838
                                memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride);
2839

    
2840
                        /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
2841
                        memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) );
2842

    
2843
                        /* duplicate last line of dst to fill the void up to line (copyAhead) */
2844
                        for(i=height-y+1; i<=copyAhead; i++)
2845
                                memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride);
2846

    
2847
                        dstBlock= tempDst + dstStride;
2848
                        srcBlock= tempSrc;
2849
                }
2850

    
2851
                // From this point on it is guaranteed that we can read and write 16 lines downward
2852
                // finish 1 block before the next, otherwise we might have a problem
2853
                // with the L1 Cache of the P4 ... or only a few blocks at a time or something
2854
                for(x=0; x<width; x+=BLOCK_SIZE)
2855
                {
2856
                        const int stride= dstStride;
2857
#ifdef HAVE_MMX
2858
                        uint8_t *tmpXchg;
2859
#endif
2860
#ifdef ARCH_X86
2861
                        QP= *QPptr;
2862
                        asm volatile(
2863
                                "addl %2, %1                \n\t"
2864
                                "sbbl %%eax, %%eax        \n\t"
2865
                                "shll $2, %%eax                \n\t"
2866
                                "subl %%eax, %0                \n\t"
2867
                                : "+r" (QPptr), "+m" (QPFrac)
2868
                                : "r" (QPDelta)
2869
                                : "%eax"
2870
                        );
2871
#else
2872
                        QP= isColor ?
2873
                                QPs[(y>>3)*QPStride + (x>>3)]:
2874
                                QPs[(y>>4)*QPStride + (x>>4)];
2875
#endif
2876
                        if(!isColor)
2877
                        {
2878
                                QP= (QP* QPCorrecture)>>8;
2879
                                yHistogram[ srcBlock[srcStride*12 + 4] ]++;
2880
                        }
2881
#ifdef HAVE_MMX
2882
                        asm volatile(
2883
                                "movd %0, %%mm7                                        \n\t"
2884
                                "packuswb %%mm7, %%mm7                                \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
2885
                                "packuswb %%mm7, %%mm7                                \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
2886
                                "packuswb %%mm7, %%mm7                                \n\t" // QP,..., QP
2887
                                "movq %%mm7, pQPb                                \n\t"
2888
                                : : "r" (QP)
2889
                        );
2890
#endif
2891

    
2892
#ifdef MORE_TIMING
2893
                        T0= rdtsc();
2894
#endif
2895

    
2896
#ifdef HAVE_MMX2
2897
/*
2898
                        prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
2899
                        prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
2900
                        prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
2901
                        prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
2902
*/
2903

    
2904
                        asm(
2905
                                "movl %4, %%eax                        \n\t"
2906
                                "shrl $2, %%eax                        \n\t"
2907
                                "andl $6, %%eax                        \n\t"
2908
                                "addl %5, %%eax                        \n\t"
2909
                                "movl %%eax, %%ebx                \n\t"
2910
                                "imul %1, %%eax                        \n\t"
2911
                                "imul %3, %%ebx                        \n\t"
2912
                                "prefetchnta 32(%%eax, %0)        \n\t"
2913
                                "prefetcht0 32(%%ebx, %2)        \n\t"
2914
                                "addl %1, %%eax                        \n\t"
2915
                                "addl %3, %%ebx                        \n\t"
2916
                                "prefetchnta 32(%%eax, %0)        \n\t"
2917
                                "prefetcht0 32(%%ebx, %2)        \n\t"
2918
                        :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
2919
                        "m" (x), "m" (copyAhead)
2920
                        : "%eax", "%ebx"
2921
                        );
2922

    
2923
#elif defined(HAVE_3DNOW)
2924
//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
2925
/*                        prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
2926
                        prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
2927
                        prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
2928
                        prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
2929
*/
2930
#endif
2931

    
2932
#ifdef PP_FUNNY_STRIDE
2933
                        //can we mess with a 8x16 block, if not use a temp buffer, yes again
2934
                        if(x+7 >= width)
2935
                        {
2936
                                int i;
2937
                                dstBlockPtrBackup= dstBlock;
2938
                                srcBlockPtrBackup= srcBlock;
2939

    
2940
                                for(i=0;i<BLOCK_SIZE*2; i++)
2941
                                {
2942
                                        memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x);
2943
                                        memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x);
2944
                                }
2945

    
2946
                                dstBlock= tempDstBlock;
2947
                                srcBlock= tempSrcBlock;
2948
                        }
2949
#endif
2950

    
2951
                        RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
2952
                                srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX);
2953

    
2954
                        if(mode & LINEAR_IPOL_DEINT_FILTER)
2955
                                RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
2956
                        else if(mode & LINEAR_BLEND_DEINT_FILTER)
2957
                                RENAME(deInterlaceBlendLinear)(dstBlock, dstStride);
2958
                        else if(mode & MEDIAN_DEINT_FILTER)
2959
                                RENAME(deInterlaceMedian)(dstBlock, dstStride);
2960
                        else if(mode & CUBIC_IPOL_DEINT_FILTER)
2961
                                RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
2962
/*                        else if(mode & CUBIC_BLEND_DEINT_FILTER)
2963
                                RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
2964
*/
2965

    
2966
                        /* only deblock if we have 2 blocks */
2967
                        if(y + 8 < height)
2968
                        {
2969
#ifdef MORE_TIMING
2970
                                T1= rdtsc();
2971
                                memcpyTime+= T1-T0;
2972
                                T0=T1;
2973
#endif
2974
                                if(mode & V_RK1_FILTER)
2975
                                        RENAME(vertRK1Filter)(dstBlock, stride, QP);
2976
                                else if(mode & V_X1_FILTER)
2977
                                        RENAME(vertX1Filter)(dstBlock, stride, QP);
2978
                                else if(mode & V_DEBLOCK)
2979
                                {
2980
                                        if( RENAME(isVertDC)(dstBlock, stride))
2981
                                        {
2982
                                                if(RENAME(isVertMinMaxOk)(dstBlock, stride, QP))
2983
                                                        RENAME(doVertLowPass)(dstBlock, stride, QP);
2984
                                        }
2985
                                        else
2986
                                                RENAME(doVertDefFilter)(dstBlock, stride, QP);
2987
                                }
2988
#ifdef MORE_TIMING
2989
                                T1= rdtsc();
2990
                                vertTime+= T1-T0;
2991
                                T0=T1;
2992
#endif
2993
                        }
2994

    
2995
#ifdef HAVE_MMX
2996
                        RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
2997
#endif
2998
                        /* check if we have a previous block to deblock it with dstBlock */
2999
                        if(x - 8 >= 0)
3000
                        {
3001
#ifdef MORE_TIMING
3002
                                T0= rdtsc();
3003
#endif
3004
#ifdef HAVE_MMX
3005
                                if(mode & H_RK1_FILTER)
3006
                                        RENAME(vertRK1Filter)(tempBlock1, 16, QP);
3007
                                else if(mode & H_X1_FILTER)
3008
                                        RENAME(vertX1Filter)(tempBlock1, 16, QP);
3009
                                else if(mode & H_DEBLOCK)
3010
                                {
3011
                                        if( RENAME(isVertDC)(tempBlock1, 16) )
3012
                                        {
3013
                                                if(RENAME(isVertMinMaxOk)(tempBlock1, 16, QP))
3014
                                                        RENAME(doVertLowPass)(tempBlock1, 16, QP);
3015
                                        }
3016
                                        else
3017
                                                RENAME(doVertDefFilter)(tempBlock1, 16, QP);
3018
                                }
3019

    
3020
                                RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
3021

    
3022
#else
3023
                                if(mode & H_X1_FILTER)
3024
                                        horizX1Filter(dstBlock-4, stride, QP);
3025
                                else if(mode & H_DEBLOCK)
3026
                                {
3027
                                        if( isHorizDC(dstBlock-4, stride))
3028
                                        {
3029
                                                if(isHorizMinMaxOk(dstBlock-4, stride, QP))
3030
                                                        doHorizLowPass(dstBlock-4, stride, QP);
3031
                                        }
3032
                                        else
3033
                                                doHorizDefFilter(dstBlock-4, stride, QP);
3034
                                }
3035
#endif
3036
#ifdef MORE_TIMING
3037
                                T1= rdtsc();
3038
                                horizTime+= T1-T0;
3039
                                T0=T1;
3040
#endif
3041
                                if(mode & DERING)
3042
                                {
3043
                                //FIXME filter first line
3044
                                        if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, QP);
3045
                                }
3046

    
3047
                                if(mode & TEMP_NOISE_FILTER)
3048
                                {
3049
                                        RENAME(tempNoiseReducer)(dstBlock-8, stride,
3050
                                                tempBlured[isColor] + y*dstStride + x,
3051
                                                tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3052
                                                ppMode->maxTmpNoise);
3053
                                }
3054
                        }
3055

    
3056
#ifdef PP_FUNNY_STRIDE
3057
                        /* did we use a tmp-block buffer */
3058
                        if(x+7 >= width)
3059
                        {
3060
                                int i;
3061
                                dstBlock= dstBlockPtrBackup;
3062
                                srcBlock= srcBlockPtrBackup;
3063

    
3064
                                for(i=0;i<BLOCK_SIZE*2; i++)
3065
                                {
3066
                                        memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x);
3067
                                }
3068
                        }
3069
#endif
3070

    
3071
                        dstBlock+=8;
3072
                        srcBlock+=8;
3073

    
3074
#ifdef HAVE_MMX
3075
                        tmpXchg= tempBlock1;
3076
                        tempBlock1= tempBlock2;
3077
                        tempBlock2 = tmpXchg;
3078
#endif
3079
                }
3080

    
3081
                if(mode & DERING)
3082
                {
3083
                                if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, QP);
3084
                }
3085

    
3086
                if((mode & TEMP_NOISE_FILTER))
3087
                {
3088
                        RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
3089
                                tempBlured[isColor] + y*dstStride + x,
3090
                                tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3091
                                ppMode->maxTmpNoise);
3092
                }
3093

    
3094
                /* did we use a tmp buffer for the last lines*/
3095
                if(y+15 >= height)
3096
                {
3097
                        uint8_t *dstBlock= &(dst[y*dstStride]);
3098
                        memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
3099
                }
3100
/*
3101
                for(x=0; x<width; x+=32)
3102
                {
3103
                        volatile int i;
3104
                        i+=        + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
3105
                                + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
3106
                                + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
3107
//                                + dstBlock[x +13*dstStride]
3108
//                                + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
3109
                }*/
3110
        }
3111
#ifdef HAVE_3DNOW
3112
        asm volatile("femms");
3113
#elif defined (HAVE_MMX)
3114
        asm volatile("emms");
3115
#endif
3116

    
3117
#ifdef TIMING
3118
        // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
3119
        sumTime= rdtsc() - sumTime;
3120
        if(!isColor)
3121
                printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d    \r",
3122
                        (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
3123
                        (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
3124
                        , black, white);
3125
#endif
3126
#ifdef DEBUG_BRIGHTNESS
3127
        if(!isColor)
3128
        {
3129
                int max=1;
3130
                int i;
3131
                for(i=0; i<256; i++)
3132
                        if(yHistogram[i] > max) max=yHistogram[i];
3133

    
3134
                for(i=1; i<256; i++)
3135
                {
3136
                        int x;
3137
                        int start=yHistogram[i-1]/(max/256+1);
3138
                        int end=yHistogram[i]/(max/256+1);
3139
                        int inc= end > start ? 1 : -1;
3140
                        for(x=start; x!=end+inc; x+=inc)
3141
                                dst[ i*dstStride + x]+=128;
3142
                }
3143

    
3144
                for(i=0; i<100; i+=2)
3145
                {
3146
                        dst[ (white)*dstStride + i]+=128;
3147
                        dst[ (black)*dstStride + i]+=128;
3148
                }
3149

    
3150
        }
3151
#endif
3152

    
3153
}