Statistics
| Branch: | Revision:

ffmpeg / postproc / postprocess_template.c @ 4407a3c4

History | View | Annotate | Download (90.6 KB)

1
/*
2
    Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
3

4
    This program is free software; you can redistribute it and/or modify
5
    it under the terms of the GNU General Public License as published by
6
    the Free Software Foundation; either version 2 of the License, or
7
    (at your option) any later version.
8

9
    This program is distributed in the hope that it will be useful,
10
    but WITHOUT ANY WARRANTY; without even the implied warranty of
11
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
    GNU General Public License for more details.
13

14
    You should have received a copy of the GNU General Public License
15
    along with this program; if not, write to the Free Software
16
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17
*/
18

    
19
/*
 * Helper macros that expand to inline-asm template fragments (string
 * literals ending in "\n\t"). The arguments are stringified verbatim, so pass
 * the operands exactly as they have to appear in the asm template
 * (e.g. %%mm0 or (%0, %1)). In all of them the LAST data operand is the
 * destination.
 *
 * They are #undef'ed first so that a previous definition (made for another
 * HAVE_* selection) never leaks into this one.
 */
#undef PAVGB
#undef PMINUB
#undef PMAXUB

/*
 * PAVGB(a,b): b = byte-wise average of a and b.
 * There is no plain-MMX fallback: without MMX2 or 3DNow! the macro stays
 * undefined, so it may only be used in code guarded by one of the two.
 */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/*
 * PMINUB(a,b,t): b = unsigned byte-wise MIN(a, b).
 * t is a scratch register: clobbered by the MMX fallback, unused with MMX2.
 * Fallback: t = b; t = saturate(t - a); b -= t.
 */
#ifdef HAVE_MMX2
#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
#define PMINUB(a,b,t) \
        "movq " #b ", " #t " \n\t"\
        "psubusb " #a ", " #t " \n\t"\
        "psubb " #t ", " #b " \n\t"
#endif

/*
 * PMAXUB(a,b): b = unsigned byte-wise MAX(a, b).
 * Fallback: b = saturate(b - a); b += a.
 */
#ifdef HAVE_MMX2
#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
#define PMAXUB(a,b) \
        "psubusb " #a ", " #b " \n\t"\
        "paddb " #a ", " #b " \n\t"
#endif
45

    
46

    
47
//FIXME? with wrapping byte arithmetic |255-0| is seen as 1 (shouldn't be a problem ...)
48
#ifdef HAVE_MMX
49
/**
50
 * Check if the middle 8x8 Block in the given 8x16 block is flat
51
 */
52
static inline int RENAME(isVertDC)(uint8_t src[], int stride, PPContext *c){
53
        int numEq= 0;
54
        src+= stride*4; // src points to begin of the 8x8 Block
55
asm volatile(
56
                "leal (%1, %2), %%eax                                \n\t"
57
//        0        1        2        3        4        5        6        7        8        9
58
//        %1        eax        eax+%2        eax+2%2        %1+4%2        ecx        ecx+%2        ecx+2%2        %1+8%2        ecx+4%2
59
                "movq %3, %%mm7                                        \n\t" 
60
                "movq %4, %%mm6                                        \n\t" 
61

    
62
                "movq (%1), %%mm0                                \n\t"
63
                "movq (%%eax), %%mm1                                \n\t"
64
                "psubb %%mm1, %%mm0                                \n\t" // mm0 = differnece
65
                "paddb %%mm7, %%mm0                                \n\t"
66
                "pcmpgtb %%mm6, %%mm0                                \n\t"
67

    
68
                "movq (%%eax,%2), %%mm2                                \n\t"
69
                "psubb %%mm2, %%mm1                                \n\t"
70
                "paddb %%mm7, %%mm1                                \n\t"
71
                "pcmpgtb %%mm6, %%mm1                                \n\t"
72
                "paddb %%mm1, %%mm0                                \n\t"
73

    
74
                "movq (%%eax, %2, 2), %%mm1                        \n\t"
75
                "psubb %%mm1, %%mm2                                \n\t"
76
                "paddb %%mm7, %%mm2                                \n\t"
77
                "pcmpgtb %%mm6, %%mm2                                \n\t"
78
                "paddb %%mm2, %%mm0                                \n\t"
79
                
80
                "leal (%%eax, %2, 4), %%eax                        \n\t"
81

    
82
                "movq (%1, %2, 4), %%mm2                        \n\t"
83
                "psubb %%mm2, %%mm1                                \n\t"
84
                "paddb %%mm7, %%mm1                                \n\t"
85
                "pcmpgtb %%mm6, %%mm1                                \n\t"
86
                "paddb %%mm1, %%mm0                                \n\t"
87

    
88
                "movq (%%eax), %%mm1                                \n\t"
89
                "psubb %%mm1, %%mm2                                \n\t"
90
                "paddb %%mm7, %%mm2                                \n\t"
91
                "pcmpgtb %%mm6, %%mm2                                \n\t"
92
                "paddb %%mm2, %%mm0                                \n\t"
93

    
94
                "movq (%%eax, %2), %%mm2                        \n\t"
95
                "psubb %%mm2, %%mm1                                \n\t"
96
                "paddb %%mm7, %%mm1                                \n\t"
97
                "pcmpgtb %%mm6, %%mm1                                \n\t"
98
                "paddb %%mm1, %%mm0                                \n\t"
99

    
100
                "movq (%%eax, %2, 2), %%mm1                        \n\t"
101
                "psubb %%mm1, %%mm2                                \n\t"
102
                "paddb %%mm7, %%mm2                                \n\t"
103
                "pcmpgtb %%mm6, %%mm2                                \n\t"
104
                "paddb %%mm2, %%mm0                                \n\t"
105

    
106
                "                                                \n\t"
107
#ifdef HAVE_MMX2
108
                "pxor %%mm7, %%mm7                                \n\t"
109
                "psadbw %%mm7, %%mm0                                \n\t"
110
#else
111
                "movq %%mm0, %%mm1                                \n\t"
112
                "psrlw $8, %%mm0                                \n\t"
113
                "paddb %%mm1, %%mm0                                \n\t"
114
                "movq %%mm0, %%mm1                                \n\t"
115
                "psrlq $16, %%mm0                                \n\t"
116
                "paddb %%mm1, %%mm0                                \n\t"
117
                "movq %%mm0, %%mm1                                \n\t"
118
                "psrlq $32, %%mm0                                \n\t"
119
                "paddb %%mm1, %%mm0                                \n\t"
120
#endif
121
                "movd %%mm0, %0                                        \n\t"
122
                : "=r" (numEq)
123
                : "r" (src), "r" (stride), "m" (c->mmxDcOffset[c->nonBQP]),  "m" (c->mmxDcThreshold[c->nonBQP])
124
                : "%eax"
125
                );
126
        numEq= (-numEq) &0xFF;
127
        return numEq > c->ppMode.flatnessThreshold;
128
}
129
#endif
130

    
131
/**
 * Check that line 4 and line 11 of the block (counted from src, line 0 being
 * src itself) do not differ by more than 2*QP in any of the 8 columns.
 *
 * The MMX version takes QP from c->pQPb (QP replicated into every byte) and
 * doubles it with saturation; the C version takes it from c->QP.
 *
 * @param src    top of the block
 * @param stride offset in bytes between two vertically adjacent lines
 * @param c      context; reads pQPb (MMX) or QP (C)
 * @return 1 if all 8 absolute differences are <= 2*QP, 0 otherwise
 */
static inline int RENAME(isVertMinMaxOk)(uint8_t src[], int stride, PPContext *c)
{
#ifdef HAVE_MMX
        int isOk;
        src+= stride*3;
        asm volatile(
                "movq (%1, %2), %%mm0                                \n\t"
                "movq (%1, %2, 8), %%mm1                        \n\t"
                "movq %%mm0, %%mm2                                \n\t"
                "psubusb %%mm1, %%mm0                                \n\t"
                "psubusb %%mm2, %%mm1                                \n\t"
                "por %%mm1, %%mm0                                \n\t" // ABS Diff

                "movq %3, %%mm7                                        \n\t" // QP,..., QP
                "paddusb %%mm7, %%mm7                                \n\t" // 2QP ... 2QP
                "psubusb %%mm7, %%mm0                                \n\t" // Diff <= 2QP -> 0
                "packssdw %%mm0, %%mm0                                \n\t" // fold both halves into the low 32 bit
                "movd %%mm0, %0                                        \n\t"
                : "=r" (isOk)
                : "r" (src), "r" (stride), "m" (c->pQPb)
                // "memory": the pixels are read through %1, which the
                // compiler cannot see from the operand list
                : "memory"
                );
        return isOk==0;
#else
#if 1
        int x;
        const int QP= c->QP;
        src+= stride*3;
        for(x=0; x<BLOCK_SIZE; x++)
        {
                // single unsigned compare for -2QP <= diff <= 2QP;
                // stride*8 instead of stride<<3 so that a negative stride is well defined
                if((unsigned)(src[x + stride] - src[x + stride*8] + 2*QP) > (unsigned)(4*QP)) return 0;
        }

        return 1;
#else
        // disabled alternative: min/max over all 8 lines of the middle 8x8 block
        int x;
        const int QP= c->QP;
        src+= stride*4;
        for(x=0; x<BLOCK_SIZE; x++)
        {
                int min=255;
                int max=0;
                int y;
                for(y=0; y<8; y++){
                        int v= src[x + y*stride];
                        if(v>max) max=v;
                        if(v<min) min=v;
                }
                if(max-min > 2*QP) return 0;
        }
        return 1;
#endif
#endif
}
184

    
185
/**
186
 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
187
 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
188
 */
189
static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
190
{
191
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
192
        src+= stride*3;
193
        asm volatile(        //"movv %0 %1 %2\n\t"
194
                "movq %2, %%mm0                        \n\t"  // QP,..., QP
195
                "pxor %%mm4, %%mm4                                \n\t"
196

    
197
                "movq (%0), %%mm6                                \n\t"
198
                "movq (%0, %1), %%mm5                                \n\t"
199
                "movq %%mm5, %%mm1                                \n\t"
200
                "movq %%mm6, %%mm2                                \n\t"
201
                "psubusb %%mm6, %%mm5                                \n\t"
202
                "psubusb %%mm1, %%mm2                                \n\t"
203
                "por %%mm5, %%mm2                                \n\t" // ABS Diff of lines
204
                "psubusb %%mm0, %%mm2                                \n\t" // diff <= QP -> 0
205
                "pcmpeqb %%mm4, %%mm2                        \n\t" // diff <= QP -> FF
206

    
207
                "pand %%mm2, %%mm6                                \n\t"
208
                "pandn %%mm1, %%mm2                                \n\t"
209
                "por %%mm2, %%mm6                                \n\t"// First Line to Filter
210

    
211
                "movq (%0, %1, 8), %%mm5                        \n\t"
212
                "leal (%0, %1, 4), %%eax                        \n\t"
213
                "leal (%0, %1, 8), %%ecx                        \n\t"
214
                "subl %1, %%ecx                                        \n\t"
215
                "addl %1, %0                                        \n\t" // %0 points to line 1 not 0
216
                "movq (%0, %1, 8), %%mm7                        \n\t"
217
                "movq %%mm5, %%mm1                                \n\t"
218
                "movq %%mm7, %%mm2                                \n\t"
219
                "psubusb %%mm7, %%mm5                                \n\t"
220
                "psubusb %%mm1, %%mm2                                \n\t"
221
                "por %%mm5, %%mm2                                \n\t" // ABS Diff of lines
222
                "psubusb %%mm0, %%mm2                                \n\t" // diff <= QP -> 0
223
                "pcmpeqb %%mm4, %%mm2                        \n\t" // diff <= QP -> FF
224

    
225
                "pand %%mm2, %%mm7                                \n\t"
226
                "pandn %%mm1, %%mm2                                \n\t"
227
                "por %%mm2, %%mm7                                \n\t" // First Line to Filter
228

    
229

    
230
                //         1        2        3        4        5        6        7        8
231
                //        %0        %0+%1        %0+2%1        eax        %0+4%1        eax+2%1        ecx        eax+4%1
232
                // 6 4 2 2 1 1
233
                // 6 4 4 2
234
                // 6 8 2
235

    
236
                "movq (%0, %1), %%mm0                                \n\t" //  1
237
                "movq %%mm0, %%mm1                                \n\t" //  1
238
                PAVGB(%%mm6, %%mm0)                                      //1 1        /2
239
                PAVGB(%%mm6, %%mm0)                                      //3 1        /4
240

    
241
                "movq (%0, %1, 4), %%mm2                        \n\t" //     1
242
                "movq %%mm2, %%mm5                                \n\t" //     1
243
                PAVGB((%%eax), %%mm2)                                      //    11        /2
244
                PAVGB((%0, %1, 2), %%mm2)                              //   211        /4
245
                "movq %%mm2, %%mm3                                \n\t" //   211        /4
246
                "movq (%0), %%mm4                                \n\t" // 1
247
                PAVGB(%%mm4, %%mm3)                                      // 4 211        /8
248
                PAVGB(%%mm0, %%mm3)                                      //642211        /16
249
                "movq %%mm3, (%0)                                \n\t" // X
250
                // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
251
                "movq %%mm1, %%mm0                                \n\t" //  1
252
                PAVGB(%%mm6, %%mm0)                                      //1 1        /2
253
                "movq %%mm4, %%mm3                                \n\t" // 1
254
                PAVGB((%0,%1,2), %%mm3)                                      // 1 1        /2
255
                PAVGB((%%eax,%1,2), %%mm5)                              //     11        /2
256
                PAVGB((%%eax), %%mm5)                                      //    211 /4
257
                PAVGB(%%mm5, %%mm3)                                      // 2 2211 /8
258
                PAVGB(%%mm0, %%mm3)                                      //4242211 /16
259
                "movq %%mm3, (%0,%1)                                \n\t" //  X
260
                // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
261
                PAVGB(%%mm4, %%mm6)                                      //11        /2
262
                "movq (%%ecx), %%mm0                                \n\t" //       1
263
                PAVGB((%%eax, %1, 2), %%mm0)                              //      11/2
264
                "movq %%mm0, %%mm3                                \n\t" //      11/2
265
                PAVGB(%%mm1, %%mm0)                                      //  2   11/4
266
                PAVGB(%%mm6, %%mm0)                                      //222   11/8
267
                PAVGB(%%mm2, %%mm0)                                      //22242211/16
268
                "movq (%0, %1, 2), %%mm2                        \n\t" //   1
269
                "movq %%mm0, (%0, %1, 2)                        \n\t" //   X
270
                // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
271
                "movq (%%eax, %1, 4), %%mm0                        \n\t" //        1
272
                PAVGB((%%ecx), %%mm0)                                      //       11        /2
273
                PAVGB(%%mm0, %%mm6)                                      //11     11        /4
274
                PAVGB(%%mm1, %%mm4)                                      // 11                /2
275
                PAVGB(%%mm2, %%mm1)                                      //  11                /2
276
                PAVGB(%%mm1, %%mm6)                                      //1122   11        /8
277
                PAVGB(%%mm5, %%mm6)                                      //112242211        /16
278
                "movq (%%eax), %%mm5                                \n\t" //    1
279
                "movq %%mm6, (%%eax)                                \n\t" //    X
280
                // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
281
                "movq (%%eax, %1, 4), %%mm6                        \n\t" //        1
282
                PAVGB(%%mm7, %%mm6)                                      //        11        /2
283
                PAVGB(%%mm4, %%mm6)                                      // 11     11        /4
284
                PAVGB(%%mm3, %%mm6)                                      // 11   2211        /8
285
                PAVGB(%%mm5, %%mm2)                                      //   11                /2
286
                "movq (%0, %1, 4), %%mm4                        \n\t" //     1
287
                PAVGB(%%mm4, %%mm2)                                      //   112                /4
288
                PAVGB(%%mm2, %%mm6)                                      // 112242211        /16
289
                "movq %%mm6, (%0, %1, 4)                        \n\t" //     X
290
                // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
291
                PAVGB(%%mm7, %%mm1)                                      //  11     2        /4
292
                PAVGB(%%mm4, %%mm5)                                      //    11                /2
293
                PAVGB(%%mm5, %%mm0)                                      //    11 11        /4
294
                "movq (%%eax, %1, 2), %%mm6                        \n\t" //      1
295
                PAVGB(%%mm6, %%mm1)                                      //  11  4  2        /8
296
                PAVGB(%%mm0, %%mm1)                                      //  11224222        /16
297
                "movq %%mm1, (%%eax, %1, 2)                        \n\t" //      X
298
                // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
299
                PAVGB((%%ecx), %%mm2)                                      //   112 4        /8
300
                "movq (%%eax, %1, 4), %%mm0                        \n\t" //        1
301
                PAVGB(%%mm0, %%mm6)                                      //      1 1        /2
302
                PAVGB(%%mm7, %%mm6)                                      //      1 12        /4
303
                PAVGB(%%mm2, %%mm6)                                      //   1122424        /4
304
                "movq %%mm6, (%%ecx)                                \n\t" //       X
305
                // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
306
                PAVGB(%%mm7, %%mm5)                                      //    11   2        /4
307
                PAVGB(%%mm7, %%mm5)                                      //    11   6        /8
308

    
309
                PAVGB(%%mm3, %%mm0)                                      //      112        /4
310
                PAVGB(%%mm0, %%mm5)                                      //    112246        /16
311
                "movq %%mm5, (%%eax, %1, 4)                        \n\t" //        X
312
                "subl %1, %0                                        \n\t"
313

    
314
                :
315
                : "r" (src), "r" (stride), "m" (c->pQPb)
316
                : "%eax", "%ecx"
317
        );
318
#else
319
        const int l1= stride;
320
        const int l2= stride + l1;
321
        const int l3= stride + l2;
322
        const int l4= stride + l3;
323
        const int l5= stride + l4;
324
        const int l6= stride + l5;
325
        const int l7= stride + l6;
326
        const int l8= stride + l7;
327
        const int l9= stride + l8;
328
        int x;
329
        src+= stride*3;
330
        for(x=0; x<BLOCK_SIZE; x++)
331
        {
332
                const int first= ABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
333
                const int last= ABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
334

    
335
                int sums[9];
336
                sums[0] = first + src[l1];
337
                sums[1] = src[l1] + src[l2];
338
                sums[2] = src[l2] + src[l3];
339
                sums[3] = src[l3] + src[l4];
340
                sums[4] = src[l4] + src[l5];
341
                sums[5] = src[l5] + src[l6];
342
                sums[6] = src[l6] + src[l7];
343
                sums[7] = src[l7] + src[l8];
344
                sums[8] = src[l8] + last;
345

    
346
                src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
347
                src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
348
                src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
349
                src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
350
                src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
351
                src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
352
                src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
353
                src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
354

    
355
                src++;
356
        }
357
#endif
358
}
359

    
360
#if 0
361
/**
362
 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
363
 * values are correctly clipped (MMX2)
364
 * values are wraparound (C)
365
 * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient
366
        0 8 16 24
367
        x = 8
368
        x/2 = 4
369
        x/8 = 1
370
        1 12 12 23
371
 */
372
static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
373
{
374
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
375
        src+= stride*3;
376
// FIXME rounding
377
        asm volatile(
378
                "pxor %%mm7, %%mm7                                \n\t" // 0
379
                "movq "MANGLE(b80)", %%mm6                        \n\t" // MIN_SIGNED_BYTE
380
                "leal (%0, %1), %%eax                                \n\t"
381
                "leal (%%eax, %1, 4), %%ecx                        \n\t"
382
//        0        1        2        3        4        5        6        7        8        9
383
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ecx        ecx+%1        ecx+2%1        %0+8%1        ecx+4%1
384
                "movq "MANGLE(pQPb)", %%mm0                        \n\t" // QP,..., QP
385
                "movq %%mm0, %%mm1                                \n\t" // QP,..., QP
386
                "paddusb "MANGLE(b02)", %%mm0                        \n\t"
387
                "psrlw $2, %%mm0                                \n\t"
388
                "pand "MANGLE(b3F)", %%mm0                        \n\t" // QP/4,..., QP/4
389
                "paddusb %%mm1, %%mm0                                \n\t" // QP*1.25 ...
390
                "movq (%0, %1, 4), %%mm2                        \n\t" // line 4
391
                "movq (%%ecx), %%mm3                                \n\t" // line 5
392
                "movq %%mm2, %%mm4                                \n\t" // line 4
393
                "pcmpeqb %%mm5, %%mm5                                \n\t" // -1
394
                "pxor %%mm2, %%mm5                                \n\t" // -line 4 - 1
395
                PAVGB(%%mm3, %%mm5)
396
                "paddb %%mm6, %%mm5                                \n\t" // (l5-l4)/2
397
                "psubusb %%mm3, %%mm4                                \n\t"
398
                "psubusb %%mm2, %%mm3                                \n\t"
399
                "por %%mm3, %%mm4                                \n\t" // |l4 - l5|
400
                "psubusb %%mm0, %%mm4                                \n\t"
401
                "pcmpeqb %%mm7, %%mm4                                \n\t"
402
                "pand %%mm4, %%mm5                                \n\t" // d/2
403

404
//                "paddb %%mm6, %%mm2                                \n\t" // line 4 + 0x80
405
                "paddb %%mm5, %%mm2                                \n\t"
406
//                "psubb %%mm6, %%mm2                                \n\t"
407
                "movq %%mm2, (%0,%1, 4)                                \n\t"
408

409
                "movq (%%ecx), %%mm2                                \n\t"
410
//                "paddb %%mm6, %%mm2                                \n\t" // line 5 + 0x80
411
                "psubb %%mm5, %%mm2                                \n\t"
412
//                "psubb %%mm6, %%mm2                                \n\t"
413
                "movq %%mm2, (%%ecx)                                \n\t"
414

415
                "paddb %%mm6, %%mm5                                \n\t"
416
                "psrlw $2, %%mm5                                \n\t"
417
                "pand "MANGLE(b3F)", %%mm5                        \n\t"
418
                "psubb "MANGLE(b20)", %%mm5                        \n\t" // (l5-l4)/8
419

420
                "movq (%%eax, %1, 2), %%mm2                        \n\t"
421
                "paddb %%mm6, %%mm2                                \n\t" // line 3 + 0x80
422
                "paddsb %%mm5, %%mm2                                \n\t"
423
                "psubb %%mm6, %%mm2                                \n\t"
424
                "movq %%mm2, (%%eax, %1, 2)                        \n\t"
425

426
                "movq (%%ecx, %1), %%mm2                        \n\t"
427
                "paddb %%mm6, %%mm2                                \n\t" // line 6 + 0x80
428
                "psubsb %%mm5, %%mm2                                \n\t"
429
                "psubb %%mm6, %%mm2                                \n\t"
430
                "movq %%mm2, (%%ecx, %1)                        \n\t"
431

432
                :
433
                : "r" (src), "r" (stride)
434
                : "%eax", "%ecx"
435
        );
436
#else
437
         const int l1= stride;
438
        const int l2= stride + l1;
439
        const int l3= stride + l2;
440
        const int l4= stride + l3;
441
        const int l5= stride + l4;
442
        const int l6= stride + l5;
443
//        const int l7= stride + l6;
444
//        const int l8= stride + l7;
445
//        const int l9= stride + l8;
446
        int x;
447
        const int QP15= QP + (QP>>2);
448
        src+= stride*3;
449
        for(x=0; x<BLOCK_SIZE; x++)
450
        {
451
                const int v = (src[x+l5] - src[x+l4]);
452
                if(ABS(v) < QP15)
453
                {
454
                        src[x+l3] +=v>>3;
455
                        src[x+l4] +=v>>1;
456
                        src[x+l5] -=v>>1;
457
                        src[x+l6] -=v>>3;
458

    
459
                }
460
        }
461

    
462
#endif
463
}
464
#endif
465

    
466
/**
467
 * Experimental Filter 1
468
 * will not damage linear gradients
469
 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
470
 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
471
 * MMX2 version does correct clipping C version doesnt
472
 */
473
static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
474
{
475
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
476
        src+= stride*3;
477

    
478
        asm volatile(
479
                "pxor %%mm7, %%mm7                                \n\t" // 0
480
                "leal (%0, %1), %%eax                                \n\t"
481
                "leal (%%eax, %1, 4), %%ecx                        \n\t"
482
//        0        1        2        3        4        5        6        7        8        9
483
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ecx        ecx+%1        ecx+2%1        %0+8%1        ecx+4%1
484
                "movq (%%eax, %1, 2), %%mm0                        \n\t" // line 3
485
                "movq (%0, %1, 4), %%mm1                        \n\t" // line 4
486
                "movq %%mm1, %%mm2                                \n\t" // line 4
487
                "psubusb %%mm0, %%mm1                                \n\t"
488
                "psubusb %%mm2, %%mm0                                \n\t"
489
                "por %%mm1, %%mm0                                \n\t" // |l2 - l3|
490
                "movq (%%ecx), %%mm3                                \n\t" // line 5
491
                "movq (%%ecx, %1), %%mm4                        \n\t" // line 6
492
                "movq %%mm3, %%mm5                                \n\t" // line 5
493
                "psubusb %%mm4, %%mm3                                \n\t"
494
                "psubusb %%mm5, %%mm4                                \n\t"
495
                "por %%mm4, %%mm3                                \n\t" // |l5 - l6|
496
                PAVGB(%%mm3, %%mm0)                                      // (|l2 - l3| + |l5 - l6|)/2
497
                "movq %%mm2, %%mm1                                \n\t" // line 4
498
                "psubusb %%mm5, %%mm2                                \n\t"
499
                "movq %%mm2, %%mm4                                \n\t"
500
                "pcmpeqb %%mm7, %%mm2                                \n\t" // (l4 - l5) <= 0 ? -1 : 0
501
                "psubusb %%mm1, %%mm5                                \n\t"
502
                "por %%mm5, %%mm4                                \n\t" // |l4 - l5|
503
                "psubusb %%mm0, %%mm4                \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
504
                "movq %%mm4, %%mm3                                \n\t" // d
505
                "movq %2, %%mm0                        \n\t"
506
                "paddusb %%mm0, %%mm0                                \n\t"
507
                "psubusb %%mm0, %%mm4                                \n\t"
508
                "pcmpeqb %%mm7, %%mm4                                \n\t" // d <= QP ? -1 : 0
509
                "psubusb "MANGLE(b01)", %%mm3                        \n\t"
510
                "pand %%mm4, %%mm3                                \n\t" // d <= QP ? d : 0
511

    
512
                PAVGB(%%mm7, %%mm3)                                      // d/2
513
                "movq %%mm3, %%mm1                                \n\t" // d/2
514
                PAVGB(%%mm7, %%mm3)                                      // d/4
515
                PAVGB(%%mm1, %%mm3)                                      // 3*d/8
516

    
517
                "movq (%0, %1, 4), %%mm0                        \n\t" // line 4
518
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
519
                "psubusb %%mm3, %%mm0                                \n\t"
520
                "pxor %%mm2, %%mm0                                \n\t"
521
                "movq %%mm0, (%0, %1, 4)                        \n\t" // line 4
522

    
523
                "movq (%%ecx), %%mm0                                \n\t" // line 5
524
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
525
                "paddusb %%mm3, %%mm0                                \n\t"
526
                "pxor %%mm2, %%mm0                                \n\t"
527
                "movq %%mm0, (%%ecx)                                \n\t" // line 5
528

    
529
                PAVGB(%%mm7, %%mm1)                                      // d/4
530

    
531
                "movq (%%eax, %1, 2), %%mm0                        \n\t" // line 3
532
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
533
                "psubusb %%mm1, %%mm0                                \n\t"
534
                "pxor %%mm2, %%mm0                                \n\t"
535
                "movq %%mm0, (%%eax, %1, 2)                        \n\t" // line 3
536

    
537
                "movq (%%ecx, %1), %%mm0                        \n\t" // line 6
538
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
539
                "paddusb %%mm1, %%mm0                                \n\t"
540
                "pxor %%mm2, %%mm0                                \n\t"
541
                "movq %%mm0, (%%ecx, %1)                        \n\t" // line 6
542

    
543
                PAVGB(%%mm7, %%mm1)                                      // d/8
544

    
545
                "movq (%%eax, %1), %%mm0                        \n\t" // line 2
546
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
547
                "psubusb %%mm1, %%mm0                                \n\t"
548
                "pxor %%mm2, %%mm0                                \n\t"
549
                "movq %%mm0, (%%eax, %1)                        \n\t" // line 2
550

    
551
                "movq (%%ecx, %1, 2), %%mm0                        \n\t" // line 7
552
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
553
                "paddusb %%mm1, %%mm0                                \n\t"
554
                "pxor %%mm2, %%mm0                                \n\t"
555
                "movq %%mm0, (%%ecx, %1, 2)                        \n\t" // line 7
556

    
557
                :
558
                : "r" (src), "r" (stride), "m" (co->pQPb)
559
                : "%eax", "%ecx"
560
        );
561
#else
562

    
563
         const int l1= stride;
564
        const int l2= stride + l1;
565
        const int l3= stride + l2;
566
        const int l4= stride + l3;
567
        const int l5= stride + l4;
568
        const int l6= stride + l5;
569
        const int l7= stride + l6;
570
//        const int l8= stride + l7;
571
//        const int l9= stride + l8;
572
        int x;
573

    
574
        src+= stride*3;
575
        for(x=0; x<BLOCK_SIZE; x++)
576
        {
577
                int a= src[l3] - src[l4];
578
                int b= src[l4] - src[l5];
579
                int c= src[l5] - src[l6];
580

    
581
                int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
582
                d= MAX(d, 0);
583

    
584
                if(d < co->QP*2)
585
                {
586
                        int v = d * SIGN(-b);
587

    
588
                        src[l2] +=v>>3;
589
                        src[l3] +=v>>2;
590
                        src[l4] +=(3*v)>>3;
591
                        src[l5] -=(3*v)>>3;
592
                        src[l6] -=v>>2;
593
                        src[l7] -=v>>3;
594

    
595
                }
596
                src++;
597
        }
598
#endif
599
}
600

    
601
static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
602
{
603
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
604
/*
605
        uint8_t tmp[16];
606
        const int l1= stride;
607
        const int l2= stride + l1;
608
        const int l3= stride + l2;
609
        const int l4= (int)tmp - (int)src - stride*3;
610
        const int l5= (int)tmp - (int)src - stride*3 + 8;
611
        const int l6= stride*3 + l3;
612
        const int l7= stride + l6;
613
        const int l8= stride + l7;
614

615
        memcpy(tmp, src+stride*7, 8);
616
        memcpy(tmp+8, src+stride*8, 8);
617
*/
618
        src+= stride*4;
619
        asm volatile(
620

    
621
#if 0 //slightly more accurate and slightly slower
622
                "pxor %%mm7, %%mm7                                \n\t" // 0
623
                "leal (%0, %1), %%eax                                \n\t"
624
                "leal (%%eax, %1, 4), %%ecx                        \n\t"
625
//        0        1        2        3        4        5        6        7
626
//        %0        %0+%1        %0+2%1        eax+2%1        %0+4%1        eax+4%1        ecx+%1        ecx+2%1
627
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ecx        ecx+%1        ecx+2%1
628

629

630
                "movq (%0, %1, 2), %%mm0                        \n\t" // l2
631
                "movq (%0), %%mm1                                \n\t" // l0
632
                "movq %%mm0, %%mm2                                \n\t" // l2
633
                PAVGB(%%mm7, %%mm0)                                      // ~l2/2
634
                PAVGB(%%mm1, %%mm0)                                      // ~(l2 + 2l0)/4
635
                PAVGB(%%mm2, %%mm0)                                      // ~(5l2 + 2l0)/8
636

637
                "movq (%%eax), %%mm1                                \n\t" // l1
638
                "movq (%%eax, %1, 2), %%mm3                        \n\t" // l3
639
                "movq %%mm1, %%mm4                                \n\t" // l1
640
                PAVGB(%%mm7, %%mm1)                                      // ~l1/2
641
                PAVGB(%%mm3, %%mm1)                                      // ~(l1 + 2l3)/4
642
                PAVGB(%%mm4, %%mm1)                                      // ~(5l1 + 2l3)/8
643

644
                "movq %%mm0, %%mm4                                \n\t" // ~(5l2 + 2l0)/8
645
                "psubusb %%mm1, %%mm0                                \n\t"
646
                "psubusb %%mm4, %%mm1                                \n\t"
647
                "por %%mm0, %%mm1                                \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
648
// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
649

650
                "movq (%0, %1, 4), %%mm0                        \n\t" // l4
651
                "movq %%mm0, %%mm4                                \n\t" // l4
652
                PAVGB(%%mm7, %%mm0)                                      // ~l4/2
653
                PAVGB(%%mm2, %%mm0)                                      // ~(l4 + 2l2)/4
654
                PAVGB(%%mm4, %%mm0)                                      // ~(5l4 + 2l2)/8
655

656
                "movq (%%ecx), %%mm2                                \n\t" // l5
657
                "movq %%mm3, %%mm5                                \n\t" // l3
658
                PAVGB(%%mm7, %%mm3)                                      // ~l3/2
659
                PAVGB(%%mm2, %%mm3)                                      // ~(l3 + 2l5)/4
660
                PAVGB(%%mm5, %%mm3)                                      // ~(5l3 + 2l5)/8
661

662
                "movq %%mm0, %%mm6                                \n\t" // ~(5l4 + 2l2)/8
663
                "psubusb %%mm3, %%mm0                                \n\t"
664
                "psubusb %%mm6, %%mm3                                \n\t"
665
                "por %%mm0, %%mm3                                \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
666
                "pcmpeqb %%mm7, %%mm0                                \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
667
// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
668

669
                "movq (%%ecx, %1), %%mm6                        \n\t" // l6
670
                "movq %%mm6, %%mm5                                \n\t" // l6
671
                PAVGB(%%mm7, %%mm6)                                      // ~l6/2
672
                PAVGB(%%mm4, %%mm6)                                      // ~(l6 + 2l4)/4
673
                PAVGB(%%mm5, %%mm6)                                      // ~(5l6 + 2l4)/8
674

675
                "movq (%%ecx, %1, 2), %%mm5                        \n\t" // l7
676
                "movq %%mm2, %%mm4                                \n\t" // l5
677
                PAVGB(%%mm7, %%mm2)                                      // ~l5/2
678
                PAVGB(%%mm5, %%mm2)                                      // ~(l5 + 2l7)/4
679
                PAVGB(%%mm4, %%mm2)                                      // ~(5l5 + 2l7)/8
680

681
                "movq %%mm6, %%mm4                                \n\t" // ~(5l6 + 2l4)/8
682
                "psubusb %%mm2, %%mm6                                \n\t"
683
                "psubusb %%mm4, %%mm2                                \n\t"
684
                "por %%mm6, %%mm2                                \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
685
// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
686

687

688
                PMINUB(%%mm2, %%mm1, %%mm4)                              // MIN(|lenergy|,|renergy|)/8
689
                "movq %2, %%mm4                                        \n\t" // QP //FIXME QP+1 ?
690
                "paddusb "MANGLE(b01)", %%mm4                        \n\t"
691
                "pcmpgtb %%mm3, %%mm4                                \n\t" // |menergy|/8 < QP
692
                "psubusb %%mm1, %%mm3                                \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
693
                "pand %%mm4, %%mm3                                \n\t"
694

695
                "movq %%mm3, %%mm1                                \n\t"
696
//                "psubusb "MANGLE(b01)", %%mm3                        \n\t"
697
                PAVGB(%%mm7, %%mm3)
698
                PAVGB(%%mm7, %%mm3)
699
                "paddusb %%mm1, %%mm3                                \n\t"
700
//                "paddusb "MANGLE(b01)", %%mm3                        \n\t"
701

702
                "movq (%%eax, %1, 2), %%mm6                        \n\t" //l3
703
                "movq (%0, %1, 4), %%mm5                        \n\t" //l4
704
                "movq (%0, %1, 4), %%mm4                        \n\t" //l4
705
                "psubusb %%mm6, %%mm5                                \n\t"
706
                "psubusb %%mm4, %%mm6                                \n\t"
707
                "por %%mm6, %%mm5                                \n\t" // |l3-l4|
708
                "pcmpeqb %%mm7, %%mm6                                \n\t" // SIGN(l3-l4)
709
                "pxor %%mm6, %%mm0                                \n\t"
710
                "pand %%mm0, %%mm3                                \n\t"
711
                PMINUB(%%mm5, %%mm3, %%mm0)
712

713
                "psubusb "MANGLE(b01)", %%mm3                        \n\t"
714
                PAVGB(%%mm7, %%mm3)
715

716
                "movq (%%eax, %1, 2), %%mm0                        \n\t"
717
                "movq (%0, %1, 4), %%mm2                        \n\t"
718
                "pxor %%mm6, %%mm0                                \n\t"
719
                "pxor %%mm6, %%mm2                                \n\t"
720
                "psubb %%mm3, %%mm0                                \n\t"
721
                "paddb %%mm3, %%mm2                                \n\t"
722
                "pxor %%mm6, %%mm0                                \n\t"
723
                "pxor %%mm6, %%mm2                                \n\t"
724
                "movq %%mm0, (%%eax, %1, 2)                        \n\t"
725
                "movq %%mm2, (%0, %1, 4)                        \n\t"
726
#endif
727

    
728
                "leal (%0, %1), %%eax                                \n\t"
729
                "pcmpeqb %%mm6, %%mm6                                \n\t" // -1
730
//        0        1        2        3        4        5        6        7
731
//        %0        %0+%1        %0+2%1        eax+2%1        %0+4%1        eax+4%1        ecx+%1        ecx+2%1
732
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ecx        ecx+%1        ecx+2%1
733

    
734

    
735
                "movq (%%eax, %1, 2), %%mm1                        \n\t" // l3
736
                "movq (%0, %1, 4), %%mm0                        \n\t" // l4
737
                "pxor %%mm6, %%mm1                                \n\t" // -l3-1
738
                PAVGB(%%mm1, %%mm0)                                      // -q+128 = (l4-l3+256)/2
739
// mm1=-l3-1, mm0=128-q
740

    
741
                "movq (%%eax, %1, 4), %%mm2                        \n\t" // l5
742
                "movq (%%eax, %1), %%mm3                        \n\t" // l2
743
                "pxor %%mm6, %%mm2                                \n\t" // -l5-1
744
                "movq %%mm2, %%mm5                                \n\t" // -l5-1
745
                "movq "MANGLE(b80)", %%mm4                        \n\t" // 128
746
                "leal (%%eax, %1, 4), %%ecx                        \n\t"
747
                PAVGB(%%mm3, %%mm2)                                      // (l2-l5+256)/2
748
                PAVGB(%%mm0, %%mm4)                                      // ~(l4-l3)/4 + 128
749
                PAVGB(%%mm2, %%mm4)                                      // ~(l2-l5)/4 +(l4-l3)/8 + 128
750
                PAVGB(%%mm0, %%mm4)                                      // ~(l2-l5)/8 +5(l4-l3)/16 + 128
751
// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
752

    
753
                "movq (%%eax), %%mm2                                \n\t" // l1
754
                "pxor %%mm6, %%mm2                                \n\t" // -l1-1
755
                PAVGB(%%mm3, %%mm2)                                      // (l2-l1+256)/2
756
                PAVGB((%0), %%mm1)                                      // (l0-l3+256)/2
757
                "movq "MANGLE(b80)", %%mm3                        \n\t" // 128
758
                PAVGB(%%mm2, %%mm3)                                      // ~(l2-l1)/4 + 128
759
                PAVGB(%%mm1, %%mm3)                                      // ~(l0-l3)/4 +(l2-l1)/8 + 128
760
                PAVGB(%%mm2, %%mm3)                                      // ~(l0-l3)/8 +5(l2-l1)/16 + 128
761
// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
762

    
763
                PAVGB((%%ecx, %1), %%mm5)                              // (l6-l5+256)/2
764
                "movq (%%ecx, %1, 2), %%mm1                        \n\t" // l7
765
                "pxor %%mm6, %%mm1                                \n\t" // -l7-1
766
                PAVGB((%0, %1, 4), %%mm1)                              // (l4-l7+256)/2
767
                "movq "MANGLE(b80)", %%mm2                        \n\t" // 128
768
                PAVGB(%%mm5, %%mm2)                                      // ~(l6-l5)/4 + 128
769
                PAVGB(%%mm1, %%mm2)                                      // ~(l4-l7)/4 +(l6-l5)/8 + 128
770
                PAVGB(%%mm5, %%mm2)                                      // ~(l4-l7)/8 +5(l6-l5)/16 + 128
771
// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
772

    
773
                "movq "MANGLE(b00)", %%mm1                        \n\t" // 0
774
                "movq "MANGLE(b00)", %%mm5                        \n\t" // 0
775
                "psubb %%mm2, %%mm1                                \n\t" // 128 - renergy/16
776
                "psubb %%mm3, %%mm5                                \n\t" // 128 - lenergy/16
777
                PMAXUB(%%mm1, %%mm2)                                      // 128 + |renergy/16|
778
                 PMAXUB(%%mm5, %%mm3)                                      // 128 + |lenergy/16|
779
                PMINUB(%%mm2, %%mm3, %%mm1)                              // 128 + MIN(|lenergy|,|renergy|)/16
780

    
781
// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
782

    
783
                "movq "MANGLE(b00)", %%mm7                        \n\t" // 0
784
                "movq %2, %%mm2                                        \n\t" // QP
785
                PAVGB(%%mm6, %%mm2)                                      // 128 + QP/2
786
                "psubb %%mm6, %%mm2                                \n\t"
787

    
788
                "movq %%mm4, %%mm1                                \n\t"
789
                "pcmpgtb %%mm7, %%mm1                                \n\t" // SIGN(menergy)
790
                "pxor %%mm1, %%mm4                                \n\t"
791
                "psubb %%mm1, %%mm4                                \n\t" // 128 + |menergy|/16
792
                "pcmpgtb %%mm4, %%mm2                                \n\t" // |menergy|/16 < QP/2
793
                "psubusb %%mm3, %%mm4                                \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
794
// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
795

    
796
                "movq %%mm4, %%mm3                                \n\t" // d
797
                "psubusb "MANGLE(b01)", %%mm4                        \n\t"
798
                PAVGB(%%mm7, %%mm4)                                      // d/32
799
                PAVGB(%%mm7, %%mm4)                                      // (d + 32)/64
800
                "paddb %%mm3, %%mm4                                \n\t" // 5d/64
801
                "pand %%mm2, %%mm4                                \n\t"
802

    
803
                "movq "MANGLE(b80)", %%mm5                        \n\t" // 128
804
                "psubb %%mm0, %%mm5                                \n\t" // q
805
                "paddsb %%mm6, %%mm5                                \n\t" // fix bad rounding
806
                "pcmpgtb %%mm5, %%mm7                                \n\t" // SIGN(q)
807
                "pxor %%mm7, %%mm5                                \n\t"
808

    
809
                PMINUB(%%mm5, %%mm4, %%mm3)                              // MIN(|q|, 5d/64)
810
                "pxor %%mm1, %%mm7                                \n\t" // SIGN(d*q)
811

    
812
                "pand %%mm7, %%mm4                                \n\t"
813
                "movq (%%eax, %1, 2), %%mm0                        \n\t"
814
                "movq (%0, %1, 4), %%mm2                        \n\t"
815
                "pxor %%mm1, %%mm0                                \n\t"
816
                "pxor %%mm1, %%mm2                                \n\t"
817
                "paddb %%mm4, %%mm0                                \n\t"
818
                "psubb %%mm4, %%mm2                                \n\t"
819
                "pxor %%mm1, %%mm0                                \n\t"
820
                "pxor %%mm1, %%mm2                                \n\t"
821
                "movq %%mm0, (%%eax, %1, 2)                        \n\t"
822
                "movq %%mm2, (%0, %1, 4)                        \n\t"
823

    
824
                :
825
                : "r" (src), "r" (stride), "m" (c->pQPb)
826
                : "%eax", "%ecx"
827
        );
828

    
829
/*
830
        {
831
        int x;
832
        src-= stride;
833
        for(x=0; x<BLOCK_SIZE; x++)
834
        {
835
                const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
836
                if(ABS(middleEnergy)< 8*QP)
837
                {
838
                        const int q=(src[l4] - src[l5])/2;
839
                        const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
840
                        const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
841

842
                        int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
843
                        d= MAX(d, 0);
844

845
                        d= (5*d + 32) >> 6;
846
                        d*= SIGN(-middleEnergy);
847

848
                        if(q>0)
849
                        {
850
                                d= d<0 ? 0 : d;
851
                                d= d>q ? q : d;
852
                        }
853
                        else
854
                        {
855
                                d= d>0 ? 0 : d;
856
                                d= d<q ? q : d;
857
                        }
858

859
                        src[l4]-= d;
860
                        src[l5]+= d;
861
                }
862
                src++;
863
        }
864
src-=8;
865
        for(x=0; x<8; x++)
866
        {
867
                int y;
868
                for(y=4; y<6; y++)
869
                {
870
                        int d= src[x+y*stride] - tmp[x+(y-4)*8];
871
                        int ad= ABS(d);
872
                        static int max=0;
873
                        static int sum=0;
874
                        static int num=0;
875
                        static int bias=0;
876

877
                        if(max<ad) max=ad;
878
                        sum+= ad>3 ? 1 : 0;
879
                        if(ad>3)
880
                        {
881
                                src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
882
                        }
883
                        if(y==4) bias+=d;
884
                        num++;
885
                        if(num%1000000 == 0)
886
                        {
887
                                printf(" %d %d %d %d\n", num, sum, max, bias);
888
                        }
889
                }
890
        }
891
}
892
*/
893
#elif defined (HAVE_MMX)
894
        src+= stride*4;
895

    
896
        asm volatile(
897
                "pxor %%mm7, %%mm7                                \n\t"
898
                "leal (%0, %1), %%eax                                \n\t"
899
                "leal (%%eax, %1, 4), %%edx                        \n\t"
900
                "leal -40(%%esp), %%ecx                                \n\t" // make space for 4 8-byte vars
901
                "andl $0xFFFFFFF8, %%ecx                        \n\t" // align
902
//        0        1        2        3        4        5        6        7
903
//        %0        %0+%1        %0+2%1        eax+2%1        %0+4%1        eax+4%1        edx+%1        edx+2%1
904
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1
905

    
906
                "movq (%0), %%mm0                                \n\t"
907
                "movq %%mm0, %%mm1                                \n\t"
908
                "punpcklbw %%mm7, %%mm0                                \n\t" // low part of line 0
909
                "punpckhbw %%mm7, %%mm1                                \n\t" // high part of line 0
910

    
911
                "movq (%%eax), %%mm2                                \n\t"
912
                "movq %%mm2, %%mm3                                \n\t"
913
                "punpcklbw %%mm7, %%mm2                                \n\t" // low part of line 1
914
                "punpckhbw %%mm7, %%mm3                                \n\t" // high part of line 1
915

    
916
                "movq (%%eax, %1), %%mm4                        \n\t"
917
                "movq %%mm4, %%mm5                                \n\t"
918
                "punpcklbw %%mm7, %%mm4                                \n\t" // low part of line 2
919
                "punpckhbw %%mm7, %%mm5                                \n\t" // high part of line 2
920

    
921
                "paddw %%mm0, %%mm0                                \n\t" // 2L0
922
                "paddw %%mm1, %%mm1                                \n\t" // 2H0
923
                "psubw %%mm4, %%mm2                                \n\t" // L1 - L2
924
                "psubw %%mm5, %%mm3                                \n\t" // H1 - H2
925
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - L1 + L2
926
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - H1 + H2
927

    
928
                "psllw $2, %%mm2                                \n\t" // 4L1 - 4L2
929
                "psllw $2, %%mm3                                \n\t" // 4H1 - 4H2
930
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - 5L1 + 5L2
931
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - 5H1 + 5H2
932

    
933
                "movq (%%eax, %1, 2), %%mm2                        \n\t"
934
                "movq %%mm2, %%mm3                                \n\t"
935
                "punpcklbw %%mm7, %%mm2                                \n\t" // L3
936
                "punpckhbw %%mm7, %%mm3                                \n\t" // H3
937

    
938
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - 5L1 + 5L2 - L3
939
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - 5H1 + 5H2 - H3
940
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
941
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - 5H1 + 5H2 - 2H3
942
                "movq %%mm0, (%%ecx)                                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
943
                "movq %%mm1, 8(%%ecx)                                \n\t" // 2H0 - 5H1 + 5H2 - 2H3
944

    
945
                "movq (%0, %1, 4), %%mm0                        \n\t"
946
                "movq %%mm0, %%mm1                                \n\t"
947
                "punpcklbw %%mm7, %%mm0                                \n\t" // L4
948
                "punpckhbw %%mm7, %%mm1                                \n\t" // H4
949

    
950
                "psubw %%mm0, %%mm2                                \n\t" // L3 - L4
951
                "psubw %%mm1, %%mm3                                \n\t" // H3 - H4
952
                "movq %%mm2, 16(%%ecx)                                \n\t" // L3 - L4
953
                "movq %%mm3, 24(%%ecx)                                \n\t" // H3 - H4
954
                "paddw %%mm4, %%mm4                                \n\t" // 2L2
955
                "paddw %%mm5, %%mm5                                \n\t" // 2H2
956
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - L3 + L4
957
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - H3 + H4
958

    
959
                "psllw $2, %%mm2                                \n\t" // 4L3 - 4L4
960
                "psllw $2, %%mm3                                \n\t" // 4H3 - 4H4
961
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - 5L3 + 5L4
962
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - 5H3 + 5H4
963
//50 opcodes so far
964
                "movq (%%edx), %%mm2                                \n\t"
965
                "movq %%mm2, %%mm3                                \n\t"
966
                "punpcklbw %%mm7, %%mm2                                \n\t" // L5
967
                "punpckhbw %%mm7, %%mm3                                \n\t" // H5
968
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - 5L3 + 5L4 - L5
969
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - 5H3 + 5H4 - H5
970
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - 5L3 + 5L4 - 2L5
971
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - 5H3 + 5H4 - 2H5
972

    
973
                "movq (%%edx, %1), %%mm6                        \n\t"
974
                "punpcklbw %%mm7, %%mm6                                \n\t" // L6
975
                "psubw %%mm6, %%mm2                                \n\t" // L5 - L6
976
                "movq (%%edx, %1), %%mm6                        \n\t"
977
                "punpckhbw %%mm7, %%mm6                                \n\t" // H6
978
                "psubw %%mm6, %%mm3                                \n\t" // H5 - H6
979

    
980
                "paddw %%mm0, %%mm0                                \n\t" // 2L4
981
                "paddw %%mm1, %%mm1                                \n\t" // 2H4
982
                "psubw %%mm2, %%mm0                                \n\t" // 2L4 - L5 + L6
983
                "psubw %%mm3, %%mm1                                \n\t" // 2H4 - H5 + H6
984

    
985
                "psllw $2, %%mm2                                \n\t" // 4L5 - 4L6
986
                "psllw $2, %%mm3                                \n\t" // 4H5 - 4H6
987
                "psubw %%mm2, %%mm0                                \n\t" // 2L4 - 5L5 + 5L6
988
                "psubw %%mm3, %%mm1                                \n\t" // 2H4 - 5H5 + 5H6
989

    
990
                "movq (%%edx, %1, 2), %%mm2                        \n\t"
991
                "movq %%mm2, %%mm3                                \n\t"
992
                "punpcklbw %%mm7, %%mm2                                \n\t" // L7
993
                "punpckhbw %%mm7, %%mm3                                \n\t" // H7
994

    
995
                "paddw %%mm2, %%mm2                                \n\t" // 2L7
996
                "paddw %%mm3, %%mm3                                \n\t" // 2H7
997
                "psubw %%mm2, %%mm0                                \n\t" // 2L4 - 5L5 + 5L6 - 2L7
998
                "psubw %%mm3, %%mm1                                \n\t" // 2H4 - 5H5 + 5H6 - 2H7
999

    
1000
                "movq (%%ecx), %%mm2                                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1001
                "movq 8(%%ecx), %%mm3                                \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1002

    
1003
#ifdef HAVE_MMX2
1004
                "movq %%mm7, %%mm6                                \n\t" // 0
1005
                "psubw %%mm0, %%mm6                                \n\t"
1006
                "pmaxsw %%mm6, %%mm0                                \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1007
                "movq %%mm7, %%mm6                                \n\t" // 0
1008
                "psubw %%mm1, %%mm6                                \n\t"
1009
                "pmaxsw %%mm6, %%mm1                                \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1010
                "movq %%mm7, %%mm6                                \n\t" // 0
1011
                "psubw %%mm2, %%mm6                                \n\t"
1012
                "pmaxsw %%mm6, %%mm2                                \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1013
                "movq %%mm7, %%mm6                                \n\t" // 0
1014
                "psubw %%mm3, %%mm6                                \n\t"
1015
                "pmaxsw %%mm6, %%mm3                                \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1016
#else
1017
                "movq %%mm7, %%mm6                                \n\t" // 0
1018
                "pcmpgtw %%mm0, %%mm6                                \n\t"
1019
                "pxor %%mm6, %%mm0                                \n\t"
1020
                "psubw %%mm6, %%mm0                                \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1021
                "movq %%mm7, %%mm6                                \n\t" // 0
1022
                "pcmpgtw %%mm1, %%mm6                                \n\t"
1023
                "pxor %%mm6, %%mm1                                \n\t"
1024
                "psubw %%mm6, %%mm1                                \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1025
                "movq %%mm7, %%mm6                                \n\t" // 0
1026
                "pcmpgtw %%mm2, %%mm6                                \n\t"
1027
                "pxor %%mm6, %%mm2                                \n\t"
1028
                "psubw %%mm6, %%mm2                                \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1029
                "movq %%mm7, %%mm6                                \n\t" // 0
1030
                "pcmpgtw %%mm3, %%mm6                                \n\t"
1031
                "pxor %%mm6, %%mm3                                \n\t"
1032
                "psubw %%mm6, %%mm3                                \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1033
#endif
1034

    
1035
#ifdef HAVE_MMX2
1036
                "pminsw %%mm2, %%mm0                                \n\t"
1037
                "pminsw %%mm3, %%mm1                                \n\t"
1038
#else
1039
                "movq %%mm0, %%mm6                                \n\t"
1040
                "psubusw %%mm2, %%mm6                                \n\t"
1041
                "psubw %%mm6, %%mm0                                \n\t"
1042
                "movq %%mm1, %%mm6                                \n\t"
1043
                "psubusw %%mm3, %%mm6                                \n\t"
1044
                "psubw %%mm6, %%mm1                                \n\t"
1045
#endif
1046

    
1047
                "movq %%mm7, %%mm6                                \n\t" // 0
1048
                "pcmpgtw %%mm4, %%mm6                                \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1049
                "pxor %%mm6, %%mm4                                \n\t"
1050
                "psubw %%mm6, %%mm4                                \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1051
                "pcmpgtw %%mm5, %%mm7                                \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1052
                "pxor %%mm7, %%mm5                                \n\t"
1053
                "psubw %%mm7, %%mm5                                \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1054
// 100 opcodes
1055
                "movd %2, %%mm2                                        \n\t" // QP
1056
                "psllw $3, %%mm2                                \n\t" // 8QP
1057
                "movq %%mm2, %%mm3                                \n\t" // 8QP
1058
                "pcmpgtw %%mm4, %%mm2                                \n\t"
1059
                "pcmpgtw %%mm5, %%mm3                                \n\t"
1060
                "pand %%mm2, %%mm4                                \n\t"
1061
                "pand %%mm3, %%mm5                                \n\t"
1062

    
1063

    
1064
                "psubusw %%mm0, %%mm4                                \n\t" // hd
1065
                "psubusw %%mm1, %%mm5                                \n\t" // ld
1066

    
1067

    
1068
                "movq "MANGLE(w05)", %%mm2                        \n\t" // 5
1069
                "pmullw %%mm2, %%mm4                                \n\t"
1070
                "pmullw %%mm2, %%mm5                                \n\t"
1071
                "movq "MANGLE(w20)", %%mm2                        \n\t" // 32
1072
                "paddw %%mm2, %%mm4                                \n\t"
1073
                "paddw %%mm2, %%mm5                                \n\t"
1074
                "psrlw $6, %%mm4                                \n\t"
1075
                "psrlw $6, %%mm5                                \n\t"
1076

    
1077
                "movq 16(%%ecx), %%mm0                                \n\t" // L3 - L4
1078
                "movq 24(%%ecx), %%mm1                                \n\t" // H3 - H4
1079

    
1080
                "pxor %%mm2, %%mm2                                \n\t"
1081
                "pxor %%mm3, %%mm3                                \n\t"
1082

    
1083
                "pcmpgtw %%mm0, %%mm2                                \n\t" // sign (L3-L4)
1084
                "pcmpgtw %%mm1, %%mm3                                \n\t" // sign (H3-H4)
1085
                "pxor %%mm2, %%mm0                                \n\t"
1086
                "pxor %%mm3, %%mm1                                \n\t"
1087
                "psubw %%mm2, %%mm0                                \n\t" // |L3-L4|
1088
                "psubw %%mm3, %%mm1                                \n\t" // |H3-H4|
1089
                "psrlw $1, %%mm0                                \n\t" // |L3 - L4|/2
1090
                "psrlw $1, %%mm1                                \n\t" // |H3 - H4|/2
1091

    
1092
                "pxor %%mm6, %%mm2                                \n\t"
1093
                "pxor %%mm7, %%mm3                                \n\t"
1094
                "pand %%mm2, %%mm4                                \n\t"
1095
                "pand %%mm3, %%mm5                                \n\t"
1096

    
1097
#ifdef HAVE_MMX2
1098
                "pminsw %%mm0, %%mm4                                \n\t"
1099
                "pminsw %%mm1, %%mm5                                \n\t"
1100
#else
1101
                "movq %%mm4, %%mm2                                \n\t"
1102
                "psubusw %%mm0, %%mm2                                \n\t"
1103
                "psubw %%mm2, %%mm4                                \n\t"
1104
                "movq %%mm5, %%mm2                                \n\t"
1105
                "psubusw %%mm1, %%mm2                                \n\t"
1106
                "psubw %%mm2, %%mm5                                \n\t"
1107
#endif
1108
                "pxor %%mm6, %%mm4                                \n\t"
1109
                "pxor %%mm7, %%mm5                                \n\t"
1110
                "psubw %%mm6, %%mm4                                \n\t"
1111
                "psubw %%mm7, %%mm5                                \n\t"
1112
                "packsswb %%mm5, %%mm4                                \n\t"
1113
                "movq (%%eax, %1, 2), %%mm0                        \n\t"
1114
                "paddb   %%mm4, %%mm0                                \n\t"
1115
                "movq %%mm0, (%%eax, %1, 2)                         \n\t"
1116
                "movq (%0, %1, 4), %%mm0                        \n\t"
1117
                "psubb %%mm4, %%mm0                                \n\t"
1118
                "movq %%mm0, (%0, %1, 4)                         \n\t"
1119

    
1120
                :
1121
                : "r" (src), "r" (stride), "m" (c->pQPb)
1122
                : "%eax", "%edx", "%ecx"
1123
        );
1124
#else
1125
        const int l1= stride;
1126
        const int l2= stride + l1;
1127
        const int l3= stride + l2;
1128
        const int l4= stride + l3;
1129
        const int l5= stride + l4;
1130
        const int l6= stride + l5;
1131
        const int l7= stride + l6;
1132
        const int l8= stride + l7;
1133
//        const int l9= stride + l8;
1134
        int x;
1135
        src+= stride*3;
1136
        for(x=0; x<BLOCK_SIZE; x++)
1137
        {
1138
                const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1139
                if(ABS(middleEnergy) < 8*c->QP)
1140
                {
1141
                        const int q=(src[l4] - src[l5])/2;
1142
                        const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1143
                        const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1144

    
1145
                        int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1146
                        d= MAX(d, 0);
1147

    
1148
                        d= (5*d + 32) >> 6;
1149
                        d*= SIGN(-middleEnergy);
1150

    
1151
                        if(q>0)
1152
                        {
1153
                                d= d<0 ? 0 : d;
1154
                                d= d>q ? q : d;
1155
                        }
1156
                        else
1157
                        {
1158
                                d= d>0 ? 0 : d;
1159
                                d= d<q ? q : d;
1160
                        }
1161

    
1162
                        src[l4]-= d;
1163
                        src[l5]+= d;
1164
                }
1165
                src++;
1166
        }
1167
#endif
1168
}
1169

    
1170
/**
 * Dering filter for one block.
 *
 * C path (used when neither HAVE_MMX2 nor HAVE_3DNOW is defined):
 *  - finds min and max of the 8x8 pixels at rows 1..8, columns 1..8 of src
 *  - returns without touching the block if (max - min) < deringThreshold
 *  - otherwise, every pixel of that 8x8 area whose complete 3x3 neighbourhood
 *    lies on one side of avg=(min+max+1)/2 (all > avg, or all <= avg) is
 *    replaced by the 3x3 weighted mean (weights 1 2 1 / 2 4 2 / 1 2 1, /16),
 *    with the change limited to +-(c->QP/2 + 1) around the old value.
 * Rows 0..9 and columns 0..9 relative to src are read.
 *
 * The MMX2/3DNow path is a hand written assembly version of the filter
 * (min/max search, threshold test, then DERING_CORE once per output row).
 *
 * @param src    pointer to the top left pixel of the area described above
 * @param stride distance in bytes between two vertically adjacent pixels
 * @param c      context; QP, pQPb are read, pQPb2 is written by the asm path
 */
static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        asm volatile(
                // mm6 = 0 (running max), mm7 = all ones (running min, and -1 in every word)
                "pxor %%mm6, %%mm6                      \n\t"
                "pcmpeqb %%mm7, %%mm7                   \n\t"
                // %3 = bytes of %2 halved and incremented (low 4 bytes, duplicated)
                // NOTE(review): %3 (c->pQPb2) is stored to here although it is
                // declared as an input operand -- confirm / convert to an output
                "movq %2, %%mm0                         \n\t"
                "punpcklbw %%mm6, %%mm0                 \n\t"
                "psrlw $1, %%mm0                        \n\t"
                "psubw %%mm7, %%mm0                     \n\t"
                "packuswb %%mm0, %%mm0                  \n\t"
                "movq %%mm0, %3                         \n\t"

                "leal (%0, %1), %%eax                   \n\t"
                "leal (%%eax, %1, 4), %%edx             \n\t"

//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1

/* FIND_MIN_MAX(addr): fold the 8 bytes at addr into the per-column
   minimum (mm7) and maximum (mm6) */
#undef FIND_MIN_MAX
#ifdef HAVE_MMX2
#define FIND_MIN_MAX(addr)\
                "movq " #addr ", %%mm0                  \n\t"\
                "pminub %%mm0, %%mm7                    \n\t"\
                "pmaxub %%mm0, %%mm6                    \n\t"
#else
#define FIND_MIN_MAX(addr)\
                "movq " #addr ", %%mm0                  \n\t"\
                "movq %%mm7, %%mm1                      \n\t"\
                "psubusb %%mm0, %%mm6                   \n\t"\
                "paddb %%mm0, %%mm6                     \n\t"\
                "psubusb %%mm0, %%mm1                   \n\t"\
                "psubb %%mm1, %%mm7                     \n\t"
#endif

FIND_MIN_MAX((%%eax))
FIND_MIN_MAX((%%eax, %1))
FIND_MIN_MAX((%%eax, %1, 2))
FIND_MIN_MAX((%0, %1, 4))
FIND_MIN_MAX((%%edx))
FIND_MIN_MAX((%%edx, %1))
FIND_MIN_MAX((%%edx, %1, 2))
FIND_MIN_MAX((%0, %1, 8))

                // horizontal reduction of the per-column minima in mm7
                "movq %%mm7, %%mm4                      \n\t"
                "psrlq $8, %%mm7                        \n\t"
#ifdef HAVE_MMX2
                "pminub %%mm4, %%mm7                    \n\t" // min of pixels
                "pshufw $0xF9, %%mm7, %%mm4             \n\t"
                "pminub %%mm4, %%mm7                    \n\t" // min of pixels
                "pshufw $0xFE, %%mm7, %%mm4             \n\t"
                "pminub %%mm4, %%mm7                    \n\t"
#else
                "movq %%mm7, %%mm1                      \n\t"
                "psubusb %%mm4, %%mm1                   \n\t"
                "psubb %%mm1, %%mm7                     \n\t"
                "movq %%mm7, %%mm4                      \n\t"
                "psrlq $16, %%mm7                       \n\t"
                "movq %%mm7, %%mm1                      \n\t"
                "psubusb %%mm4, %%mm1                   \n\t"
                "psubb %%mm1, %%mm7                     \n\t"
                "movq %%mm7, %%mm4                      \n\t"
                "psrlq $32, %%mm7                       \n\t"
                "movq %%mm7, %%mm1                      \n\t"
                "psubusb %%mm4, %%mm1                   \n\t"
                "psubb %%mm1, %%mm7                     \n\t"
#endif

                // horizontal reduction of the per-column maxima in mm6
                "movq %%mm6, %%mm4                      \n\t"
                "psrlq $8, %%mm6                        \n\t"
#ifdef HAVE_MMX2
                "pmaxub %%mm4, %%mm6                    \n\t" // max of pixels
                "pshufw $0xF9, %%mm6, %%mm4             \n\t"
                "pmaxub %%mm4, %%mm6                    \n\t"
                "pshufw $0xFE, %%mm6, %%mm4             \n\t"
                "pmaxub %%mm4, %%mm6                    \n\t"
#else
                "psubusb %%mm4, %%mm6                   \n\t"
                "paddb %%mm4, %%mm6                     \n\t"
                "movq %%mm6, %%mm4                      \n\t"
                "psrlq $16, %%mm6                       \n\t"
                "psubusb %%mm4, %%mm6                   \n\t"
                "paddb %%mm4, %%mm6                     \n\t"
                "movq %%mm6, %%mm4                      \n\t"
                "psrlq $32, %%mm6                       \n\t"
                "psubusb %%mm4, %%mm6                   \n\t"
                "paddb %%mm4, %%mm6                     \n\t"
#endif
                "movq %%mm6, %%mm0                      \n\t" // max
                "psubb %%mm7, %%mm6                     \n\t" // max - min
                "movd %%mm6, %%ecx                      \n\t"
                // skip the whole block if (max - min) < deringThreshold
                "cmpb "MANGLE(deringThreshold)", %%cl   \n\t"
                " jb 1f                                 \n\t"
                // ecx = 8 byte aligned scratch area, (%ecx) = a, 8(%ecx) = temp
                // NOTE(review): this area lies below %esp and is not reserved
                // by the compiler -- confirm nothing can overwrite it
                "leal -24(%%esp), %%ecx                 \n\t"
                "andl $0xFFFFFFF8, %%ecx                \n\t"
                PAVGB(%%mm0, %%mm7)                           // a=(max + min)/2
                // replicate the low byte (a) into all 8 bytes
                "punpcklbw %%mm7, %%mm7                 \n\t"
                "punpcklbw %%mm7, %%mm7                 \n\t"
                "punpcklbw %%mm7, %%mm7                 \n\t"
                "movq %%mm7, (%%ecx)                    \n\t"

                "movq (%0), %%mm0                       \n\t" // L10
                "movq %%mm0, %%mm1                      \n\t" // L10
                "movq %%mm0, %%mm2                      \n\t" // L10
                "psllq $8, %%mm1                        \n\t"
                "psrlq $8, %%mm2                        \n\t"
                "movd -4(%0), %%mm3                     \n\t"
                "movd 8(%0), %%mm4                      \n\t"
                "psrlq $24, %%mm3                       \n\t"
                "psllq $56, %%mm4                       \n\t"
                "por %%mm3, %%mm1                       \n\t" // L00
                "por %%mm4, %%mm2                       \n\t" // L20
                "movq %%mm1, %%mm3                      \n\t" // L00
                PAVGB(%%mm2, %%mm1)                           // (L20 + L00)/2
                PAVGB(%%mm0, %%mm1)                           // (L20 + L00 + 2L10)/4
                "psubusb %%mm7, %%mm0                   \n\t"
                "psubusb %%mm7, %%mm2                   \n\t"
                "psubusb %%mm7, %%mm3                   \n\t"
                "pcmpeqb "MANGLE(b00)", %%mm0           \n\t" // L10 > a ? 0 : -1
                "pcmpeqb "MANGLE(b00)", %%mm2           \n\t" // L20 > a ? 0 : -1
                "pcmpeqb "MANGLE(b00)", %%mm3           \n\t" // L00 > a ? 0 : -1
                "paddb %%mm2, %%mm0                     \n\t"
                "paddb %%mm3, %%mm0                     \n\t"

                "movq (%%eax), %%mm2                    \n\t" // L11
                "movq %%mm2, %%mm3                      \n\t" // L11
                "movq %%mm2, %%mm4                      \n\t" // L11
                "psllq $8, %%mm3                        \n\t"
                "psrlq $8, %%mm4                        \n\t"
                "movd -4(%%eax), %%mm5                  \n\t"
                "movd 8(%%eax), %%mm6                   \n\t"
                "psrlq $24, %%mm5                       \n\t"
                "psllq $56, %%mm6                       \n\t"
                "por %%mm5, %%mm3                       \n\t" // L01
                "por %%mm6, %%mm4                       \n\t" // L21
                "movq %%mm3, %%mm5                      \n\t" // L01
                PAVGB(%%mm4, %%mm3)                           // (L21 + L01)/2
                PAVGB(%%mm2, %%mm3)                           // (L21 + L01 + 2L11)/4
                "psubusb %%mm7, %%mm2                   \n\t"
                "psubusb %%mm7, %%mm4                   \n\t"
                "psubusb %%mm7, %%mm5                   \n\t"
                "pcmpeqb "MANGLE(b00)", %%mm2           \n\t" // L11 > a ? 0 : -1
                "pcmpeqb "MANGLE(b00)", %%mm4           \n\t" // L21 > a ? 0 : -1
                "pcmpeqb "MANGLE(b00)", %%mm5           \n\t" // L01 > a ? 0 : -1
                "paddb %%mm4, %%mm2                     \n\t"
                "paddb %%mm5, %%mm2                     \n\t"
// 0, 2, 3, 1
/* DERING_CORE: loads row src, filters row dst (the row above src) and stores it.
   ppsx/psx/sx   : per pixel comparison sums of the rows dst-1 / dst / src
   pplx/plx/lx   : horizontally filtered rows dst-1 / dst / src
   t0/t1         : temporaries
   the registers are rotated by the callers from one row to the next */
#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
                "movq " #src ", " #sx "                 \n\t" /* src[0] */\
                "movq " #sx ", " #lx "                  \n\t" /* src[0] */\
                "movq " #sx ", " #t0 "                  \n\t" /* src[0] */\
                "psllq $8, " #lx "                      \n\t"\
                "psrlq $8, " #t0 "                      \n\t"\
                "movd -4" #src ", " #t1 "               \n\t"\
                "psrlq $24, " #t1 "                     \n\t"\
                "por " #t1 ", " #lx "                   \n\t" /* src[-1] */\
                "movd 8" #src ", " #t1 "                \n\t"\
                "psllq $56, " #t1 "                     \n\t"\
                "por " #t1 ", " #t0 "                   \n\t" /* src[+1] */\
                "movq " #lx ", " #t1 "                  \n\t" /* src[-1] */\
                PAVGB(t0, lx)                                 /* (src[-1] + src[+1])/2 */\
                PAVGB(sx, lx)                                 /* (src[-1] + 2src[0] + src[+1])/4 */\
                PAVGB(lx, pplx)                                     \
                "movq " #lx ", 8(%%ecx)                 \n\t"\
                "movq (%%ecx), " #lx "                  \n\t"\
                "psubusb " #lx ", " #t1 "               \n\t"\
                "psubusb " #lx ", " #t0 "               \n\t"\
                "psubusb " #lx ", " #sx "               \n\t"\
                "movq "MANGLE(b00)", " #lx "            \n\t"\
                "pcmpeqb " #lx ", " #t1 "               \n\t" /* src[-1] > a ? 0 : -1*/\
                "pcmpeqb " #lx ", " #t0 "               \n\t" /* src[+1] > a ? 0 : -1*/\
                "pcmpeqb " #lx ", " #sx "               \n\t" /* src[0]  > a ? 0 : -1*/\
                "paddb " #t1 ", " #t0 "                 \n\t"\
                "paddb " #t0 ", " #sx "                 \n\t"\
\
                PAVGB(plx, pplx)                              /* filtered */\
                "movq " #dst ", " #t0 "                 \n\t" /* dst */\
                "movq " #t0 ", " #t1 "                  \n\t" /* dst */\
                "psubusb %3, " #t0 "                    \n\t"\
                "paddusb %3, " #t1 "                    \n\t"\
                PMAXUB(t0, pplx)\
                PMINUB(t1, pplx, t0)\
                "paddb " #sx ", " #ppsx "               \n\t"\
                "paddb " #psx ", " #ppsx "              \n\t"\
                "#paddb "MANGLE(b02)", " #ppsx "        \n\t"\
                "pand "MANGLE(b08)", " #ppsx "          \n\t"\
                "pcmpeqb " #lx ", " #ppsx "             \n\t"\
                "pand " #ppsx ", " #pplx "              \n\t"\
                "pandn " #dst ", " #ppsx "              \n\t"\
                "por " #pplx ", " #ppsx "               \n\t"\
                "movq " #ppsx ", " #dst "               \n\t"\
                "movq 8(%%ecx), " #lx "                 \n\t"

/*
0000000
1111111

1111110
1111101
1111100
1111011
1111010
1111001

1111000
1110111

*/
//DERING_CORE(dst,src                  ,ppsx ,psx  ,sx   ,pplx ,plx  ,lx   ,t0   ,t1)
DERING_CORE((%%eax),(%%eax, %1)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
DERING_CORE((%0, %1, 4),(%%edx)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%%edx),(%%edx, %1)        ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
DERING_CORE((%%edx, %1), (%%edx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
DERING_CORE((%%edx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%0, %1, 8),(%%edx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)

                "1:                        \n\t"
                : : "r" (src), "r" (stride), "m" (c->pQPb), "m"(c->pQPb2)
                // "memory": the asm stores to the pixel rows, to %3 and to the
                // scratch area, none of which are listed as outputs
                : "%eax", "%edx", "%ecx", "memory"
        );
#else
        int y;
        int min=255;
        int max=0;
        int avg;
        uint8_t *p;
        int s[10];
        const int QP2= c->QP/2 + 1; // maximum change applied to a pixel

        // min / max of the 8x8 pixels at rows 1..8, columns 1..8
        for(y=1; y<9; y++)
        {
                int x;
                p= src + stride*y;
                for(x=1; x<9; x++)
                {
                        p++;
                        if(*p > max) max= *p;
                        if(*p < min) min= *p;
                }
        }
        avg= (min + max + 1)>>1;

        // too little contrast -> leave the block untouched
        if(max - min <deringThreshold) return;

        // s[y]: bit x      set if pixels x-1, x, x+1 of row y are all >  avg
        //       bit x + 16 set if pixels x-1, x, x+1 of row y are all <= avg
        for(y=0; y<10; y++)
        {
                int t = 0;
                int x;

                for(x=0; x<10; x++)
                        if(src[stride*y + x] > avg) t|= 1<<x;

                t |= (~t)<<16;          // upper half = inverted comparison bits
                t &= (t<<1) & (t>>1);   // keep bits whose left & right neighbours agree
                s[y] = t;
        }

        // combine 3 rows: s[y-1] becomes the mask for row y, bit x is set if the
        // whole 3x3 neighbourhood of (x,y) is on one side of avg
        for(y=1; y<9; y++)
        {
                int t = s[y-1] & s[y] & s[y+1];
                t|= t>>16;
                s[y-1]= t;
        }

        for(y=1; y<9; y++)
        {
                int x;
                int t = s[y-1];

                p= src + stride*y;
                for(x=1; x<9; x++)
                {
                        p++;
                        if(t & (1<<x))
                        {
                                // 3x3 weighted mean, weights sum to 16
                                int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
                                      +2*(*(p     -1)) + 4*(*p         ) + 2*(*(p     +1))
                                      +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
                                f= (f + 8)>>4;

#ifdef DEBUG_DERING_THRESHOLD
                                asm volatile("emms\n\t":);
                                {
                                static long long numPixels=0;
                                if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
//                                if((max-min)<20 || (max-min)*QP<200)
//                                if((max-min)*QP < 500)
//                                if(max-min<QP/2)
                                if(max-min < 20)
                                {
                                        static int numSkiped=0;
                                        static int errorSum=0;
                                        static int worstQP=0;
                                        static int worstRange=0;
                                        static int worstDiff=0;
                                        int diff= (f - *p);
                                        int absDiff= ABS(diff);
                                        int error= diff*diff;

                                        // border pixels are neither counted nor filtered here
                                        if(x==1 || x==8 || y==1 || y==8) continue;

                                        numSkiped++;
                                        if(absDiff > worstDiff)
                                        {
                                                worstDiff= absDiff;
                                                worstQP= c->QP; // QP is only available through the context
                                                worstRange= max-min;
                                        }
                                        errorSum+= error;

                                        if(1024LL*1024LL*1024LL % numSkiped == 0)
                                        {
                                                printf( "sum:%1.3f, skip:%d, wQP:%d, "
                                                        "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
                                                        (float)errorSum/numSkiped, numSkiped, worstQP, worstRange,
                                                        worstDiff, (float)numSkiped/numPixels);
                                        }
                                }
                                }
#endif
                                // move the pixel towards f by at most QP2
                                if     (*p + QP2 < f) *p= *p + QP2;
                                else if(*p - QP2 > f) *p= *p - QP2;
                                else *p=f;
                        }
                }
        }
#ifdef DEBUG_DERING_THRESHOLD
        // visualisation: brighten the blocks with (max - min) < 20
        if(max-min < 20)
        {
                for(y=1; y<9; y++)
                {
                        int x;
                        p= src + stride*y;
                        for(x=1; x<9; x++)
                        {
                                p++;
                                *p = MIN(*p + 20, 255);
                        }
                }
//                src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
        }
#endif
#endif
}
1525

    
1526
/**
1527
 * Deinterlaces the given block
1528
 * will be called for every 8x8 block and can read & write from line 4-15
1529
 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1530
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1531
 */
1532
static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
1533
{
1534
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1535
        src+= 4*stride;
1536
        asm volatile(
1537
                "leal (%0, %1), %%eax                                \n\t"
1538
                "leal (%%eax, %1, 4), %%ecx                        \n\t"
1539
//        0        1        2        3        4        5        6        7        8        9
1540
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ecx        ecx+%1        ecx+2%1        %0+8%1        ecx+4%1
1541

    
1542
                "movq (%0), %%mm0                                \n\t"
1543
                "movq (%%eax, %1), %%mm1                        \n\t"
1544
                PAVGB(%%mm1, %%mm0)
1545
                "movq %%mm0, (%%eax)                                \n\t"
1546
                "movq (%0, %1, 4), %%mm0                        \n\t"
1547
                PAVGB(%%mm0, %%mm1)
1548
                "movq %%mm1, (%%eax, %1, 2)                        \n\t"
1549
                "movq (%%ecx, %1), %%mm1                        \n\t"
1550
                PAVGB(%%mm1, %%mm0)
1551
                "movq %%mm0, (%%ecx)                                \n\t"
1552
                "movq (%0, %1, 8), %%mm0                        \n\t"
1553
                PAVGB(%%mm0, %%mm1)
1554
                "movq %%mm1, (%%ecx, %1, 2)                        \n\t"
1555

    
1556
                : : "r" (src), "r" (stride)
1557
                : "%eax", "%ecx"
1558
        );
1559
#else
1560
        int x;
1561
        src+= 4*stride;
1562
        for(x=0; x<8; x++)
1563
        {
1564
                src[stride]   = (src[0]        + src[stride*2])>>1;
1565
                src[stride*3] = (src[stride*2] + src[stride*4])>>1;
1566
                src[stride*5] = (src[stride*4] + src[stride*6])>>1;
1567
                src[stride*7] = (src[stride*6] + src[stride*8])>>1;
1568
                src++;
1569
        }
1570
#endif
1571
}
1572

    
1573
/**
 * Deinterlaces the given block
 * will be called for every 8x8 block and can read & write from line 4-15
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 * this filter will read lines 3-15 and write 7-13
 * no clipping in C version
 */
1581
static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
1582
{
1583
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1584
        src+= stride*3;
1585
        asm volatile(
1586
                "leal (%0, %1), %%eax                                \n\t"
1587
                "leal (%%eax, %1, 4), %%edx                        \n\t"
1588
                "leal (%%edx, %1, 4), %%ecx                        \n\t"
1589
                "addl %1, %%ecx                                        \n\t"
1590
                "pxor %%mm7, %%mm7                                \n\t"
1591
//        0        1        2        3        4        5        6        7        8        9        10
1592
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1 ecx
1593

    
1594
#define DEINT_CUBIC(a,b,c,d,e)\
1595
                "movq " #a ", %%mm0                                \n\t"\
1596
                "movq " #b ", %%mm1                                \n\t"\
1597
                "movq " #d ", %%mm2                                \n\t"\
1598
                "movq " #e ", %%mm3                                \n\t"\
1599
                PAVGB(%%mm2, %%mm1)                                        /* (b+d) /2 */\
1600
                PAVGB(%%mm3, %%mm0)                                        /* a(a+e) /2 */\
1601
                "movq %%mm0, %%mm2                                \n\t"\
1602
                "punpcklbw %%mm7, %%mm0                                \n\t"\
1603
                "punpckhbw %%mm7, %%mm2                                \n\t"\
1604
                "movq %%mm1, %%mm3                                \n\t"\
1605
                "punpcklbw %%mm7, %%mm1                                \n\t"\
1606
                "punpckhbw %%mm7, %%mm3                                \n\t"\
1607
                "psubw %%mm1, %%mm0                                \n\t"        /* L(a+e - (b+d))/2 */\
1608
                "psubw %%mm3, %%mm2                                \n\t"        /* H(a+e - (b+d))/2 */\
1609
                "psraw $3, %%mm0                                \n\t"        /* L(a+e - (b+d))/16 */\
1610
                "psraw $3, %%mm2                                \n\t"        /* H(a+e - (b+d))/16 */\
1611
                "psubw %%mm0, %%mm1                                \n\t"        /* L(9b + 9d - a - e)/16 */\
1612
                "psubw %%mm2, %%mm3                                \n\t"        /* H(9b + 9d - a - e)/16 */\
1613
                "packuswb %%mm3, %%mm1                                \n\t"\
1614
                "movq %%mm1, " #c "                                \n\t"
1615

    
1616
DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx, %1))
1617
DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%edx), (%%edx, %1), (%0, %1, 8))
1618
DEINT_CUBIC((%0, %1, 4), (%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%ecx))
1619
DEINT_CUBIC((%%edx, %1), (%0, %1, 8), (%%edx, %1, 4), (%%ecx), (%%ecx, %1, 2))
1620

    
1621
                : : "r" (src), "r" (stride)
1622
                : "%eax", "%edx", "ecx"
1623
        );
1624
#else
1625
        int x;
1626
        src+= stride*3;
1627
        for(x=0; x<8; x++)
1628
        {
1629
                src[stride*3] = (-src[0]        + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4;
1630
                src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4;
1631
                src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4;
1632
                src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4;
1633
                src++;
1634
        }
1635
#endif
1636
}
1637

    
1638
/**
1639
 * Deinterlaces the given block
1640
 * will be called for every 8x8 block and can read & write from line 4-15
1641
 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1642
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1643
 * this filter will read lines 4-13 and write 5-11
1644
 * no cliping in C version
1645
 */
1646
static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
1647
{
1648
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1649
        src+= stride*4;
1650
        asm volatile(
1651
                "leal (%0, %1), %%eax                                \n\t"
1652
                "leal (%%eax, %1, 4), %%edx                        \n\t"
1653
                "pxor %%mm7, %%mm7                                \n\t"
1654
                "movq (%2), %%mm0                                \n\t"
1655
//        0        1        2        3        4        5        6        7        8        9        10
1656
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1 ecx
1657

    
1658
#define DEINT_FF(a,b,c,d)\
1659
                "movq " #a ", %%mm1                                \n\t"\
1660
                "movq " #b ", %%mm2                                \n\t"\
1661
                "movq " #c ", %%mm3                                \n\t"\
1662
                "movq " #d ", %%mm4                                \n\t"\
1663
                PAVGB(%%mm3, %%mm1)                                        \
1664
                PAVGB(%%mm4, %%mm0)                                        \
1665
                "movq %%mm0, %%mm3                                \n\t"\
1666
                "punpcklbw %%mm7, %%mm0                                \n\t"\
1667
                "punpckhbw %%mm7, %%mm3                                \n\t"\
1668
                "movq %%mm1, %%mm4                                \n\t"\
1669
                "punpcklbw %%mm7, %%mm1                                \n\t"\
1670
                "punpckhbw %%mm7, %%mm4                                \n\t"\
1671
                "psllw $2, %%mm1                                \n\t"\
1672
                "psllw $2, %%mm4                                \n\t"\
1673
                "psubw %%mm0, %%mm1                                \n\t"\
1674
                "psubw %%mm3, %%mm4                                \n\t"\
1675
                "movq %%mm2, %%mm5                                \n\t"\
1676
                "movq %%mm2, %%mm0                                \n\t"\
1677
                "punpcklbw %%mm7, %%mm2                                \n\t"\
1678
                "punpckhbw %%mm7, %%mm5                                \n\t"\
1679
                "paddw %%mm2, %%mm1                                \n\t"\
1680
                "paddw %%mm5, %%mm4                                \n\t"\
1681
                "psraw $2, %%mm1                                \n\t"\
1682
                "psraw $2, %%mm4                                \n\t"\
1683
                "packuswb %%mm4, %%mm1                                \n\t"\
1684
                "movq %%mm1, " #b "                                \n\t"\
1685

    
1686
DEINT_FF((%0)       , (%%eax)       , (%%eax, %1), (%%eax, %1, 2))
1687
DEINT_FF((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx)       )
1688
DEINT_FF((%0, %1, 4), (%%edx)       , (%%edx, %1), (%%edx, %1, 2))
1689
DEINT_FF((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%edx, %1, 4))
1690

    
1691
                "movq %%mm0, (%2)                                \n\t"
1692
                : : "r" (src), "r" (stride), "r"(tmp)
1693
                : "%eax", "%edx"
1694
        );
1695
#else
1696
        int x;
1697
        src+= stride*4;
1698
        for(x=0; x<8; x++)
1699
        {
1700
                int t1= tmp[x];
1701
                int t2= src[stride*1];
1702

    
1703
                src[stride*1]= (-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3;
1704
                t1= src[stride*4];
1705
                src[stride*3]= (-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3;
1706
                t2= src[stride*6];
1707
                src[stride*5]= (-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3;
1708
                t1= src[stride*8];
1709
                src[stride*7]= (-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3;
1710
                tmp[x]= t1;
1711

    
1712
                src++;
1713
        }
1714
#endif
1715
}
1716

    
1717
/**
1718
 * Deinterlaces the given block
1719
 * will be called for every 8x8 block and can read & write from line 4-15
1720
 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1721
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1722
 * will shift the image up by 1 line (FIXME if this is a problem)
1723
 * this filter will read lines 4-13 and write 4-11
1724
 */
1725
static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride)
1726
{
1727
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1728
        src+= 4*stride;
1729
        asm volatile(
1730
                "leal (%0, %1), %%eax                                \n\t"
1731
                "leal (%%eax, %1, 4), %%edx                        \n\t"
1732
//        0        1        2        3        4        5        6        7        8        9
1733
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1
1734

    
1735
                "movq (%0), %%mm0                                \n\t" // L0
1736
                "movq (%%eax, %1), %%mm1                        \n\t" // L2
1737
                PAVGB(%%mm1, %%mm0)                                      // L0+L2
1738
                "movq (%%eax), %%mm2                                \n\t" // L1
1739
                PAVGB(%%mm2, %%mm0)
1740
                "movq %%mm0, (%0)                                \n\t"
1741
                "movq (%%eax, %1, 2), %%mm0                        \n\t" // L3
1742
                PAVGB(%%mm0, %%mm2)                                      // L1+L3
1743
                PAVGB(%%mm1, %%mm2)                                      // 2L2 + L1 + L3
1744
                "movq %%mm2, (%%eax)                                \n\t"
1745
                "movq (%0, %1, 4), %%mm2                        \n\t" // L4
1746
                PAVGB(%%mm2, %%mm1)                                      // L2+L4
1747
                PAVGB(%%mm0, %%mm1)                                      // 2L3 + L2 + L4
1748
                "movq %%mm1, (%%eax, %1)                        \n\t"
1749
                "movq (%%edx), %%mm1                                \n\t" // L5
1750
                PAVGB(%%mm1, %%mm0)                                      // L3+L5
1751
                PAVGB(%%mm2, %%mm0)                                      // 2L4 + L3 + L5
1752
                "movq %%mm0, (%%eax, %1, 2)                        \n\t"
1753
                "movq (%%edx, %1), %%mm0                        \n\t" // L6
1754
                PAVGB(%%mm0, %%mm2)                                      // L4+L6
1755
                PAVGB(%%mm1, %%mm2)                                      // 2L5 + L4 + L6
1756
                "movq %%mm2, (%0, %1, 4)                        \n\t"
1757
                "movq (%%edx, %1, 2), %%mm2                        \n\t" // L7
1758
                PAVGB(%%mm2, %%mm1)                                      // L5+L7
1759
                PAVGB(%%mm0, %%mm1)                                      // 2L6 + L5 + L7
1760
                "movq %%mm1, (%%edx)                                \n\t"
1761
                "movq (%0, %1, 8), %%mm1                        \n\t" // L8
1762
                PAVGB(%%mm1, %%mm0)                                      // L6+L8
1763
                PAVGB(%%mm2, %%mm0)                                      // 2L7 + L6 + L8
1764
                "movq %%mm0, (%%edx, %1)                        \n\t"
1765
                "movq (%%edx, %1, 4), %%mm0                        \n\t" // L9
1766
                PAVGB(%%mm0, %%mm2)                                      // L7+L9
1767
                PAVGB(%%mm1, %%mm2)                                      // 2L8 + L7 + L9
1768
                "movq %%mm2, (%%edx, %1, 2)                        \n\t"
1769

    
1770

    
1771
                : : "r" (src), "r" (stride)
1772
                : "%eax", "%edx"
1773
        );
1774
#else
1775
        int x;
1776
        src+= 4*stride;
1777
        for(x=0; x<8; x++)
1778
        {
1779
                src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2;
1780
                src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
1781
                src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
1782
                src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
1783
                src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
1784
                src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
1785
                src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
1786
                src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
1787
                src++;
1788
        }
1789
#endif
1790
}
1791

    
1792
/**
1793
 * Deinterlaces the given block
1794
 * will be called for every 8x8 block and can read & write from line 4-15,
1795
 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1796
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1797
 */
1798
static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
1799
{
1800
#ifdef HAVE_MMX
1801
        src+= 4*stride;
1802
#ifdef HAVE_MMX2
1803
        asm volatile(
1804
                "leal (%0, %1), %%eax                                \n\t"
1805
                "leal (%%eax, %1, 4), %%edx                        \n\t"
1806
//        0        1        2        3        4        5        6        7        8        9
1807
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1
1808

    
1809
                "movq (%0), %%mm0                                \n\t" //
1810
                "movq (%%eax, %1), %%mm2                        \n\t" //
1811
                "movq (%%eax), %%mm1                                \n\t" //
1812
                "movq %%mm0, %%mm3                                \n\t"
1813
                "pmaxub %%mm1, %%mm0                                \n\t" //
1814
                "pminub %%mm3, %%mm1                                \n\t" //
1815
                "pmaxub %%mm2, %%mm1                                \n\t" //
1816
                "pminub %%mm1, %%mm0                                \n\t"
1817
                "movq %%mm0, (%%eax)                                \n\t"
1818

    
1819
                "movq (%0, %1, 4), %%mm0                        \n\t" //
1820
                "movq (%%eax, %1, 2), %%mm1                        \n\t" //
1821
                "movq %%mm2, %%mm3                                \n\t"
1822
                "pmaxub %%mm1, %%mm2                                \n\t" //
1823
                "pminub %%mm3, %%mm1                                \n\t" //
1824
                "pmaxub %%mm0, %%mm1                                \n\t" //
1825
                "pminub %%mm1, %%mm2                                \n\t"
1826
                "movq %%mm2, (%%eax, %1, 2)                        \n\t"
1827

    
1828
                "movq (%%edx), %%mm2                                \n\t" //
1829
                "movq (%%edx, %1), %%mm1                        \n\t" //
1830
                "movq %%mm2, %%mm3                                \n\t"
1831
                "pmaxub %%mm0, %%mm2                                \n\t" //
1832
                "pminub %%mm3, %%mm0                                \n\t" //
1833
                "pmaxub %%mm1, %%mm0                                \n\t" //
1834
                "pminub %%mm0, %%mm2                                \n\t"
1835
                "movq %%mm2, (%%edx)                                \n\t"
1836

    
1837
                "movq (%%edx, %1, 2), %%mm2                        \n\t" //
1838
                "movq (%0, %1, 8), %%mm0                        \n\t" //
1839
                "movq %%mm2, %%mm3                                \n\t"
1840
                "pmaxub %%mm0, %%mm2                                \n\t" //
1841
                "pminub %%mm3, %%mm0                                \n\t" //
1842
                "pmaxub %%mm1, %%mm0                                \n\t" //
1843
                "pminub %%mm0, %%mm2                                \n\t"
1844
                "movq %%mm2, (%%edx, %1, 2)                        \n\t"
1845

    
1846

    
1847
                : : "r" (src), "r" (stride)
1848
                : "%eax", "%edx"
1849
        );
1850

    
1851
#else // MMX without MMX2
1852
        asm volatile(
1853
                "leal (%0, %1), %%eax                                \n\t"
1854
                "leal (%%eax, %1, 4), %%edx                        \n\t"
1855
//        0        1        2        3        4        5        6        7        8        9
1856
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1
1857
                "pxor %%mm7, %%mm7                                \n\t"
1858

    
1859
#define MEDIAN(a,b,c)\
1860
                "movq " #a ", %%mm0                                \n\t"\
1861
                "movq " #b ", %%mm2                                \n\t"\
1862
                "movq " #c ", %%mm1                                \n\t"\
1863
                "movq %%mm0, %%mm3                                \n\t"\
1864
                "movq %%mm1, %%mm4                                \n\t"\
1865
                "movq %%mm2, %%mm5                                \n\t"\
1866
                "psubusb %%mm1, %%mm3                                \n\t"\
1867
                "psubusb %%mm2, %%mm4                                \n\t"\
1868
                "psubusb %%mm0, %%mm5                                \n\t"\
1869
                "pcmpeqb %%mm7, %%mm3                                \n\t"\
1870
                "pcmpeqb %%mm7, %%mm4                                \n\t"\
1871
                "pcmpeqb %%mm7, %%mm5                                \n\t"\
1872
                "movq %%mm3, %%mm6                                \n\t"\
1873
                "pxor %%mm4, %%mm3                                \n\t"\
1874
                "pxor %%mm5, %%mm4                                \n\t"\
1875
                "pxor %%mm6, %%mm5                                \n\t"\
1876
                "por %%mm3, %%mm1                                \n\t"\
1877
                "por %%mm4, %%mm2                                \n\t"\
1878
                "por %%mm5, %%mm0                                \n\t"\
1879
                "pand %%mm2, %%mm0                                \n\t"\
1880
                "pand %%mm1, %%mm0                                \n\t"\
1881
                "movq %%mm0, " #b "                                \n\t"
1882

    
1883
MEDIAN((%0), (%%eax), (%%eax, %1))
1884
MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
1885
MEDIAN((%0, %1, 4), (%%edx), (%%edx, %1))
1886
MEDIAN((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8))
1887

    
1888
                : : "r" (src), "r" (stride)
1889
                : "%eax", "%edx"
1890
        );
1891
#endif // MMX
1892
#else
1893
        //FIXME
1894
        int x;
1895
        src+= 4*stride;
1896
        for(x=0; x<8; x++)
1897
        {
1898
                src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2;
1899
                src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
1900
                src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
1901
                src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
1902
                src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
1903
                src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
1904
                src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
1905
                src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
1906
                src++;
1907
        }
1908
#endif
1909
}
1910

    
1911
#ifdef HAVE_MMX
1912
/**
1913
 * transposes and shift the given 8x8 Block into dst1 and dst2
1914
 */
1915
static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
1916
{
1917
        asm(
1918
                "leal (%0, %1), %%eax                                \n\t"
1919
//        0        1        2        3        4        5        6        7        8        9
1920
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1
1921
                "movq (%0), %%mm0                \n\t" // 12345678
1922
                "movq (%%eax), %%mm1                \n\t" // abcdefgh
1923
                "movq %%mm0, %%mm2                \n\t" // 12345678
1924
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
1925
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
1926

    
1927
                "movq (%%eax, %1), %%mm1        \n\t"
1928
                "movq (%%eax, %1, 2), %%mm3        \n\t"
1929
                "movq %%mm1, %%mm4                \n\t"
1930
                "punpcklbw %%mm3, %%mm1                \n\t"
1931
                "punpckhbw %%mm3, %%mm4                \n\t"
1932

    
1933
                "movq %%mm0, %%mm3                \n\t"
1934
                "punpcklwd %%mm1, %%mm0                \n\t"
1935
                "punpckhwd %%mm1, %%mm3                \n\t"
1936
                "movq %%mm2, %%mm1                \n\t"
1937
                "punpcklwd %%mm4, %%mm2                \n\t"
1938
                "punpckhwd %%mm4, %%mm1                \n\t"
1939

    
1940
                "movd %%mm0, 128(%2)                \n\t"
1941
                "psrlq $32, %%mm0                \n\t"
1942
                "movd %%mm0, 144(%2)                \n\t"
1943
                "movd %%mm3, 160(%2)                \n\t"
1944
                "psrlq $32, %%mm3                \n\t"
1945
                "movd %%mm3, 176(%2)                \n\t"
1946
                "movd %%mm3, 48(%3)                \n\t"
1947
                "movd %%mm2, 192(%2)                \n\t"
1948
                "movd %%mm2, 64(%3)                \n\t"
1949
                "psrlq $32, %%mm2                \n\t"
1950
                "movd %%mm2, 80(%3)                \n\t"
1951
                "movd %%mm1, 96(%3)                \n\t"
1952
                "psrlq $32, %%mm1                \n\t"
1953
                "movd %%mm1, 112(%3)                \n\t"
1954

    
1955
                "leal (%%eax, %1, 4), %%eax        \n\t"
1956
                
1957
                "movq (%0, %1, 4), %%mm0        \n\t" // 12345678
1958
                "movq (%%eax), %%mm1                \n\t" // abcdefgh
1959
                "movq %%mm0, %%mm2                \n\t" // 12345678
1960
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
1961
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
1962

    
1963
                "movq (%%eax, %1), %%mm1        \n\t"
1964
                "movq (%%eax, %1, 2), %%mm3        \n\t"
1965
                "movq %%mm1, %%mm4                \n\t"
1966
                "punpcklbw %%mm3, %%mm1                \n\t"
1967
                "punpckhbw %%mm3, %%mm4                \n\t"
1968

    
1969
                "movq %%mm0, %%mm3                \n\t"
1970
                "punpcklwd %%mm1, %%mm0                \n\t"
1971
                "punpckhwd %%mm1, %%mm3                \n\t"
1972
                "movq %%mm2, %%mm1                \n\t"
1973
                "punpcklwd %%mm4, %%mm2                \n\t"
1974
                "punpckhwd %%mm4, %%mm1                \n\t"
1975

    
1976
                "movd %%mm0, 132(%2)                \n\t"
1977
                "psrlq $32, %%mm0                \n\t"
1978
                "movd %%mm0, 148(%2)                \n\t"
1979
                "movd %%mm3, 164(%2)                \n\t"
1980
                "psrlq $32, %%mm3                \n\t"
1981
                "movd %%mm3, 180(%2)                \n\t"
1982
                "movd %%mm3, 52(%3)                \n\t"
1983
                "movd %%mm2, 196(%2)                \n\t"
1984
                "movd %%mm2, 68(%3)                \n\t"
1985
                "psrlq $32, %%mm2                \n\t"
1986
                "movd %%mm2, 84(%3)                \n\t"
1987
                "movd %%mm1, 100(%3)                \n\t"
1988
                "psrlq $32, %%mm1                \n\t"
1989
                "movd %%mm1, 116(%3)                \n\t"
1990

    
1991

    
1992
        :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
1993
        : "%eax"
1994
        );
1995
}
1996

    
1997
/**
1998
 * transposes the given 8x8 block
1999
 */
2000
static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
2001
{
2002
        asm(
2003
                "leal (%0, %1), %%eax                                \n\t"
2004
                "leal (%%eax, %1, 4), %%edx                        \n\t"
2005
//        0        1        2        3        4        5        6        7        8        9
2006
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1
2007
                "movq (%2), %%mm0                \n\t" // 12345678
2008
                "movq 16(%2), %%mm1                \n\t" // abcdefgh
2009
                "movq %%mm0, %%mm2                \n\t" // 12345678
2010
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
2011
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
2012

    
2013
                "movq 32(%2), %%mm1                \n\t"
2014
                "movq 48(%2), %%mm3                \n\t"
2015
                "movq %%mm1, %%mm4                \n\t"
2016
                "punpcklbw %%mm3, %%mm1                \n\t"
2017
                "punpckhbw %%mm3, %%mm4                \n\t"
2018

    
2019
                "movq %%mm0, %%mm3                \n\t"
2020
                "punpcklwd %%mm1, %%mm0                \n\t"
2021
                "punpckhwd %%mm1, %%mm3                \n\t"
2022
                "movq %%mm2, %%mm1                \n\t"
2023
                "punpcklwd %%mm4, %%mm2                \n\t"
2024
                "punpckhwd %%mm4, %%mm1                \n\t"
2025

    
2026
                "movd %%mm0, (%0)                \n\t"
2027
                "psrlq $32, %%mm0                \n\t"
2028
                "movd %%mm0, (%%eax)                \n\t"
2029
                "movd %%mm3, (%%eax, %1)        \n\t"
2030
                "psrlq $32, %%mm3                \n\t"
2031
                "movd %%mm3, (%%eax, %1, 2)        \n\t"
2032
                "movd %%mm2, (%0, %1, 4)        \n\t"
2033
                "psrlq $32, %%mm2                \n\t"
2034
                "movd %%mm2, (%%edx)                \n\t"
2035
                "movd %%mm1, (%%edx, %1)        \n\t"
2036
                "psrlq $32, %%mm1                \n\t"
2037
                "movd %%mm1, (%%edx, %1, 2)        \n\t"
2038

    
2039

    
2040
                "movq 64(%2), %%mm0                \n\t" // 12345678
2041
                "movq 80(%2), %%mm1                \n\t" // abcdefgh
2042
                "movq %%mm0, %%mm2                \n\t" // 12345678
2043
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
2044
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
2045

    
2046
                "movq 96(%2), %%mm1                \n\t"
2047
                "movq 112(%2), %%mm3                \n\t"
2048
                "movq %%mm1, %%mm4                \n\t"
2049
                "punpcklbw %%mm3, %%mm1                \n\t"
2050
                "punpckhbw %%mm3, %%mm4                \n\t"
2051

    
2052
                "movq %%mm0, %%mm3                \n\t"
2053
                "punpcklwd %%mm1, %%mm0                \n\t"
2054
                "punpckhwd %%mm1, %%mm3                \n\t"
2055
                "movq %%mm2, %%mm1                \n\t"
2056
                "punpcklwd %%mm4, %%mm2                \n\t"
2057
                "punpckhwd %%mm4, %%mm1                \n\t"
2058

    
2059
                "movd %%mm0, 4(%0)                \n\t"
2060
                "psrlq $32, %%mm0                \n\t"
2061
                "movd %%mm0, 4(%%eax)                \n\t"
2062
                "movd %%mm3, 4(%%eax, %1)        \n\t"
2063
                "psrlq $32, %%mm3                \n\t"
2064
                "movd %%mm3, 4(%%eax, %1, 2)        \n\t"
2065
                "movd %%mm2, 4(%0, %1, 4)        \n\t"
2066
                "psrlq $32, %%mm2                \n\t"
2067
                "movd %%mm2, 4(%%edx)                \n\t"
2068
                "movd %%mm1, 4(%%edx, %1)        \n\t"
2069
                "psrlq $32, %%mm1                \n\t"
2070
                "movd %%mm1, 4(%%edx, %1, 2)        \n\t"
2071

    
2072
        :: "r" (dst), "r" (dstStride), "r" (src)
2073
        : "%eax", "%edx"
2074
        );
2075
}
2076
#endif
2077
//static int test=0;
2078

    
2079
static void inline RENAME(tempNoiseReducer)(uint8_t *src, int stride,
2080
                                    uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
2081
{
2082
        // to save a register (FIXME do this outside of the loops)
2083
        tempBluredPast[127]= maxNoise[0];
2084
        tempBluredPast[128]= maxNoise[1];
2085
        tempBluredPast[129]= maxNoise[2];
2086
        
2087
#define FAST_L2_DIFF
2088
//#define L1_DIFF //u should change the thresholds too if u try that one
2089
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2090
        asm volatile(
2091
                "leal (%2, %2, 2), %%eax                        \n\t" // 3*stride
2092
                "leal (%2, %2, 4), %%edx                        \n\t" // 5*stride
2093
                "leal (%%edx, %2, 2), %%ecx                        \n\t" // 7*stride
2094
//        0        1        2        3        4        5        6        7        8        9
2095
//        %x        %x+%2        %x+2%2        %x+eax        %x+4%2        %x+edx        %x+2eax        %x+ecx        %x+8%2
2096
//FIXME reorder?
2097
#ifdef L1_DIFF //needs mmx2
2098
                "movq (%0), %%mm0                                \n\t" // L0
2099
                "psadbw (%1), %%mm0                                \n\t" // |L0-R0|
2100
                "movq (%0, %2), %%mm1                                \n\t" // L1
2101
                "psadbw (%1, %2), %%mm1                                \n\t" // |L1-R1|
2102
                "movq (%0, %2, 2), %%mm2                        \n\t" // L2
2103
                "psadbw (%1, %2, 2), %%mm2                        \n\t" // |L2-R2|
2104
                "movq (%0, %%eax), %%mm3                        \n\t" // L3
2105
                "psadbw (%1, %%eax), %%mm3                        \n\t" // |L3-R3|
2106

    
2107
                "movq (%0, %2, 4), %%mm4                        \n\t" // L4
2108
                "paddw %%mm1, %%mm0                                \n\t"
2109
                "psadbw (%1, %2, 4), %%mm4                        \n\t" // |L4-R4|
2110
                "movq (%0, %%edx), %%mm5                        \n\t" // L5
2111
                "paddw %%mm2, %%mm0                                \n\t"
2112
                "psadbw (%1, %%edx), %%mm5                        \n\t" // |L5-R5|
2113
                "movq (%0, %%eax, 2), %%mm6                        \n\t" // L6
2114
                "paddw %%mm3, %%mm0                                \n\t"
2115
                "psadbw (%1, %%eax, 2), %%mm6                        \n\t" // |L6-R6|
2116
                "movq (%0, %%ecx), %%mm7                        \n\t" // L7
2117
                "paddw %%mm4, %%mm0                                \n\t"
2118
                "psadbw (%1, %%ecx), %%mm7                        \n\t" // |L7-R7|
2119
                "paddw %%mm5, %%mm6                                \n\t"
2120
                "paddw %%mm7, %%mm6                                \n\t"
2121
                "paddw %%mm6, %%mm0                                \n\t"
2122
#elif defined (FAST_L2_DIFF)
2123
                "pcmpeqb %%mm7, %%mm7                                \n\t"
2124
                "movq "MANGLE(b80)", %%mm6                        \n\t"
2125
                "pxor %%mm0, %%mm0                                \n\t"
2126
#define L2_DIFF_CORE(a, b)\
2127
                "movq " #a ", %%mm5                                \n\t"\
2128
                "movq " #b ", %%mm2                                \n\t"\
2129
                "pxor %%mm7, %%mm2                                \n\t"\
2130
                PAVGB(%%mm2, %%mm5)\
2131
                "paddb %%mm6, %%mm5                                \n\t"\
2132
                "movq %%mm5, %%mm2                                \n\t"\
2133
                "psllw $8, %%mm5                                \n\t"\
2134
                "pmaddwd %%mm5, %%mm5                                \n\t"\
2135
                "pmaddwd %%mm2, %%mm2                                \n\t"\
2136
                "paddd %%mm2, %%mm5                                \n\t"\
2137
                "psrld $14, %%mm5                                \n\t"\
2138
                "paddd %%mm5, %%mm0                                \n\t"
2139

    
2140
L2_DIFF_CORE((%0), (%1))
2141
L2_DIFF_CORE((%0, %2), (%1, %2))
2142
L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
2143
L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
2144
L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
2145
L2_DIFF_CORE((%0, %%edx), (%1, %%edx))
2146
L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
2147
L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
2148

    
2149
#else
2150
                "pxor %%mm7, %%mm7                                \n\t"
2151
                "pxor %%mm0, %%mm0                                \n\t"
2152
#define L2_DIFF_CORE(a, b)\
2153
                "movq " #a ", %%mm5                                \n\t"\
2154
                "movq " #b ", %%mm2                                \n\t"\
2155
                "movq %%mm5, %%mm1                                \n\t"\
2156
                "movq %%mm2, %%mm3                                \n\t"\
2157
                "punpcklbw %%mm7, %%mm5                                \n\t"\
2158
                "punpckhbw %%mm7, %%mm1                                \n\t"\
2159
                "punpcklbw %%mm7, %%mm2                                \n\t"\
2160
                "punpckhbw %%mm7, %%mm3                                \n\t"\
2161
                "psubw %%mm2, %%mm5                                \n\t"\
2162
                "psubw %%mm3, %%mm1                                \n\t"\
2163
                "pmaddwd %%mm5, %%mm5                                \n\t"\
2164
                "pmaddwd %%mm1, %%mm1                                \n\t"\
2165
                "paddd %%mm1, %%mm5                                \n\t"\
2166
                "paddd %%mm5, %%mm0                                \n\t"
2167

    
2168
L2_DIFF_CORE((%0), (%1))
2169
L2_DIFF_CORE((%0, %2), (%1, %2))
2170
L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
2171
L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
2172
L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
2173
L2_DIFF_CORE((%0, %%edx), (%1, %%edx))
2174
L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
2175
L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
2176

    
2177
#endif
2178

    
2179
                "movq %%mm0, %%mm4                                \n\t"
2180
                "psrlq $32, %%mm0                                \n\t"
2181
                "paddd %%mm0, %%mm4                                \n\t"
2182
                "movd %%mm4, %%ecx                                \n\t"
2183
                "shll $2, %%ecx                                        \n\t"
2184
                "movl %3, %%edx                                        \n\t"
2185
                "addl -4(%%edx), %%ecx                                \n\t"
2186
                "addl 4(%%edx), %%ecx                                \n\t"
2187
                "addl -1024(%%edx), %%ecx                        \n\t"
2188
                "addl $4, %%ecx                                        \n\t"
2189
                "addl 1024(%%edx), %%ecx                        \n\t"
2190
                "shrl $3, %%ecx                                        \n\t"
2191
                "movl %%ecx, (%%edx)                                \n\t"
2192

    
2193
//                "movl %3, %%ecx                                        \n\t"
2194
//                "movl %%ecx, test                                \n\t"
2195
//                "jmp 4f \n\t"
2196
                "cmpl 512(%%edx), %%ecx                                \n\t"
2197
                " jb 2f                                                \n\t"
2198
                "cmpl 516(%%edx), %%ecx                                \n\t"
2199
                " jb 1f                                                \n\t"
2200

    
2201
                "leal (%%eax, %2, 2), %%edx                        \n\t" // 5*stride
2202
                "leal (%%edx, %2, 2), %%ecx                        \n\t" // 7*stride
2203
                "movq (%0), %%mm0                                \n\t" // L0
2204
                "movq (%0, %2), %%mm1                                \n\t" // L1
2205
                "movq (%0, %2, 2), %%mm2                        \n\t" // L2
2206
                "movq (%0, %%eax), %%mm3                        \n\t" // L3
2207
                "movq (%0, %2, 4), %%mm4                        \n\t" // L4
2208
                "movq (%0, %%edx), %%mm5                        \n\t" // L5
2209
                "movq (%0, %%eax, 2), %%mm6                        \n\t" // L6
2210
                "movq (%0, %%ecx), %%mm7                        \n\t" // L7
2211
                "movq %%mm0, (%1)                                \n\t" // L0
2212
                "movq %%mm1, (%1, %2)                                \n\t" // L1
2213
                "movq %%mm2, (%1, %2, 2)                        \n\t" // L2
2214
                "movq %%mm3, (%1, %%eax)                        \n\t" // L3
2215
                "movq %%mm4, (%1, %2, 4)                        \n\t" // L4
2216
                "movq %%mm5, (%1, %%edx)                        \n\t" // L5
2217
                "movq %%mm6, (%1, %%eax, 2)                        \n\t" // L6
2218
                "movq %%mm7, (%1, %%ecx)                        \n\t" // L7
2219
                "jmp 4f                                                \n\t"
2220

    
2221
                "1:                                                \n\t"
2222
                "leal (%%eax, %2, 2), %%edx                        \n\t" // 5*stride
2223
                "leal (%%edx, %2, 2), %%ecx                        \n\t" // 7*stride
2224
                "movq (%0), %%mm0                                \n\t" // L0
2225
                PAVGB((%1), %%mm0)                                      // L0
2226
                "movq (%0, %2), %%mm1                                \n\t" // L1
2227
                PAVGB((%1, %2), %%mm1)                                      // L1
2228
                "movq (%0, %2, 2), %%mm2                        \n\t" // L2
2229
                PAVGB((%1, %2, 2), %%mm2)                              // L2
2230
                "movq (%0, %%eax), %%mm3                        \n\t" // L3
2231
                PAVGB((%1, %%eax), %%mm3)                              // L3
2232
                "movq (%0, %2, 4), %%mm4                        \n\t" // L4
2233
                PAVGB((%1, %2, 4), %%mm4)                              // L4
2234
                "movq (%0, %%edx), %%mm5                        \n\t" // L5
2235
                PAVGB((%1, %%edx), %%mm5)                              // L5
2236
                "movq (%0, %%eax, 2), %%mm6                        \n\t" // L6
2237
                PAVGB((%1, %%eax, 2), %%mm6)                              // L6
2238
                "movq (%0, %%ecx), %%mm7                        \n\t" // L7
2239
                PAVGB((%1, %%ecx), %%mm7)                              // L7
2240
                "movq %%mm0, (%1)                                \n\t" // R0
2241
                "movq %%mm1, (%1, %2)                                \n\t" // R1
2242
                "movq %%mm2, (%1, %2, 2)                        \n\t" // R2
2243
                "movq %%mm3, (%1, %%eax)                        \n\t" // R3
2244
                "movq %%mm4, (%1, %2, 4)                        \n\t" // R4
2245
                "movq %%mm5, (%1, %%edx)                        \n\t" // R5
2246
                "movq %%mm6, (%1, %%eax, 2)                        \n\t" // R6
2247
                "movq %%mm7, (%1, %%ecx)                        \n\t" // R7
2248
                "movq %%mm0, (%0)                                \n\t" // L0
2249
                "movq %%mm1, (%0, %2)                                \n\t" // L1
2250
                "movq %%mm2, (%0, %2, 2)                        \n\t" // L2
2251
                "movq %%mm3, (%0, %%eax)                        \n\t" // L3
2252
                "movq %%mm4, (%0, %2, 4)                        \n\t" // L4
2253
                "movq %%mm5, (%0, %%edx)                        \n\t" // L5
2254
                "movq %%mm6, (%0, %%eax, 2)                        \n\t" // L6
2255
                "movq %%mm7, (%0, %%ecx)                        \n\t" // L7
2256
                "jmp 4f                                                \n\t"
2257

    
2258
                "2:                                                \n\t"
2259
                "cmpl 508(%%edx), %%ecx                                \n\t"
2260
                " jb 3f                                                \n\t"
2261

    
2262
                "leal (%%eax, %2, 2), %%edx                        \n\t" // 5*stride
2263
                "leal (%%edx, %2, 2), %%ecx                        \n\t" // 7*stride
2264
                "movq (%0), %%mm0                                \n\t" // L0
2265
                "movq (%0, %2), %%mm1                                \n\t" // L1
2266
                "movq (%0, %2, 2), %%mm2                        \n\t" // L2
2267
                "movq (%0, %%eax), %%mm3                        \n\t" // L3
2268
                "movq (%1), %%mm4                                \n\t" // R0
2269
                "movq (%1, %2), %%mm5                                \n\t" // R1
2270
                "movq (%1, %2, 2), %%mm6                        \n\t" // R2
2271
                "movq (%1, %%eax), %%mm7                        \n\t" // R3
2272
                PAVGB(%%mm4, %%mm0)
2273
                PAVGB(%%mm5, %%mm1)
2274
                PAVGB(%%mm6, %%mm2)
2275
                PAVGB(%%mm7, %%mm3)
2276
                PAVGB(%%mm4, %%mm0)
2277
                PAVGB(%%mm5, %%mm1)
2278
                PAVGB(%%mm6, %%mm2)
2279
                PAVGB(%%mm7, %%mm3)
2280
                "movq %%mm0, (%1)                                \n\t" // R0
2281
                "movq %%mm1, (%1, %2)                                \n\t" // R1
2282
                "movq %%mm2, (%1, %2, 2)                        \n\t" // R2
2283
                "movq %%mm3, (%1, %%eax)                        \n\t" // R3
2284
                "movq %%mm0, (%0)                                \n\t" // L0
2285
                "movq %%mm1, (%0, %2)                                \n\t" // L1
2286
                "movq %%mm2, (%0, %2, 2)                        \n\t" // L2
2287
                "movq %%mm3, (%0, %%eax)                        \n\t" // L3
2288

    
2289
                "movq (%0, %2, 4), %%mm0                        \n\t" // L4
2290
                "movq (%0, %%edx), %%mm1                        \n\t" // L5
2291
                "movq (%0, %%eax, 2), %%mm2                        \n\t" // L6
2292
                "movq (%0, %%ecx), %%mm3                        \n\t" // L7
2293
                "movq (%1, %2, 4), %%mm4                        \n\t" // R4
2294
                "movq (%1, %%edx), %%mm5                        \n\t" // R5
2295
                "movq (%1, %%eax, 2), %%mm6                        \n\t" // R6
2296
                "movq (%1, %%ecx), %%mm7                        \n\t" // R7
2297
                PAVGB(%%mm4, %%mm0)
2298
                PAVGB(%%mm5, %%mm1)
2299
                PAVGB(%%mm6, %%mm2)
2300
                PAVGB(%%mm7, %%mm3)
2301
                PAVGB(%%mm4, %%mm0)
2302
                PAVGB(%%mm5, %%mm1)
2303
                PAVGB(%%mm6, %%mm2)
2304
                PAVGB(%%mm7, %%mm3)
2305
                "movq %%mm0, (%1, %2, 4)                        \n\t" // R4
2306
                "movq %%mm1, (%1, %%edx)                        \n\t" // R5
2307
                "movq %%mm2, (%1, %%eax, 2)                        \n\t" // R6
2308
                "movq %%mm3, (%1, %%ecx)                        \n\t" // R7
2309
                "movq %%mm0, (%0, %2, 4)                        \n\t" // L4
2310
                "movq %%mm1, (%0, %%edx)                        \n\t" // L5
2311
                "movq %%mm2, (%0, %%eax, 2)                        \n\t" // L6
2312
                "movq %%mm3, (%0, %%ecx)                        \n\t" // L7
2313
                "jmp 4f                                                \n\t"
2314

    
2315
                "3:                                                \n\t"
2316
                "leal (%%eax, %2, 2), %%edx                        \n\t" // 5*stride
2317
                "leal (%%edx, %2, 2), %%ecx                        \n\t" // 7*stride
2318
                "movq (%0), %%mm0                                \n\t" // L0
2319
                "movq (%0, %2), %%mm1                                \n\t" // L1
2320
                "movq (%0, %2, 2), %%mm2                        \n\t" // L2
2321
                "movq (%0, %%eax), %%mm3                        \n\t" // L3
2322
                "movq (%1), %%mm4                                \n\t" // R0
2323
                "movq (%1, %2), %%mm5                                \n\t" // R1
2324
                "movq (%1, %2, 2), %%mm6                        \n\t" // R2
2325
                "movq (%1, %%eax), %%mm7                        \n\t" // R3
2326
                PAVGB(%%mm4, %%mm0)
2327
                PAVGB(%%mm5, %%mm1)
2328
                PAVGB(%%mm6, %%mm2)
2329
                PAVGB(%%mm7, %%mm3)
2330
                PAVGB(%%mm4, %%mm0)
2331
                PAVGB(%%mm5, %%mm1)
2332
                PAVGB(%%mm6, %%mm2)
2333
                PAVGB(%%mm7, %%mm3)
2334
                PAVGB(%%mm4, %%mm0)
2335
                PAVGB(%%mm5, %%mm1)
2336
                PAVGB(%%mm6, %%mm2)
2337
                PAVGB(%%mm7, %%mm3)
2338
                "movq %%mm0, (%1)                                \n\t" // R0
2339
                "movq %%mm1, (%1, %2)                                \n\t" // R1
2340
                "movq %%mm2, (%1, %2, 2)                        \n\t" // R2
2341
                "movq %%mm3, (%1, %%eax)                        \n\t" // R3
2342
                "movq %%mm0, (%0)                                \n\t" // L0
2343
                "movq %%mm1, (%0, %2)                                \n\t" // L1
2344
                "movq %%mm2, (%0, %2, 2)                        \n\t" // L2
2345
                "movq %%mm3, (%0, %%eax)                        \n\t" // L3
2346

    
2347
                "movq (%0, %2, 4), %%mm0                        \n\t" // L4
2348
                "movq (%0, %%edx), %%mm1                        \n\t" // L5
2349
                "movq (%0, %%eax, 2), %%mm2                        \n\t" // L6
2350
                "movq (%0, %%ecx), %%mm3                        \n\t" // L7
2351
                "movq (%1, %2, 4), %%mm4                        \n\t" // R4
2352
                "movq (%1, %%edx), %%mm5                        \n\t" // R5
2353
                "movq (%1, %%eax, 2), %%mm6                        \n\t" // R6
2354
                "movq (%1, %%ecx), %%mm7                        \n\t" // R7
2355
                PAVGB(%%mm4, %%mm0)
2356
                PAVGB(%%mm5, %%mm1)
2357
                PAVGB(%%mm6, %%mm2)
2358
                PAVGB(%%mm7, %%mm3)
2359
                PAVGB(%%mm4, %%mm0)
2360
                PAVGB(%%mm5, %%mm1)
2361
                PAVGB(%%mm6, %%mm2)
2362
                PAVGB(%%mm7, %%mm3)
2363
                PAVGB(%%mm4, %%mm0)
2364
                PAVGB(%%mm5, %%mm1)
2365
                PAVGB(%%mm6, %%mm2)
2366
                PAVGB(%%mm7, %%mm3)
2367
                "movq %%mm0, (%1, %2, 4)                        \n\t" // R4
2368
                "movq %%mm1, (%1, %%edx)                        \n\t" // R5
2369
                "movq %%mm2, (%1, %%eax, 2)                        \n\t" // R6
2370
                "movq %%mm3, (%1, %%ecx)                        \n\t" // R7
2371
                "movq %%mm0, (%0, %2, 4)                        \n\t" // L4
2372
                "movq %%mm1, (%0, %%edx)                        \n\t" // L5
2373
                "movq %%mm2, (%0, %%eax, 2)                        \n\t" // L6
2374
                "movq %%mm3, (%0, %%ecx)                        \n\t" // L7
2375

    
2376
                "4:                                                \n\t"
2377

    
2378
                :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast)
2379
                : "%eax", "%edx", "%ecx", "memory"
2380
                );
2381
//printf("%d\n", test);
2382
#else
2383
{
2384
        int y;
2385
        int d=0;
2386
        int sysd=0;
2387
        int i;
2388

    
2389
        for(y=0; y<8; y++)
2390
        {
2391
                int x;
2392
                for(x=0; x<8; x++)
2393
                {
2394
                        int ref= tempBlured[ x + y*stride ];
2395
                        int cur= src[ x + y*stride ];
2396
                        int d1=ref - cur;
2397
//                        if(x==0 || x==7) d1+= d1>>1;
2398
//                        if(y==0 || y==7) d1+= d1>>1;
2399
//                        d+= ABS(d1);
2400
                        d+= d1*d1;
2401
                        sysd+= d1;
2402
                }
2403
        }
2404
        i=d;
2405
        d=         (
2406
                4*d
2407
                +(*(tempBluredPast-256))
2408
                +(*(tempBluredPast-1))+ (*(tempBluredPast+1))
2409
                +(*(tempBluredPast+256))
2410
                +4)>>3;
2411
        *tempBluredPast=i;
2412
//        ((*tempBluredPast)*3 + d + 2)>>2;
2413

    
2414
//printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]);
2415
/*
2416
Switch between
2417
 1  0  0  0  0  0  0  (0)
2418
64 32 16  8  4  2  1  (1)
2419
64 48 36 27 20 15 11 (33) (approx)
2420
64 56 49 43 37 33 29 (200) (approx)
2421
*/
2422
        if(d > maxNoise[1])
2423
        {
2424
                if(d < maxNoise[2])
2425
                {
2426
                        for(y=0; y<8; y++)
2427
                        {
2428
                                int x;
2429
                                for(x=0; x<8; x++)
2430
                                {
2431
                                        int ref= tempBlured[ x + y*stride ];
2432
                                        int cur= src[ x + y*stride ];
2433
                                        tempBlured[ x + y*stride ]=
2434
                                        src[ x + y*stride ]=
2435
                                                (ref + cur + 1)>>1;
2436
                                }
2437
                        }
2438
                }
2439
                else
2440
                {
2441
                        for(y=0; y<8; y++)
2442
                        {
2443
                                int x;
2444
                                for(x=0; x<8; x++)
2445
                                {
2446
                                        tempBlured[ x + y*stride ]= src[ x + y*stride ];
2447
                                }
2448
                        }
2449
                }
2450
        }
2451
        else
2452
        {
2453
                if(d < maxNoise[0])
2454
                {
2455
                        for(y=0; y<8; y++)
2456
                        {
2457
                                int x;
2458
                                for(x=0; x<8; x++)
2459
                                {
2460
                                        int ref= tempBlured[ x + y*stride ];
2461
                                        int cur= src[ x + y*stride ];
2462
                                        tempBlured[ x + y*stride ]=
2463
                                        src[ x + y*stride ]=
2464
                                                (ref*7 + cur + 4)>>3;
2465
                                }
2466
                        }
2467
                }
2468
                else
2469
                {
2470
                        for(y=0; y<8; y++)
2471
                        {
2472
                                int x;
2473
                                for(x=0; x<8; x++)
2474
                                {
2475
                                        int ref= tempBlured[ x + y*stride ];
2476
                                        int cur= src[ x + y*stride ];
2477
                                        tempBlured[ x + y*stride ]=
2478
                                        src[ x + y*stride ]=
2479
                                                (ref*3 + cur + 2)>>2;
2480
                                }
2481
                        }
2482
                }
2483
        }
2484
}
2485
#endif
2486
}
2487

    
2488
/* Forward declaration of the plane filter; the definition appears further down in this file (there the last parameter is named c2). */
static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2489
        QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);
2490

    
2491
/**
2492
 * Copies a block from src to dst and fixes the blacklevel
2493
 * levelFix == 0 -> dont touch the brighness & contrast
2494
 */
2495
#undef SCALED_CPY
2496

    
2497
static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
2498
        int levelFix, int64_t *packedOffsetAndScale)
2499
{
2500
#ifndef HAVE_MMX
2501
        int i;
2502
#endif
2503
        if(levelFix)
2504
        {
2505
#ifdef HAVE_MMX
2506
                                        asm volatile(
2507
                                                "movq (%%eax), %%mm2        \n\t" // packedYOffset
2508
                                                "movq 8(%%eax), %%mm3        \n\t" // packedYScale
2509
                                                "leal (%2,%4), %%eax        \n\t"
2510
                                                "leal (%3,%5), %%edx        \n\t"
2511
                                                "pxor %%mm4, %%mm4        \n\t"
2512
#ifdef HAVE_MMX2
2513
#define SCALED_CPY(src1, src2, dst1, dst2)                                        \
2514
                                                "movq " #src1 ", %%mm0        \n\t"\
2515
                                                "movq " #src1 ", %%mm5        \n\t"\
2516
                                                "movq " #src2 ", %%mm1        \n\t"\
2517
                                                "movq " #src2 ", %%mm6        \n\t"\
2518
                                                "punpcklbw %%mm0, %%mm0 \n\t"\
2519
                                                "punpckhbw %%mm5, %%mm5 \n\t"\
2520
                                                "punpcklbw %%mm1, %%mm1 \n\t"\
2521
                                                "punpckhbw %%mm6, %%mm6 \n\t"\
2522
                                                "pmulhuw %%mm3, %%mm0        \n\t"\
2523
                                                "pmulhuw %%mm3, %%mm5        \n\t"\
2524
                                                "pmulhuw %%mm3, %%mm1        \n\t"\
2525
                                                "pmulhuw %%mm3, %%mm6        \n\t"\
2526
                                                "psubw %%mm2, %%mm0        \n\t"\
2527
                                                "psubw %%mm2, %%mm5        \n\t"\
2528
                                                "psubw %%mm2, %%mm1        \n\t"\
2529
                                                "psubw %%mm2, %%mm6        \n\t"\
2530
                                                "packuswb %%mm5, %%mm0        \n\t"\
2531
                                                "packuswb %%mm6, %%mm1        \n\t"\
2532
                                                "movq %%mm0, " #dst1 "        \n\t"\
2533
                                                "movq %%mm1, " #dst2 "        \n\t"\
2534

    
2535
#else //HAVE_MMX2
2536
#define SCALED_CPY(src1, src2, dst1, dst2)                                        \
2537
                                                "movq " #src1 ", %%mm0        \n\t"\
2538
                                                "movq " #src1 ", %%mm5        \n\t"\
2539
                                                "punpcklbw %%mm4, %%mm0 \n\t"\
2540
                                                "punpckhbw %%mm4, %%mm5 \n\t"\
2541
                                                "psubw %%mm2, %%mm0        \n\t"\
2542
                                                "psubw %%mm2, %%mm5        \n\t"\
2543
                                                "movq " #src2 ", %%mm1        \n\t"\
2544
                                                "psllw $6, %%mm0        \n\t"\
2545
                                                "psllw $6, %%mm5        \n\t"\
2546
                                                "pmulhw %%mm3, %%mm0        \n\t"\
2547
                                                "movq " #src2 ", %%mm6        \n\t"\
2548
                                                "pmulhw %%mm3, %%mm5        \n\t"\
2549
                                                "punpcklbw %%mm4, %%mm1 \n\t"\
2550
                                                "punpckhbw %%mm4, %%mm6 \n\t"\
2551
                                                "psubw %%mm2, %%mm1        \n\t"\
2552
                                                "psubw %%mm2, %%mm6        \n\t"\
2553
                                                "psllw $6, %%mm1        \n\t"\
2554
                                                "psllw $6, %%mm6        \n\t"\
2555
                                                "pmulhw %%mm3, %%mm1        \n\t"\
2556
                                                "pmulhw %%mm3, %%mm6        \n\t"\
2557
                                                "packuswb %%mm5, %%mm0        \n\t"\
2558
                                                "packuswb %%mm6, %%mm1        \n\t"\
2559
                                                "movq %%mm0, " #dst1 "        \n\t"\
2560
                                                "movq %%mm1, " #dst2 "        \n\t"\
2561

    
2562
#endif //!HAVE_MMX2
2563

    
2564
SCALED_CPY((%2)       , (%2, %4)      , (%3)       , (%3, %5))
2565
SCALED_CPY((%2, %4, 2), (%%eax, %4, 2), (%3, %5, 2), (%%edx, %5, 2))
2566
SCALED_CPY((%2, %4, 4), (%%eax, %4, 4), (%3, %5, 4), (%%edx, %5, 4))
2567
                                                "leal (%%eax,%4,4), %%eax        \n\t"
2568
                                                "leal (%%edx,%5,4), %%edx        \n\t"
2569
SCALED_CPY((%%eax, %4), (%%eax, %4, 2), (%%edx, %5), (%%edx, %5, 2))
2570

    
2571

    
2572
                                                : "=&a" (packedOffsetAndScale)
2573
                                                : "0" (packedOffsetAndScale),
2574
                                                "r"(src),
2575
                                                "r"(dst),
2576
                                                "r" (srcStride),
2577
                                                "r" (dstStride)
2578
                                                : "%edx"
2579
                                        );
2580
#else
2581
                                for(i=0; i<8; i++)
2582
                                        memcpy(        &(dst[dstStride*i]),
2583
                                                &(src[srcStride*i]), BLOCK_SIZE);
2584
#endif
2585
        }
2586
        else
2587
        {
2588
#ifdef HAVE_MMX
2589
                                        asm volatile(
2590
                                                "leal (%0,%2), %%eax        \n\t"
2591
                                                "leal (%1,%3), %%edx        \n\t"
2592

    
2593
#define SIMPLE_CPY(src1, src2, dst1, dst2)                                \
2594
                                                "movq " #src1 ", %%mm0        \n\t"\
2595
                                                "movq " #src2 ", %%mm1        \n\t"\
2596
                                                "movq %%mm0, " #dst1 "        \n\t"\
2597
                                                "movq %%mm1, " #dst2 "        \n\t"\
2598

    
2599
SIMPLE_CPY((%0)       , (%0, %2)      , (%1)       , (%1, %3))
2600
SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%edx, %3, 2))
2601
SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%edx, %3, 4))
2602
                                                "leal (%%eax,%2,4), %%eax        \n\t"
2603
                                                "leal (%%edx,%3,4), %%edx        \n\t"
2604
SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%edx, %3), (%%edx, %3, 2))
2605

    
2606
                                                : : "r" (src),
2607
                                                "r" (dst),
2608
                                                "r" (srcStride),
2609
                                                "r" (dstStride)
2610
                                                : "%eax", "%edx"
2611
                                        );
2612
#else
2613
                                for(i=0; i<8; i++)
2614
                                        memcpy(        &(dst[dstStride*i]),
2615
                                                &(src[srcStride*i]), BLOCK_SIZE);
2616
#endif
2617
        }
2618
}
2619

    
2620
/**
2621
 * Duplicates the given 8 src pixels ? times upward
2622
 */
2623
static inline void RENAME(duplicate)(uint8_t src[], int stride)
2624
{
2625
#ifdef HAVE_MMX
2626
        asm volatile(
2627
                "movq (%0), %%mm0                \n\t"
2628
                "addl %1, %0                        \n\t"
2629
                "movq %%mm0, (%0)                \n\t"
2630
                "movq %%mm0, (%0, %1)                \n\t"
2631
                "movq %%mm0, (%0, %1, 2)        \n\t"
2632
                : "+r" (src)
2633
                : "r" (-stride)
2634
        );
2635
#else
2636
        int i;
2637
        uint8_t *p=src;
2638
        for(i=0; i<3; i++)
2639
        {
2640
                p-= stride;
2641
                memcpy(p, src, 8);
2642
        }
2643
#endif
2644
}
2645

    
2646
/**
 * Filters array of bytes (Y or U or V values)
 *
 * Processes one plane block by block: every block is first copied from src
 * to dst with blockCopy() (copyAhead lines ahead of the block row that is
 * being filtered) and is then, depending on the bits set in the mode,
 * deinterlaced, deblocked vertically and horizontally, deringed and
 * temporally denoised in dst. The mode is COMPILE_TIME_MODE if that is
 * defined, otherwise ppMode.chromMode or ppMode.lumMode of the context.
 * The context is copied to the stack on entry and written back to *c2
 * before returning.
 *
 * @param src       source plane
 * @param srcStride distance in bytes between two lines of src
 * @param dst       destination plane
 * @param dstStride distance in bytes between two lines of dst
 * @param width     width of the plane in bytes
 * @param height    height of the plane in lines
 * @param QPs       quantizer table, read at [y>>3][x>>3] for chroma and at
 *                  [y>>4][x>>4] for luma
 * @param QPStride  distance in entries between two rows of QPs
 * @param isColor   non-zero for a chroma plane, zero for the luma plane
 *                  (only luma updates the brightness histogram and the
 *                  level-fix scale/offset)
 * @param c2        postprocessing context, updated on return
 */
2649
static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2650
        QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
2651
{
2652
        PPContext __attribute__((aligned(8))) c= *c2; //copy to stack for faster access
2653
        int x,y;
2654
#ifdef COMPILE_TIME_MODE
2655
        const int mode= COMPILE_TIME_MODE;
2656
#else
2657
        const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
2658
#endif
2659
        int black=0, white=255; // blackest black and whitest white in the picture
2660
        int QPCorrecture= 256*256; // QP scale factor in 16.16 fixed point (256*256 == 1.0)
2661

    
2662
        int copyAhead, i;
2663

    
2664
        //FIXME remove
2665
        uint64_t * const yHistogram= c.yHistogram;
2666
        uint8_t * const tempSrc= c.tempSrc;
2667
        uint8_t * const tempDst= c.tempDst;
2668
        // width of the non-B QP table in entries: one per 8 pixels for chroma, one per 16 for luma
        const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
2669

    
2670
#ifdef HAVE_MMX
2671
        // precompute the 32 entries of the DC offset/threshold tables; the multiplication
        // by 0x0101010101010101 repeats the low byte in every byte of the entry
        // (assumes the table entries are 64 bit wide -- TODO confirm against PPContext)
        for(i=0; i<32; i++){
2672
                int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
2673
                int threshold= offset*2 + 1;
2674
                c.mmxDcOffset[i]= 0x7F - offset;
2675
                c.mmxDcThreshold[i]= 0x7F - threshold;
2676
                c.mmxDcOffset[i]*= 0x0101010101010101LL;
2677
                c.mmxDcThreshold[i]*= 0x0101010101010101LL;
2678
        }
2679
#endif
2680

    
2681
        // number of lines the copy has to run ahead, given by the first matching filter
        // in the order checked below
        if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
2682
        else if(   (mode & LINEAR_BLEND_DEINT_FILTER)
2683
                || (mode & FFMPEG_DEINT_FILTER)) copyAhead=14;
2684
        else if(   (mode & V_DEBLOCK)
2685
                || (mode & LINEAR_IPOL_DEINT_FILTER)
2686
                || (mode & MEDIAN_DEINT_FILTER)) copyAhead=13;
2687
        else if(mode & V_X1_FILTER) copyAhead=11;
2688
//        else if(mode & V_RK1_FILTER) copyAhead=10;
2689
        else if(mode & DERING) copyAhead=9;
2690
        else copyAhead=8;
2691

    
2692
        // from here on copyAhead only counts the lines beyond the first 8
        copyAhead-= 8;
2693

    
2694
        if(!isColor)
2695
        {
2696
                uint64_t sum= 0;
2697
                int i;
2698
                uint64_t maxClipped;
2699
                uint64_t clipped;
2700
                double scale;
2701

    
2702
                c.frameNum++;
2703
                // the histogram of the first frame is unusable so we ignore it
2704
                if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256;
2705

    
2706
                for(i=0; i<256; i++)
2707
                {
2708
                        sum+= yHistogram[i];
2709
//                        printf("%d ", yHistogram[i]);
2710
                }
2711
//                printf("\n\n");
2712

    
2713
                /* we always get a completely black picture first */
2714
                maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold);
2715

    
2716
                // walk down from 255 until fewer than maxClipped samples lie at or below black
                clipped= sum;
2717
                for(black=255; black>0; black--)
2718
                {
2719
                        if(clipped < maxClipped) break;
2720
                        clipped-= yHistogram[black];
2721
                }
2722

    
2723
                // walk up from 0 until fewer than maxClipped samples lie at or above white
                clipped= sum;
2724
                for(white=0; white<256; white++)
2725
                {
2726
                        if(clipped < maxClipped) break;
2727
                        clipped-= yHistogram[white];
2728
                }
2729

    
2730
                // factor that stretches [black, white] to [minAllowedY, maxAllowedY]
                // NOTE(review): nothing here guards against white == black
                scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black);
2731

    
2732
#ifdef HAVE_MMX2
2733
                c.packedYScale= (uint16_t)(scale*256.0 + 0.5);
2734
                c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
2735
#else
2736
                c.packedYScale= (uint16_t)(scale*1024.0 + 0.5);
2737
                c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
2738
#endif
2739

    
2740
                // repeat the 16 bit offset and scale in all four 16 bit words
                c.packedYOffset|= c.packedYOffset<<32;
2741
                c.packedYOffset|= c.packedYOffset<<16;
2742

    
2743
                c.packedYScale|= c.packedYScale<<32;
2744
                c.packedYScale|= c.packedYScale<<16;
2745
                
2746
                // with LEVEL_FIX the QP values are scaled by the same factor as the luma
                if(mode & LEVEL_FIX)        QPCorrecture= (int)(scale*256*256 + 0.5);
2747
                else                        QPCorrecture= 256*256;
2748
        }
2749
        else
2750
        {
2751
                // chroma: fixed scale and offset, QP left unscaled
                c.packedYScale= 0x0100010001000100LL;
2752
                c.packedYOffset= 0;
2753
                QPCorrecture= 256*256;
2754
        }
2755

    
2756
        /* copy & deinterlace first row of blocks */
2757
        y=-BLOCK_SIZE;
2758
        {
2759
                // srcBlock starts BLOCK_SIZE lines above src; only lines from srcBlock + 8*srcStride
                // on are read below. The output goes to the temporary buffer.
                uint8_t *srcBlock= &(src[y*srcStride]);
2760
                uint8_t *dstBlock= tempDst + dstStride;
2761

    
2762
                // From this point on it is guaranteed that we can read and write 16 lines downward
2763
                // finish 1 block before the next otherwise we might have a problem
2764
                // with the L1 Cache of the P4 ... or only a few blocks at a time or something
2765
                for(x=0; x<width; x+=BLOCK_SIZE)
2766
                {
2767

    
2768
#ifdef HAVE_MMX2
2769
/*
2770
                        prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
2771
                        prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
2772
                        prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
2773
                        prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
2774
*/
2775

    
2776
                        // prefetch 2 consecutive lines of src and dst, 32 bytes ahead, starting at
                        // line ((x>>2)&6) + copyAhead
                        asm(
2777
                                "movl %4, %%eax                        \n\t"
2778
                                "shrl $2, %%eax                        \n\t"
2779
                                "andl $6, %%eax                        \n\t"
2780
                                "addl %5, %%eax                        \n\t"
2781
                                "movl %%eax, %%edx                \n\t"
2782
                                "imul %1, %%eax                        \n\t"
2783
                                "imul %3, %%edx                        \n\t"
2784
                                "prefetchnta 32(%%eax, %0)        \n\t"
2785
                                "prefetcht0 32(%%edx, %2)        \n\t"
2786
                                "addl %1, %%eax                        \n\t"
2787
                                "addl %3, %%edx                        \n\t"
2788
                                "prefetchnta 32(%%eax, %0)        \n\t"
2789
                                "prefetcht0 32(%%edx, %2)        \n\t"
2790
                        :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
2791
                        "m" (x), "m" (copyAhead)
2792
                        : "%eax", "%edx"
2793
                        );
2794

    
2795
#elif defined(HAVE_3DNOW)
2796
//FIXME check if this is faster on a 3dnow chip or if it's faster without the prefetch or ...
2797
/*                        prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
2798
                        prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
2799
                        prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
2800
                        prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
2801
*/
2802
#endif
2803

    
2804
                        RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
2805
                                srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
2806

    
2807
                        // fill the lines above the copied block with copies of its first line
                        RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
2808

    
2809
                        // at most one deinterlacing filter is applied, the first one set in the mode
                        if(mode & LINEAR_IPOL_DEINT_FILTER)
2810
                                RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
2811
                        else if(mode & LINEAR_BLEND_DEINT_FILTER)
2812
                                RENAME(deInterlaceBlendLinear)(dstBlock, dstStride);
2813
                        else if(mode & MEDIAN_DEINT_FILTER)
2814
                                RENAME(deInterlaceMedian)(dstBlock, dstStride);
2815
                        else if(mode & CUBIC_IPOL_DEINT_FILTER)
2816
                                RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
2817
                        else if(mode & FFMPEG_DEINT_FILTER)
2818
                                RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
2819
/*                        else if(mode & CUBIC_BLEND_DEINT_FILTER)
2820
                                RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
2821
*/
2822
                        dstBlock+=8;
2823
                        srcBlock+=8;
2824
                }
2825
                // move the first copyAhead prepared lines from the temporary buffer into dst
                memcpy(dst, tempDst + 9*dstStride, copyAhead*dstStride );
2826
        }
2827

    
2828
//printf("\n");
2829
        for(y=0; y<height; y+=BLOCK_SIZE)
2830
        {
2831
                //1% speedup if these are here instead of the inner loop
2832
                uint8_t *srcBlock= &(src[y*srcStride]);
2833
                uint8_t *dstBlock= &(dst[y*dstStride]);
2834
#ifdef HAVE_MMX
2835
                // two scratch blocks, swapped after every block, used for the horizontal filters
                uint8_t *tempBlock1= c.tempBlocks;
2836
                uint8_t *tempBlock2= c.tempBlocks + 8;
2837
#endif
2838
                int8_t *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
2839
                int8_t *nonBQPptr= isColor ? &c.nonBQPTable[(y>>3)*mbWidth] :&c.nonBQPTable[(y>>4)*mbWidth];
2840
                int QP=0;
2841
                /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
2842
                   if not then use a temporary buffer */
2843
                if(y+15 >= height)
2844
                {
2845
                        int i;
2846
                        /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
2847
                           blockcopy to dst later */
2848
                        memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
2849
                                srcStride*MAX(height-y-copyAhead, 0) );
2850

    
2851
                        /* duplicate last line of src to fill the void up to line (copyAhead+7) */
2852
                        for(i=MAX(height-y, 8); i<copyAhead+8; i++)
2853
                                memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride);
2854

    
2855
                        /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
2856
                        memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) );
2857

    
2858
                        /* duplicate last line of dst to fill the void up to line (copyAhead) */
2859
                        for(i=height-y+1; i<=copyAhead; i++)
2860
                                memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride);
2861

    
2862
                        // filter in the temporary buffers; line 0 of tempDst holds line -1 of the block row
                        dstBlock= tempDst + dstStride;
2863
                        srcBlock= tempSrc;
2864
                }
2865
//printf("\n");
2866

    
2867
                // From this point on it is guaranteed that we can read and write 16 lines downward
2868
                // finish 1 block before the next otherwise we might have a problem
2869
                // with the L1 Cache of the P4 ... or only a few blocks at a time or something
2870
                for(x=0; x<width; x+=BLOCK_SIZE)
2871
                {
2872
                        const int stride= dstStride;
2873
#ifdef HAVE_MMX
2874
                        uint8_t *tmpXchg;
2875
#endif
2876
                        if(isColor)
2877
                        {
2878
                                QP= QPptr[x>>3];
2879
                                c.nonBQP= nonBQPptr[x>>3];
2880
                        }
2881
                        else
2882
                        {
2883
                                // luma: scale both QP values by QPCorrecture/65536, rounded to nearest
                                QP= QPptr[x>>4];
2884
                                QP= (QP* QPCorrecture + 256*128)>>16;
2885
                                c.nonBQP= nonBQPptr[x>>4];
2886
                                c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16;
2887
                                // one source sample per block (line 12, column 4) feeds the brightness histogram
                                yHistogram[ srcBlock[srcStride*12 + 4] ]++;
2888
                        }
2889
                        c.QP= QP;
2890
#ifdef HAVE_MMX
2891
                        // store QP repeated in every byte of c.pQPb
                        asm volatile(
2892
                                "movd %1, %%mm7                                        \n\t"
2893
                                "packuswb %%mm7, %%mm7                                \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
2894
                                "packuswb %%mm7, %%mm7                                \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
2895
                                "packuswb %%mm7, %%mm7                                \n\t" // QP,..., QP
2896
                                "movq %%mm7, %0                        \n\t"
2897
                                : "=m" (c.pQPb) 
2898
                                : "r" (QP)
2899
                        );
2900
#endif
2901

    
2902

    
2903
#ifdef HAVE_MMX2
2904
/*
2905
                        prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
2906
                        prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
2907
                        prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
2908
                        prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
2909
*/
2910

    
2911
                        // same prefetch as in the first block row above
                        asm(
2912
                                "movl %4, %%eax                        \n\t"
2913
                                "shrl $2, %%eax                        \n\t"
2914
                                "andl $6, %%eax                        \n\t"
2915
                                "addl %5, %%eax                        \n\t"
2916
                                "movl %%eax, %%edx                \n\t"
2917
                                "imul %1, %%eax                        \n\t"
2918
                                "imul %3, %%edx                        \n\t"
2919
                                "prefetchnta 32(%%eax, %0)        \n\t"
2920
                                "prefetcht0 32(%%edx, %2)        \n\t"
2921
                                "addl %1, %%eax                        \n\t"
2922
                                "addl %3, %%edx                        \n\t"
2923
                                "prefetchnta 32(%%eax, %0)        \n\t"
2924
                                "prefetcht0 32(%%edx, %2)        \n\t"
2925
                        :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
2926
                        "m" (x), "m" (copyAhead)
2927
                        : "%eax", "%edx"
2928
                        );
2929

    
2930
#elif defined(HAVE_3DNOW)
2931
//FIXME check if this is faster on a 3dnow chip or if it's faster without the prefetch or ...
2932
/*                        prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
2933
                        prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
2934
                        prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
2935
                        prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
2936
*/
2937
#endif
2938

    
2939
                        // copy the block that starts copyAhead lines below the block being filtered
                        RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
2940
                                srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
2941

    
2942
                        // at most one deinterlacing filter is applied, the first one set in the mode
                        if(mode & LINEAR_IPOL_DEINT_FILTER)
2943
                                RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
2944
                        else if(mode & LINEAR_BLEND_DEINT_FILTER)
2945
                                RENAME(deInterlaceBlendLinear)(dstBlock, dstStride);
2946
                        else if(mode & MEDIAN_DEINT_FILTER)
2947
                                RENAME(deInterlaceMedian)(dstBlock, dstStride);
2948
                        else if(mode & CUBIC_IPOL_DEINT_FILTER)
2949
                                RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
2950
                        else if(mode & FFMPEG_DEINT_FILTER)
2951
                                RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
2952
/*                        else if(mode & CUBIC_BLEND_DEINT_FILTER)
2953
                                RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
2954
*/
2955

    
2956
                        /* only deblock if we have 2 blocks */
2957
                        if(y + 8 < height)
2958
                        {
2959
                                if(mode & V_X1_FILTER)
2960
                                        RENAME(vertX1Filter)(dstBlock, stride, &c);
2961
                                else if(mode & V_DEBLOCK)
2962
                                {
2963
                                        // blocks for which isVertDC() is true get the lowpass, but only if
                                        // isVertMinMaxOk() agrees; all other blocks get the default filter
                                        if( RENAME(isVertDC)(dstBlock, stride, &c))
2964
                                        {
2965
                                                if(RENAME(isVertMinMaxOk)(dstBlock, stride, &c))
2966
                                                        RENAME(doVertLowPass)(dstBlock, stride, &c);
2967
                                        }
2968
                                        else
2969
                                                RENAME(doVertDefFilter)(dstBlock, stride, &c);
2970
                                }
2971
                        }
2972

    
2973
#ifdef HAVE_MMX
2974
                        // the MMX path runs the vertical filters on the scratch blocks (stride 16) for the
                        // horizontal modes; presumably transpose1()/transpose2() convert between dst and
                        // the transposed scratch layout -- TODO confirm against their definitions
                        RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
2975
#endif
2976
                        /* check if we have a previous block to deblock it with dstBlock */
2977
                        if(x - 8 >= 0)
2978
                        {
2979
#ifdef HAVE_MMX
2980
                                if(mode & H_X1_FILTER)
2981
                                        RENAME(vertX1Filter)(tempBlock1, 16, &c);
2982
                                else if(mode & H_DEBLOCK)
2983
                                {
2984
                                        if( RENAME(isVertDC)(tempBlock1, 16, &c))
2985
                                        {
2986
                                                if(RENAME(isVertMinMaxOk)(tempBlock1, 16, &c))
2987
                                                        RENAME(doVertLowPass)(tempBlock1, 16, &c);
2988
                                        }
2989
                                        else
2990
                                                RENAME(doVertDefFilter)(tempBlock1, 16, &c);
2991
                                }
2992

    
2993
                                RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
2994

    
2995
#else
2996
                                // without MMX the horizontal filters work directly on dst, across the
                                // boundary between the previous and the current block
                                if(mode & H_X1_FILTER)
2997
                                        horizX1Filter(dstBlock-4, stride, QP);
2998
                                else if(mode & H_DEBLOCK)
2999
                                {
3000
                                        if( isHorizDC(dstBlock-4, stride, &c))
3001
                                        {
3002
                                                if(isHorizMinMaxOk(dstBlock-4, stride, QP))
3003
                                                        doHorizLowPass(dstBlock-4, stride, QP);
3004
                                        }
3005
                                        else
3006
                                                doHorizDefFilter(dstBlock-4, stride, QP);
3007
                                }
3008
#endif
3009
                                // dering and temporal noise reduction are applied to the previous block
                                // (8 columns to the left)
                                if(mode & DERING)
3010
                                {
3011
                                //FIXME filter first line
3012
                                        if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
3013
                                }
3014

    
3015
                                if(mode & TEMP_NOISE_FILTER)
3016
                                {
3017
                                        RENAME(tempNoiseReducer)(dstBlock-8, stride,
3018
                                                c.tempBlured[isColor] + y*dstStride + x,
3019
                                                c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3020
                                                c.ppMode.maxTmpNoise);
3021
                                }
3022
                        }
3023

    
3024
                        dstBlock+=8;
3025
                        srcBlock+=8;
3026

    
3027
#ifdef HAVE_MMX
3028
                        // swap the two scratch blocks for the next column
                        tmpXchg= tempBlock1;
3029
                        tempBlock1= tempBlock2;
3030
                        tempBlock2 = tmpXchg;
3031
#endif
3032
                }
3033

    
3034
                // the loop above only post-filters the previous block, so the last block of the
                // row is handled here
                if(mode & DERING)
3035
                {
3036
                                if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
3037
                }
3038

    
3039
                if((mode & TEMP_NOISE_FILTER))
3040
                {
3041
                        RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
3042
                                c.tempBlured[isColor] + y*dstStride + x,
3043
                                c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3044
                                c.ppMode.maxTmpNoise);
3045
                }
3046

    
3047
                /* did we use a tmp buffer for the last lines*/
3048
                if(y+15 >= height)
3049
                {
3050
                        // copy the remaining (height-y) filtered lines from the temporary buffer to dst
                        uint8_t *dstBlock= &(dst[y*dstStride]);
3051
                        memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
3052
                }
3053
/*
3054
                for(x=0; x<width; x+=32)
3055
                {
3056
                        volatile int i;
3057
                        i+=        + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
3058
                                + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
3059
                                + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
3060
//                                + dstBlock[x +13*dstStride]
3061
//                                + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
3062
                }*/
3063
        }
3064
        // leave the MMX state so that floating point code can run again
#ifdef HAVE_3DNOW
3065
        asm volatile("femms");
3066
#elif defined (HAVE_MMX)
3067
        asm volatile("emms");
3068
#endif
3069

    
3070
#ifdef DEBUG_BRIGHTNESS
3071
        // debugging aid: draws the luma histogram and the black/white levels into dst
        if(!isColor)
3072
        {
3073
                int max=1;
3074
                int i;
3075
                for(i=0; i<256; i++)
3076
                        if(yHistogram[i] > max) max=yHistogram[i];
3077

    
3078
                for(i=1; i<256; i++)
3079
                {
3080
                        int x;
3081
                        int start=yHistogram[i-1]/(max/256+1);
3082
                        int end=yHistogram[i]/(max/256+1);
3083
                        int inc= end > start ? 1 : -1;
3084
                        for(x=start; x!=end+inc; x+=inc)
3085
                                dst[ i*dstStride + x]+=128;
3086
                }
3087

    
3088
                for(i=0; i<100; i+=2)
3089
                {
3090
                        dst[ (white)*dstStride + i]+=128;
3091
                        dst[ (black)*dstStride + i]+=128;
3092
                }
3093

    
3094
        }
3095
#endif
3096

    
3097
        *c2= c; //copy local context back
3098

    
3099
}