Statistics
| Branch: | Revision:

ffmpeg / postproc / postprocess_template.c @ 6b791538

History | View | Annotate | Download (92.1 KB)

1
/*
2
    Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
3

4
    This program is free software; you can redistribute it and/or modify
5
    it under the terms of the GNU General Public License as published by
6
    the Free Software Foundation; either version 2 of the License, or
7
    (at your option) any later version.
8

9
    This program is distributed in the hope that it will be useful,
10
    but WITHOUT ANY WARRANTY; without even the implied warranty of
11
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
    GNU General Public License for more details.
13

14
    You should have received a copy of the GNU General Public License
15
    along with this program; if not, write to the Free Software
16
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17
*/
18

    
19
/*
 * Instruction-string helpers used to build the inline assembly below.
 * Each macro expands to one or more AT&T-syntax instructions terminated by
 * "\n\t", so it can be pasted directly between the other string literals of
 * an asm() statement.  They are #undef'ed first so that an earlier definition
 * cannot leak in (presumably this template is included several times with
 * different HAVE_* settings - TODO confirm).
 */
#undef PAVGB
#undef PMINUB
#undef PMAXUB

/*
 * PAVGB(a,b): packed unsigned byte average of a and b, result in b.
 * Deliberately left undefined when neither MMX2 nor 3DNow is available.
 */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/*
 * PMINUB(a,b,t): b = unsigned byte-wise minimum of a and b.
 * The MMX2 form ignores t.  The plain-MMX fallback uses t as scratch:
 *     t = b;  t = saturate(t - a);  b -= t;
 */
#ifdef HAVE_MMX2
#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
#define PMINUB(a,b,t) \
        "movq " #b ", " #t " \n\t"\
        "psubusb " #a ", " #t " \n\t"\
        "psubb " #t ", " #b " \n\t"
#endif

/*
 * PMAXUB(a,b): b = unsigned byte-wise maximum of a and b.
 * Plain-MMX fallback:  b = saturate(b - a);  b += a;
 */
#ifdef HAVE_MMX2
#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
#define PMAXUB(a,b) \
        "psubusb " #a ", " #b " \n\t"\
        "paddb " #a ", " #b " \n\t"
#endif
45

    
46

    
47
//FIXME? |255-0| = 1 (shouldnt be a problem ...)
48
/**
49
 * Check if the middle 8x8 Block in the given 8x16 block is flat
50
 */
51
static inline int RENAME(isVertDC)(uint8_t src[], int stride){
52
        int numEq= 0;
53
#ifndef HAVE_MMX
54
        int y;
55
#endif
56
        src+= stride*4; // src points to begin of the 8x8 Block
57
#ifdef HAVE_MMX
58
asm volatile(
59
                "leal (%1, %2), %%eax                                \n\t"
60
                "leal (%%eax, %2, 4), %%ebx                        \n\t"
61
//        0        1        2        3        4        5        6        7        8        9
62
//        %1        eax        eax+%2        eax+2%2        %1+4%2        ebx        ebx+%2        ebx+2%2        %1+8%2        ebx+4%2
63
                "movq "MANGLE(mmxDCOffset)", %%mm7                \n\t" // mm7 = 0x7F
64
                "movq "MANGLE(mmxDCThreshold)", %%mm6                \n\t" // mm6 = 0x7D
65
                "movq (%1), %%mm0                                \n\t"
66
                "movq (%%eax), %%mm1                                \n\t"
67
                "psubb %%mm1, %%mm0                                \n\t" // mm0 = differnece
68
                "paddb %%mm7, %%mm0                                \n\t"
69
                "pcmpgtb %%mm6, %%mm0                                \n\t"
70

    
71
                "movq (%%eax,%2), %%mm2                                \n\t"
72
                "psubb %%mm2, %%mm1                                \n\t"
73
                "paddb %%mm7, %%mm1                                \n\t"
74
                "pcmpgtb %%mm6, %%mm1                                \n\t"
75
                "paddb %%mm1, %%mm0                                \n\t"
76

    
77
                "movq (%%eax, %2, 2), %%mm1                        \n\t"
78
                "psubb %%mm1, %%mm2                                \n\t"
79
                "paddb %%mm7, %%mm2                                \n\t"
80
                "pcmpgtb %%mm6, %%mm2                                \n\t"
81
                "paddb %%mm2, %%mm0                                \n\t"
82

    
83
                "movq (%1, %2, 4), %%mm2                        \n\t"
84
                "psubb %%mm2, %%mm1                                \n\t"
85
                "paddb %%mm7, %%mm1                                \n\t"
86
                "pcmpgtb %%mm6, %%mm1                                \n\t"
87
                "paddb %%mm1, %%mm0                                \n\t"
88

    
89
                "movq (%%ebx), %%mm1                                \n\t"
90
                "psubb %%mm1, %%mm2                                \n\t"
91
                "paddb %%mm7, %%mm2                                \n\t"
92
                "pcmpgtb %%mm6, %%mm2                                \n\t"
93
                "paddb %%mm2, %%mm0                                \n\t"
94

    
95
                "movq (%%ebx, %2), %%mm2                        \n\t"
96
                "psubb %%mm2, %%mm1                                \n\t"
97
                "paddb %%mm7, %%mm1                                \n\t"
98
                "pcmpgtb %%mm6, %%mm1                                \n\t"
99
                "paddb %%mm1, %%mm0                                \n\t"
100

    
101
                "movq (%%ebx, %2, 2), %%mm1                        \n\t"
102
                "psubb %%mm1, %%mm2                                \n\t"
103
                "paddb %%mm7, %%mm2                                \n\t"
104
                "pcmpgtb %%mm6, %%mm2                                \n\t"
105
                "paddb %%mm2, %%mm0                                \n\t"
106

    
107
                "                                                \n\t"
108
#ifdef HAVE_MMX2
109
                "pxor %%mm7, %%mm7                                \n\t"
110
                "psadbw %%mm7, %%mm0                                \n\t"
111
#else
112
                "movq %%mm0, %%mm1                                \n\t"
113
                "psrlw $8, %%mm0                                \n\t"
114
                "paddb %%mm1, %%mm0                                \n\t"
115
                "movq %%mm0, %%mm1                                \n\t"
116
                "psrlq $16, %%mm0                                \n\t"
117
                "paddb %%mm1, %%mm0                                \n\t"
118
                "movq %%mm0, %%mm1                                \n\t"
119
                "psrlq $32, %%mm0                                \n\t"
120
                "paddb %%mm1, %%mm0                                \n\t"
121
#endif
122
                "movd %%mm0, %0                                        \n\t"
123
                : "=r" (numEq)
124
                : "r" (src), "r" (stride)
125
                : "%eax", "%ebx"
126
                );
127
        numEq= (-numEq) &0xFF;
128

    
129
#else
130
        for(y=0; y<BLOCK_SIZE-1; y++)
131
        {
132
                if(((src[0] - src[0+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++;
133
                if(((src[1] - src[1+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++;
134
                if(((src[2] - src[2+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++;
135
                if(((src[3] - src[3+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++;
136
                if(((src[4] - src[4+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++;
137
                if(((src[5] - src[5+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++;
138
                if(((src[6] - src[6+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++;
139
                if(((src[7] - src[7+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++;
140
                src+= stride;
141
        }
142
#endif
143
/*        if(abs(numEq - asmEq) > 0)
144
        {
145
                printf("\nasm:%d  c:%d\n", asmEq, numEq);
146
                for(int y=0; y<8; y++)
147
                {
148
                        for(int x=0; x<8; x++)
149
                        {
150
                                printf("%d ", temp[x + y*stride]);
151
                        }
152
                        printf("\n");
153
                }
154
        }
155
*/
156
//        for(int i=0; i<numEq/8; i++) src[i]=255;
157
        return (numEq > vFlatnessThreshold) ? 1 : 0;
158
}
159

    
160
static inline int RENAME(isVertMinMaxOk)(uint8_t src[], int stride, int QP)
161
{
162
#ifdef HAVE_MMX
163
        int isOk;
164
        src+= stride*3;
165
        asm volatile(
166
//                "int $3 \n\t"
167
                "movq (%1, %2), %%mm0                                \n\t"
168
                "movq (%1, %2, 8), %%mm1                        \n\t"
169
                "movq %%mm0, %%mm2                                \n\t"
170
                "psubusb %%mm1, %%mm0                                \n\t"
171
                "psubusb %%mm2, %%mm1                                \n\t"
172
                "por %%mm1, %%mm0                                \n\t" // ABS Diff
173

    
174
                "movq "MANGLE(pQPb)", %%mm7                        \n\t" // QP,..., QP
175
                "paddusb %%mm7, %%mm7                                \n\t" // 2QP ... 2QP
176
                "psubusb %%mm7, %%mm0                                \n\t" // Diff <= 2QP -> 0
177
                "pcmpeqd "MANGLE(b00)", %%mm0                        \n\t"
178
                "psrlq $16, %%mm0                                \n\t"
179
                "pcmpeqd "MANGLE(bFF)", %%mm0                        \n\t"
180
//                "movd %%mm0, (%1, %2, 4)\n\t"
181
                "movd %%mm0, %0                                        \n\t"
182
                : "=r" (isOk)
183
                : "r" (src), "r" (stride)
184
                );
185
        return isOk;
186
#else
187

    
188
        int isOk2= 1;
189
        int x;
190
        src+= stride*3;
191
        for(x=0; x<BLOCK_SIZE; x++)
192
        {
193
                if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
194
        }
195
/*        if(isOk && !isOk2 || !isOk && isOk2)
196
        {
197
                printf("\nasm:%d  c:%d QP:%d\n", isOk, isOk2, QP);
198
                for(int y=0; y<9; y++)
199
                {
200
                        for(int x=0; x<8; x++)
201
                        {
202
                                printf("%d ", src[x + y*stride]);
203
                        }
204
                        printf("\n");
205
                }
206
        } */
207

    
208
        return isOk2;
209
#endif
210

    
211
}
212

    
213
/**
214
 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
215
 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
216
 */
217
static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, int QP)
218
{
219
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
220
        src+= stride*3;
221
        asm volatile(        //"movv %0 %1 %2\n\t"
222
                "movq "MANGLE(pQPb)", %%mm0                        \n\t"  // QP,..., QP
223

    
224
                "movq (%0), %%mm6                                \n\t"
225
                "movq (%0, %1), %%mm5                                \n\t"
226
                "movq %%mm5, %%mm1                                \n\t"
227
                "movq %%mm6, %%mm2                                \n\t"
228
                "psubusb %%mm6, %%mm5                                \n\t"
229
                "psubusb %%mm1, %%mm2                                \n\t"
230
                "por %%mm5, %%mm2                                \n\t" // ABS Diff of lines
231
                "psubusb %%mm0, %%mm2                                \n\t" // diff <= QP -> 0
232
                "pcmpeqb "MANGLE(b00)", %%mm2                        \n\t" // diff <= QP -> FF
233

    
234
                "pand %%mm2, %%mm6                                \n\t"
235
                "pandn %%mm1, %%mm2                                \n\t"
236
                "por %%mm2, %%mm6                                \n\t"// First Line to Filter
237

    
238
                "movq (%0, %1, 8), %%mm5                        \n\t"
239
                "leal (%0, %1, 4), %%eax                        \n\t"
240
                "leal (%0, %1, 8), %%ebx                        \n\t"
241
                "subl %1, %%ebx                                        \n\t"
242
                "addl %1, %0                                        \n\t" // %0 points to line 1 not 0
243
                "movq (%0, %1, 8), %%mm7                        \n\t"
244
                "movq %%mm5, %%mm1                                \n\t"
245
                "movq %%mm7, %%mm2                                \n\t"
246
                "psubusb %%mm7, %%mm5                                \n\t"
247
                "psubusb %%mm1, %%mm2                                \n\t"
248
                "por %%mm5, %%mm2                                \n\t" // ABS Diff of lines
249
                "psubusb %%mm0, %%mm2                                \n\t" // diff <= QP -> 0
250
                "pcmpeqb "MANGLE(b00)", %%mm2                        \n\t" // diff <= QP -> FF
251

    
252
                "pand %%mm2, %%mm7                                \n\t"
253
                "pandn %%mm1, %%mm2                                \n\t"
254
                "por %%mm2, %%mm7                                \n\t" // First Line to Filter
255

    
256

    
257
                //         1        2        3        4        5        6        7        8
258
                //        %0        %0+%1        %0+2%1        eax        %0+4%1        eax+2%1        ebx        eax+4%1
259
                // 6 4 2 2 1 1
260
                // 6 4 4 2
261
                // 6 8 2
262

    
263
                "movq (%0, %1), %%mm0                                \n\t" //  1
264
                "movq %%mm0, %%mm1                                \n\t" //  1
265
                PAVGB(%%mm6, %%mm0)                                      //1 1        /2
266
                PAVGB(%%mm6, %%mm0)                                      //3 1        /4
267

    
268
                "movq (%0, %1, 4), %%mm2                        \n\t" //     1
269
                "movq %%mm2, %%mm5                                \n\t" //     1
270
                PAVGB((%%eax), %%mm2)                                      //    11        /2
271
                PAVGB((%0, %1, 2), %%mm2)                              //   211        /4
272
                "movq %%mm2, %%mm3                                \n\t" //   211        /4
273
                "movq (%0), %%mm4                                \n\t" // 1
274
                PAVGB(%%mm4, %%mm3)                                      // 4 211        /8
275
                PAVGB(%%mm0, %%mm3)                                      //642211        /16
276
                "movq %%mm3, (%0)                                \n\t" // X
277
                // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
278
                "movq %%mm1, %%mm0                                \n\t" //  1
279
                PAVGB(%%mm6, %%mm0)                                      //1 1        /2
280
                "movq %%mm4, %%mm3                                \n\t" // 1
281
                PAVGB((%0,%1,2), %%mm3)                                      // 1 1        /2
282
                PAVGB((%%eax,%1,2), %%mm5)                              //     11        /2
283
                PAVGB((%%eax), %%mm5)                                      //    211 /4
284
                PAVGB(%%mm5, %%mm3)                                      // 2 2211 /8
285
                PAVGB(%%mm0, %%mm3)                                      //4242211 /16
286
                "movq %%mm3, (%0,%1)                                \n\t" //  X
287
                // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
288
                PAVGB(%%mm4, %%mm6)                                      //11        /2
289
                "movq (%%ebx), %%mm0                                \n\t" //       1
290
                PAVGB((%%eax, %1, 2), %%mm0)                              //      11/2
291
                "movq %%mm0, %%mm3                                \n\t" //      11/2
292
                PAVGB(%%mm1, %%mm0)                                      //  2   11/4
293
                PAVGB(%%mm6, %%mm0)                                      //222   11/8
294
                PAVGB(%%mm2, %%mm0)                                      //22242211/16
295
                "movq (%0, %1, 2), %%mm2                        \n\t" //   1
296
                "movq %%mm0, (%0, %1, 2)                        \n\t" //   X
297
                // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
298
                "movq (%%eax, %1, 4), %%mm0                        \n\t" //        1
299
                PAVGB((%%ebx), %%mm0)                                      //       11        /2
300
                PAVGB(%%mm0, %%mm6)                                      //11     11        /4
301
                PAVGB(%%mm1, %%mm4)                                      // 11                /2
302
                PAVGB(%%mm2, %%mm1)                                      //  11                /2
303
                PAVGB(%%mm1, %%mm6)                                      //1122   11        /8
304
                PAVGB(%%mm5, %%mm6)                                      //112242211        /16
305
                "movq (%%eax), %%mm5                                \n\t" //    1
306
                "movq %%mm6, (%%eax)                                \n\t" //    X
307
                // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
308
                "movq (%%eax, %1, 4), %%mm6                        \n\t" //        1
309
                PAVGB(%%mm7, %%mm6)                                      //        11        /2
310
                PAVGB(%%mm4, %%mm6)                                      // 11     11        /4
311
                PAVGB(%%mm3, %%mm6)                                      // 11   2211        /8
312
                PAVGB(%%mm5, %%mm2)                                      //   11                /2
313
                "movq (%0, %1, 4), %%mm4                        \n\t" //     1
314
                PAVGB(%%mm4, %%mm2)                                      //   112                /4
315
                PAVGB(%%mm2, %%mm6)                                      // 112242211        /16
316
                "movq %%mm6, (%0, %1, 4)                        \n\t" //     X
317
                // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
318
                PAVGB(%%mm7, %%mm1)                                      //  11     2        /4
319
                PAVGB(%%mm4, %%mm5)                                      //    11                /2
320
                PAVGB(%%mm5, %%mm0)                                      //    11 11        /4
321
                "movq (%%eax, %1, 2), %%mm6                        \n\t" //      1
322
                PAVGB(%%mm6, %%mm1)                                      //  11  4  2        /8
323
                PAVGB(%%mm0, %%mm1)                                      //  11224222        /16
324
                "movq %%mm1, (%%eax, %1, 2)                        \n\t" //      X
325
                // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
326
                PAVGB((%%ebx), %%mm2)                                      //   112 4        /8
327
                "movq (%%eax, %1, 4), %%mm0                        \n\t" //        1
328
                PAVGB(%%mm0, %%mm6)                                      //      1 1        /2
329
                PAVGB(%%mm7, %%mm6)                                      //      1 12        /4
330
                PAVGB(%%mm2, %%mm6)                                      //   1122424        /4
331
                "movq %%mm6, (%%ebx)                                \n\t" //       X
332
                // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
333
                PAVGB(%%mm7, %%mm5)                                      //    11   2        /4
334
                PAVGB(%%mm7, %%mm5)                                      //    11   6        /8
335

    
336
                PAVGB(%%mm3, %%mm0)                                      //      112        /4
337
                PAVGB(%%mm0, %%mm5)                                      //    112246        /16
338
                "movq %%mm5, (%%eax, %1, 4)                        \n\t" //        X
339
                "subl %1, %0                                        \n\t"
340

    
341
                :
342
                : "r" (src), "r" (stride)
343
                : "%eax", "%ebx"
344
        );
345
#else
346
        const int l1= stride;
347
        const int l2= stride + l1;
348
        const int l3= stride + l2;
349
        const int l4= stride + l3;
350
        const int l5= stride + l4;
351
        const int l6= stride + l5;
352
        const int l7= stride + l6;
353
        const int l8= stride + l7;
354
        const int l9= stride + l8;
355
        int x;
356
        src+= stride*3;
357
        for(x=0; x<BLOCK_SIZE; x++)
358
        {
359
                const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
360
                const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
361

    
362
                int sums[9];
363
                sums[0] = first + src[l1];
364
                sums[1] = src[l1] + src[l2];
365
                sums[2] = src[l2] + src[l3];
366
                sums[3] = src[l3] + src[l4];
367
                sums[4] = src[l4] + src[l5];
368
                sums[5] = src[l5] + src[l6];
369
                sums[6] = src[l6] + src[l7];
370
                sums[7] = src[l7] + src[l8];
371
                sums[8] = src[l8] + last;
372

    
373
                src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
374
                src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
375
                src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
376
                src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
377
                src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
378
                src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
379
                src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
380
                src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
381

    
382
                src++;
383
        }
384

    
385
#endif
386
}
387

    
388
/**
389
 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
390
 * values are correctly clipped (MMX2)
391
 * values are wraparound (C)
392
 * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient
393
        0 8 16 24
394
        x = 8
395
        x/2 = 4
396
        x/8 = 1
397
        1 12 12 23
398
 */
399
static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
400
{
401
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
402
        src+= stride*3;
403
// FIXME rounding
404
        asm volatile(
405
                "pxor %%mm7, %%mm7                                \n\t" // 0
406
                "movq "MANGLE(b80)", %%mm6                        \n\t" // MIN_SIGNED_BYTE
407
                "leal (%0, %1), %%eax                                \n\t"
408
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
409
//        0        1        2        3        4        5        6        7        8        9
410
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
411
                "movq "MANGLE(pQPb)", %%mm0                        \n\t" // QP,..., QP
412
                "movq %%mm0, %%mm1                                \n\t" // QP,..., QP
413
                "paddusb "MANGLE(b02)", %%mm0                        \n\t"
414
                "psrlw $2, %%mm0                                \n\t"
415
                "pand "MANGLE(b3F)", %%mm0                        \n\t" // QP/4,..., QP/4
416
                "paddusb %%mm1, %%mm0                                \n\t" // QP*1.25 ...
417
                "movq (%0, %1, 4), %%mm2                        \n\t" // line 4
418
                "movq (%%ebx), %%mm3                                \n\t" // line 5
419
                "movq %%mm2, %%mm4                                \n\t" // line 4
420
                "pcmpeqb %%mm5, %%mm5                                \n\t" // -1
421
                "pxor %%mm2, %%mm5                                \n\t" // -line 4 - 1
422
                PAVGB(%%mm3, %%mm5)
423
                "paddb %%mm6, %%mm5                                \n\t" // (l5-l4)/2
424
                "psubusb %%mm3, %%mm4                                \n\t"
425
                "psubusb %%mm2, %%mm3                                \n\t"
426
                "por %%mm3, %%mm4                                \n\t" // |l4 - l5|
427
                "psubusb %%mm0, %%mm4                                \n\t"
428
                "pcmpeqb %%mm7, %%mm4                                \n\t"
429
                "pand %%mm4, %%mm5                                \n\t" // d/2
430

    
431
//                "paddb %%mm6, %%mm2                                \n\t" // line 4 + 0x80
432
                "paddb %%mm5, %%mm2                                \n\t"
433
//                "psubb %%mm6, %%mm2                                \n\t"
434
                "movq %%mm2, (%0,%1, 4)                                \n\t"
435

    
436
                "movq (%%ebx), %%mm2                                \n\t"
437
//                "paddb %%mm6, %%mm2                                \n\t" // line 5 + 0x80
438
                "psubb %%mm5, %%mm2                                \n\t"
439
//                "psubb %%mm6, %%mm2                                \n\t"
440
                "movq %%mm2, (%%ebx)                                \n\t"
441

    
442
                "paddb %%mm6, %%mm5                                \n\t"
443
                "psrlw $2, %%mm5                                \n\t"
444
                "pand "MANGLE(b3F)", %%mm5                        \n\t"
445
                "psubb "MANGLE(b20)", %%mm5                        \n\t" // (l5-l4)/8
446

    
447
                "movq (%%eax, %1, 2), %%mm2                        \n\t"
448
                "paddb %%mm6, %%mm2                                \n\t" // line 3 + 0x80
449
                "paddsb %%mm5, %%mm2                                \n\t"
450
                "psubb %%mm6, %%mm2                                \n\t"
451
                "movq %%mm2, (%%eax, %1, 2)                        \n\t"
452

    
453
                "movq (%%ebx, %1), %%mm2                        \n\t"
454
                "paddb %%mm6, %%mm2                                \n\t" // line 6 + 0x80
455
                "psubsb %%mm5, %%mm2                                \n\t"
456
                "psubb %%mm6, %%mm2                                \n\t"
457
                "movq %%mm2, (%%ebx, %1)                        \n\t"
458

    
459
                :
460
                : "r" (src), "r" (stride)
461
                : "%eax", "%ebx"
462
        );
463
#else
464
         const int l1= stride;
465
        const int l2= stride + l1;
466
        const int l3= stride + l2;
467
        const int l4= stride + l3;
468
        const int l5= stride + l4;
469
        const int l6= stride + l5;
470
//        const int l7= stride + l6;
471
//        const int l8= stride + l7;
472
//        const int l9= stride + l8;
473
        int x;
474
        const int QP15= QP + (QP>>2);
475
        src+= stride*3;
476
        for(x=0; x<BLOCK_SIZE; x++)
477
        {
478
                const int v = (src[x+l5] - src[x+l4]);
479
                if(ABS(v) < QP15)
480
                {
481
                        src[x+l3] +=v>>3;
482
                        src[x+l4] +=v>>1;
483
                        src[x+l5] -=v>>1;
484
                        src[x+l6] -=v>>3;
485

    
486
                }
487
        }
488

    
489
#endif
490
}
491

    
492
/**
493
 * Experimental Filter 1
494
 * will not damage linear gradients
495
 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
496
 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
497
 * The MMX2 version clips correctly; the C version doesn't
498
 */
499
static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, int QP)
500
{
501
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
502
        src+= stride*3;
503

    
504
        asm volatile(
505
                "pxor %%mm7, %%mm7                                \n\t" // 0
506
                "leal (%0, %1), %%eax                                \n\t"
507
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
508
//        0        1        2        3        4        5        6        7        8        9
509
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
510
                "movq (%%eax, %1, 2), %%mm0                        \n\t" // line 3
511
                "movq (%0, %1, 4), %%mm1                        \n\t" // line 4
512
                "movq %%mm1, %%mm2                                \n\t" // line 4
513
                "psubusb %%mm0, %%mm1                                \n\t"
514
                "psubusb %%mm2, %%mm0                                \n\t"
515
                "por %%mm1, %%mm0                                \n\t" // |l2 - l3|
516
                "movq (%%ebx), %%mm3                                \n\t" // line 5
517
                "movq (%%ebx, %1), %%mm4                        \n\t" // line 6
518
                "movq %%mm3, %%mm5                                \n\t" // line 5
519
                "psubusb %%mm4, %%mm3                                \n\t"
520
                "psubusb %%mm5, %%mm4                                \n\t"
521
                "por %%mm4, %%mm3                                \n\t" // |l5 - l6|
522
                PAVGB(%%mm3, %%mm0)                                      // (|l2 - l3| + |l5 - l6|)/2
523
                "movq %%mm2, %%mm1                                \n\t" // line 4
524
                "psubusb %%mm5, %%mm2                                \n\t"
525
                "movq %%mm2, %%mm4                                \n\t"
526
                "pcmpeqb %%mm7, %%mm2                                \n\t" // (l4 - l5) <= 0 ? -1 : 0
527
                "psubusb %%mm1, %%mm5                                \n\t"
528
                "por %%mm5, %%mm4                                \n\t" // |l4 - l5|
529
                "psubusb %%mm0, %%mm4                \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
530
                "movq %%mm4, %%mm3                                \n\t" // d
531
                "movq "MANGLE(pQPb)", %%mm0                        \n\t"
532
                "paddusb %%mm0, %%mm0                                \n\t"
533
                "psubusb %%mm0, %%mm4                                \n\t"
534
                "pcmpeqb %%mm7, %%mm4                                \n\t" // d <= QP ? -1 : 0
535
                "psubusb "MANGLE(b01)", %%mm3                        \n\t"
536
                "pand %%mm4, %%mm3                                \n\t" // d <= QP ? d : 0
537

    
538
                PAVGB(%%mm7, %%mm3)                                      // d/2
539
                "movq %%mm3, %%mm1                                \n\t" // d/2
540
                PAVGB(%%mm7, %%mm3)                                      // d/4
541
                PAVGB(%%mm1, %%mm3)                                      // 3*d/8
542

    
543
                "movq (%0, %1, 4), %%mm0                        \n\t" // line 4
544
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
545
                "psubusb %%mm3, %%mm0                                \n\t"
546
                "pxor %%mm2, %%mm0                                \n\t"
547
                "movq %%mm0, (%0, %1, 4)                        \n\t" // line 4
548

    
549
                "movq (%%ebx), %%mm0                                \n\t" // line 5
550
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
551
                "paddusb %%mm3, %%mm0                                \n\t"
552
                "pxor %%mm2, %%mm0                                \n\t"
553
                "movq %%mm0, (%%ebx)                                \n\t" // line 5
554

    
555
                PAVGB(%%mm7, %%mm1)                                      // d/4
556

    
557
                "movq (%%eax, %1, 2), %%mm0                        \n\t" // line 3
558
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
559
                "psubusb %%mm1, %%mm0                                \n\t"
560
                "pxor %%mm2, %%mm0                                \n\t"
561
                "movq %%mm0, (%%eax, %1, 2)                        \n\t" // line 3
562

    
563
                "movq (%%ebx, %1), %%mm0                        \n\t" // line 6
564
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
565
                "paddusb %%mm1, %%mm0                                \n\t"
566
                "pxor %%mm2, %%mm0                                \n\t"
567
                "movq %%mm0, (%%ebx, %1)                        \n\t" // line 6
568

    
569
                PAVGB(%%mm7, %%mm1)                                      // d/8
570

    
571
                "movq (%%eax, %1), %%mm0                        \n\t" // line 2
572
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
573
                "psubusb %%mm1, %%mm0                                \n\t"
574
                "pxor %%mm2, %%mm0                                \n\t"
575
                "movq %%mm0, (%%eax, %1)                        \n\t" // line 2
576

    
577
                "movq (%%ebx, %1, 2), %%mm0                        \n\t" // line 7
578
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
579
                "paddusb %%mm1, %%mm0                                \n\t"
580
                "pxor %%mm2, %%mm0                                \n\t"
581
                "movq %%mm0, (%%ebx, %1, 2)                        \n\t" // line 7
582

    
583
                :
584
                : "r" (src), "r" (stride)
585
                : "%eax", "%ebx"
586
        );
587
#else
588

    
589
         const int l1= stride;
590
        const int l2= stride + l1;
591
        const int l3= stride + l2;
592
        const int l4= stride + l3;
593
        const int l5= stride + l4;
594
        const int l6= stride + l5;
595
        const int l7= stride + l6;
596
//        const int l8= stride + l7;
597
//        const int l9= stride + l8;
598
        int x;
599

    
600
        src+= stride*3;
601
        for(x=0; x<BLOCK_SIZE; x++)
602
        {
603
                int a= src[l3] - src[l4];
604
                int b= src[l4] - src[l5];
605
                int c= src[l5] - src[l6];
606

    
607
                int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
608
                d= MAX(d, 0);
609

    
610
                if(d < QP*2)
611
                {
612
                        int v = d * SIGN(-b);
613

    
614
                        src[l2] +=v>>3;
615
                        src[l3] +=v>>2;
616
                        src[l4] +=(3*v)>>3;
617
                        src[l5] -=(3*v)>>3;
618
                        src[l6] -=v>>2;
619
                        src[l7] -=v>>3;
620

    
621
                }
622
                src++;
623
        }
624
        /*
625
         const int l1= stride;
626
        const int l2= stride + l1;
627
        const int l3= stride + l2;
628
        const int l4= stride + l3;
629
        const int l5= stride + l4;
630
        const int l6= stride + l5;
631
        const int l7= stride + l6;
632
        const int l8= stride + l7;
633
        const int l9= stride + l8;
634
        for(int x=0; x<BLOCK_SIZE; x++)
635
        {
636
                int v2= src[l2];
637
                int v3= src[l3];
638
                int v4= src[l4];
639
                int v5= src[l5];
640
                int v6= src[l6];
641
                int v7= src[l7];
642

643
                if(ABS(v4-v5)<QP &&  ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
644
                {
645
                        src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6         )/16;
646
                        src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7  )/16;
647
                        src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
648
                        src[l6] = (       1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
649
                }
650
                src++;
651
        }
652
*/
653
#endif
654
}
655

    
656
static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, int QP)
657
{
658
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
659
/*
660
        uint8_t tmp[16];
661
        const int l1= stride;
662
        const int l2= stride + l1;
663
        const int l3= stride + l2;
664
        const int l4= (int)tmp - (int)src - stride*3;
665
        const int l5= (int)tmp - (int)src - stride*3 + 8;
666
        const int l6= stride*3 + l3;
667
        const int l7= stride + l6;
668
        const int l8= stride + l7;
669

670
        memcpy(tmp, src+stride*7, 8);
671
        memcpy(tmp+8, src+stride*8, 8);
672
*/
673
        src+= stride*4;
674
        asm volatile(
675

    
676
#if 0 //sligtly more accurate and slightly slower
677
                "pxor %%mm7, %%mm7                                \n\t" // 0
678
                "leal (%0, %1), %%eax                                \n\t"
679
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
680
//        0        1        2        3        4        5        6        7
681
//        %0        %0+%1        %0+2%1        eax+2%1        %0+4%1        eax+4%1        ebx+%1        ebx+2%1
682
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1
683

684

685
                "movq (%0, %1, 2), %%mm0                        \n\t" // l2
686
                "movq (%0), %%mm1                                \n\t" // l0
687
                "movq %%mm0, %%mm2                                \n\t" // l2
688
                PAVGB(%%mm7, %%mm0)                                      // ~l2/2
689
                PAVGB(%%mm1, %%mm0)                                      // ~(l2 + 2l0)/4
690
                PAVGB(%%mm2, %%mm0)                                      // ~(5l2 + 2l0)/8
691

692
                "movq (%%eax), %%mm1                                \n\t" // l1
693
                "movq (%%eax, %1, 2), %%mm3                        \n\t" // l3
694
                "movq %%mm1, %%mm4                                \n\t" // l1
695
                PAVGB(%%mm7, %%mm1)                                      // ~l1/2
696
                PAVGB(%%mm3, %%mm1)                                      // ~(l1 + 2l3)/4
697
                PAVGB(%%mm4, %%mm1)                                      // ~(5l1 + 2l3)/8
698

699
                "movq %%mm0, %%mm4                                \n\t" // ~(5l2 + 2l0)/8
700
                "psubusb %%mm1, %%mm0                                \n\t"
701
                "psubusb %%mm4, %%mm1                                \n\t"
702
                "por %%mm0, %%mm1                                \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
703
// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
704

705
                "movq (%0, %1, 4), %%mm0                        \n\t" // l4
706
                "movq %%mm0, %%mm4                                \n\t" // l4
707
                PAVGB(%%mm7, %%mm0)                                      // ~l4/2
708
                PAVGB(%%mm2, %%mm0)                                      // ~(l4 + 2l2)/4
709
                PAVGB(%%mm4, %%mm0)                                      // ~(5l4 + 2l2)/8
710

711
                "movq (%%ebx), %%mm2                                \n\t" // l5
712
                "movq %%mm3, %%mm5                                \n\t" // l3
713
                PAVGB(%%mm7, %%mm3)                                      // ~l3/2
714
                PAVGB(%%mm2, %%mm3)                                      // ~(l3 + 2l5)/4
715
                PAVGB(%%mm5, %%mm3)                                      // ~(5l3 + 2l5)/8
716

717
                "movq %%mm0, %%mm6                                \n\t" // ~(5l4 + 2l2)/8
718
                "psubusb %%mm3, %%mm0                                \n\t"
719
                "psubusb %%mm6, %%mm3                                \n\t"
720
                "por %%mm0, %%mm3                                \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
721
                "pcmpeqb %%mm7, %%mm0                                \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
722
// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
723

724
                "movq (%%ebx, %1), %%mm6                        \n\t" // l6
725
                "movq %%mm6, %%mm5                                \n\t" // l6
726
                PAVGB(%%mm7, %%mm6)                                      // ~l6/2
727
                PAVGB(%%mm4, %%mm6)                                      // ~(l6 + 2l4)/4
728
                PAVGB(%%mm5, %%mm6)                                      // ~(5l6 + 2l4)/8
729

730
                "movq (%%ebx, %1, 2), %%mm5                        \n\t" // l7
731
                "movq %%mm2, %%mm4                                \n\t" // l5
732
                PAVGB(%%mm7, %%mm2)                                      // ~l5/2
733
                PAVGB(%%mm5, %%mm2)                                      // ~(l5 + 2l7)/4
734
                PAVGB(%%mm4, %%mm2)                                      // ~(5l5 + 2l7)/8
735

736
                "movq %%mm6, %%mm4                                \n\t" // ~(5l6 + 2l4)/8
737
                "psubusb %%mm2, %%mm6                                \n\t"
738
                "psubusb %%mm4, %%mm2                                \n\t"
739
                "por %%mm6, %%mm2                                \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
740
// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
741

742

743
                PMINUB(%%mm2, %%mm1, %%mm4)                              // MIN(|lenergy|,|renergy|)/8
744
                "movq "MANGLE(pQPb)", %%mm4                        \n\t" // QP //FIXME QP+1 ?
745
                "paddusb "MANGLE(b01)", %%mm4                        \n\t"
746
                "pcmpgtb %%mm3, %%mm4                                \n\t" // |menergy|/8 < QP
747
                "psubusb %%mm1, %%mm3                                \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
748
                "pand %%mm4, %%mm3                                \n\t"
749

750
                "movq %%mm3, %%mm1                                \n\t"
751
//                "psubusb "MANGLE(b01)", %%mm3                        \n\t"
752
                PAVGB(%%mm7, %%mm3)
753
                PAVGB(%%mm7, %%mm3)
754
                "paddusb %%mm1, %%mm3                                \n\t"
755
//                "paddusb "MANGLE(b01)", %%mm3                        \n\t"
756

757
                "movq (%%eax, %1, 2), %%mm6                        \n\t" //l3
758
                "movq (%0, %1, 4), %%mm5                        \n\t" //l4
759
                "movq (%0, %1, 4), %%mm4                        \n\t" //l4
760
                "psubusb %%mm6, %%mm5                                \n\t"
761
                "psubusb %%mm4, %%mm6                                \n\t"
762
                "por %%mm6, %%mm5                                \n\t" // |l3-l4|
763
                "pcmpeqb %%mm7, %%mm6                                \n\t" // SIGN(l3-l4)
764
                "pxor %%mm6, %%mm0                                \n\t"
765
                "pand %%mm0, %%mm3                                \n\t"
766
                PMINUB(%%mm5, %%mm3, %%mm0)
767

768
                "psubusb "MANGLE(b01)", %%mm3                        \n\t"
769
                PAVGB(%%mm7, %%mm3)
770

771
                "movq (%%eax, %1, 2), %%mm0                        \n\t"
772
                "movq (%0, %1, 4), %%mm2                        \n\t"
773
                "pxor %%mm6, %%mm0                                \n\t"
774
                "pxor %%mm6, %%mm2                                \n\t"
775
                "psubb %%mm3, %%mm0                                \n\t"
776
                "paddb %%mm3, %%mm2                                \n\t"
777
                "pxor %%mm6, %%mm0                                \n\t"
778
                "pxor %%mm6, %%mm2                                \n\t"
779
                "movq %%mm0, (%%eax, %1, 2)                        \n\t"
780
                "movq %%mm2, (%0, %1, 4)                        \n\t"
781
#endif
782

    
783
                "leal (%0, %1), %%eax                                \n\t"
784
                "pcmpeqb %%mm6, %%mm6                                \n\t" // -1
785
//        0        1        2        3        4        5        6        7
786
//        %0        %0+%1        %0+2%1        eax+2%1        %0+4%1        eax+4%1        ebx+%1        ebx+2%1
787
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1
788

    
789

    
790
                "movq (%%eax, %1, 2), %%mm1                        \n\t" // l3
791
                "movq (%0, %1, 4), %%mm0                        \n\t" // l4
792
                "pxor %%mm6, %%mm1                                \n\t" // -l3-1
793
                PAVGB(%%mm1, %%mm0)                                      // -q+128 = (l4-l3+256)/2
794
// mm1=-l3-1, mm0=128-q
795

    
796
                "movq (%%eax, %1, 4), %%mm2                        \n\t" // l5
797
                "movq (%%eax, %1), %%mm3                        \n\t" // l2
798
                "pxor %%mm6, %%mm2                                \n\t" // -l5-1
799
                "movq %%mm2, %%mm5                                \n\t" // -l5-1
800
                "movq "MANGLE(b80)", %%mm4                        \n\t" // 128
801
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
802
                PAVGB(%%mm3, %%mm2)                                      // (l2-l5+256)/2
803
                PAVGB(%%mm0, %%mm4)                                      // ~(l4-l3)/4 + 128
804
                PAVGB(%%mm2, %%mm4)                                      // ~(l2-l5)/4 +(l4-l3)/8 + 128
805
                PAVGB(%%mm0, %%mm4)                                      // ~(l2-l5)/8 +5(l4-l3)/16 + 128
806
// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
807

    
808
                "movq (%%eax), %%mm2                                \n\t" // l1
809
                "pxor %%mm6, %%mm2                                \n\t" // -l1-1
810
                PAVGB(%%mm3, %%mm2)                                      // (l2-l1+256)/2
811
                PAVGB((%0), %%mm1)                                      // (l0-l3+256)/2
812
                "movq "MANGLE(b80)", %%mm3                        \n\t" // 128
813
                PAVGB(%%mm2, %%mm3)                                      // ~(l2-l1)/4 + 128
814
                PAVGB(%%mm1, %%mm3)                                      // ~(l0-l3)/4 +(l2-l1)/8 + 128
815
                PAVGB(%%mm2, %%mm3)                                      // ~(l0-l3)/8 +5(l2-l1)/16 + 128
816
// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
817

    
818
                PAVGB((%%ebx, %1), %%mm5)                              // (l6-l5+256)/2
819
                "movq (%%ebx, %1, 2), %%mm1                        \n\t" // l7
820
                "pxor %%mm6, %%mm1                                \n\t" // -l7-1
821
                PAVGB((%0, %1, 4), %%mm1)                              // (l4-l7+256)/2
822
                "movq "MANGLE(b80)", %%mm2                        \n\t" // 128
823
                PAVGB(%%mm5, %%mm2)                                      // ~(l6-l5)/4 + 128
824
                PAVGB(%%mm1, %%mm2)                                      // ~(l4-l7)/4 +(l6-l5)/8 + 128
825
                PAVGB(%%mm5, %%mm2)                                      // ~(l4-l7)/8 +5(l6-l5)/16 + 128
826
// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
827

    
828
                "movq "MANGLE(b00)", %%mm1                        \n\t" // 0
829
                "movq "MANGLE(b00)", %%mm5                        \n\t" // 0
830
                "psubb %%mm2, %%mm1                                \n\t" // 128 - renergy/16
831
                "psubb %%mm3, %%mm5                                \n\t" // 128 - lenergy/16
832
                PMAXUB(%%mm1, %%mm2)                                      // 128 + |renergy/16|
833
                 PMAXUB(%%mm5, %%mm3)                                      // 128 + |lenergy/16|
834
                PMINUB(%%mm2, %%mm3, %%mm1)                              // 128 + MIN(|lenergy|,|renergy|)/16
835

    
836
// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
837

    
838
                "movq "MANGLE(b00)", %%mm7                        \n\t" // 0
839
                "movq "MANGLE(pQPb)", %%mm2                        \n\t" // QP
840
                PAVGB(%%mm6, %%mm2)                                      // 128 + QP/2
841
                "psubb %%mm6, %%mm2                                \n\t"
842

    
843
                "movq %%mm4, %%mm1                                \n\t"
844
                "pcmpgtb %%mm7, %%mm1                                \n\t" // SIGN(menergy)
845
                "pxor %%mm1, %%mm4                                \n\t"
846
                "psubb %%mm1, %%mm4                                \n\t" // 128 + |menergy|/16
847
                "pcmpgtb %%mm4, %%mm2                                \n\t" // |menergy|/16 < QP/2
848
                "psubusb %%mm3, %%mm4                                \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
849
// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
850

    
851
                "movq %%mm4, %%mm3                                \n\t" // d
852
                "psubusb "MANGLE(b01)", %%mm4                        \n\t"
853
                PAVGB(%%mm7, %%mm4)                                      // d/32
854
                PAVGB(%%mm7, %%mm4)                                      // (d + 32)/64
855
                "paddb %%mm3, %%mm4                                \n\t" // 5d/64
856
                "pand %%mm2, %%mm4                                \n\t"
857

    
858
                "movq "MANGLE(b80)", %%mm5                        \n\t" // 128
859
                "psubb %%mm0, %%mm5                                \n\t" // q
860
                "paddsb %%mm6, %%mm5                                \n\t" // fix bad rounding
861
                "pcmpgtb %%mm5, %%mm7                                \n\t" // SIGN(q)
862
                "pxor %%mm7, %%mm5                                \n\t"
863

    
864
                PMINUB(%%mm5, %%mm4, %%mm3)                              // MIN(|q|, 5d/64)
865
                "pxor %%mm1, %%mm7                                \n\t" // SIGN(d*q)
866

    
867
                "pand %%mm7, %%mm4                                \n\t"
868
                "movq (%%eax, %1, 2), %%mm0                        \n\t"
869
                "movq (%0, %1, 4), %%mm2                        \n\t"
870
                "pxor %%mm1, %%mm0                                \n\t"
871
                "pxor %%mm1, %%mm2                                \n\t"
872
                "paddb %%mm4, %%mm0                                \n\t"
873
                "psubb %%mm4, %%mm2                                \n\t"
874
                "pxor %%mm1, %%mm0                                \n\t"
875
                "pxor %%mm1, %%mm2                                \n\t"
876
                "movq %%mm0, (%%eax, %1, 2)                        \n\t"
877
                "movq %%mm2, (%0, %1, 4)                        \n\t"
878

    
879
                :
880
                : "r" (src), "r" (stride)
881
                : "%eax", "%ebx"
882
        );
883

    
884
/*
885
        {
886
        int x;
887
        src-= stride;
888
        for(x=0; x<BLOCK_SIZE; x++)
889
        {
890
                const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
891
                if(ABS(middleEnergy)< 8*QP)
892
                {
893
                        const int q=(src[l4] - src[l5])/2;
894
                        const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
895
                        const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
896

897
                        int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
898
                        d= MAX(d, 0);
899

900
                        d= (5*d + 32) >> 6;
901
                        d*= SIGN(-middleEnergy);
902

903
                        if(q>0)
904
                        {
905
                                d= d<0 ? 0 : d;
906
                                d= d>q ? q : d;
907
                        }
908
                        else
909
                        {
910
                                d= d>0 ? 0 : d;
911
                                d= d<q ? q : d;
912
                        }
913

914
                        src[l4]-= d;
915
                        src[l5]+= d;
916
                }
917
                src++;
918
        }
919
src-=8;
920
        for(x=0; x<8; x++)
921
        {
922
                int y;
923
                for(y=4; y<6; y++)
924
                {
925
                        int d= src[x+y*stride] - tmp[x+(y-4)*8];
926
                        int ad= ABS(d);
927
                        static int max=0;
928
                        static int sum=0;
929
                        static int num=0;
930
                        static int bias=0;
931

932
                        if(max<ad) max=ad;
933
                        sum+= ad>3 ? 1 : 0;
934
                        if(ad>3)
935
                        {
936
                                src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
937
                        }
938
                        if(y==4) bias+=d;
939
                        num++;
940
                        if(num%1000000 == 0)
941
                        {
942
                                printf(" %d %d %d %d\n", num, sum, max, bias);
943
                        }
944
                }
945
        }
946
}
947
*/
948
#elif defined (HAVE_MMX)
949
        src+= stride*4;
950

    
951
        asm volatile(
952
                "pxor %%mm7, %%mm7                                \n\t"
953
                "leal (%0, %1), %%eax                                \n\t"
954
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
955
//        0        1        2        3        4        5        6        7
956
//        %0        %0+%1        %0+2%1        eax+2%1        %0+4%1        eax+4%1        ebx+%1        ebx+2%1
957
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1
958

    
959
                "movq (%0), %%mm0                                \n\t"
960
                "movq %%mm0, %%mm1                                \n\t"
961
                "punpcklbw %%mm7, %%mm0                                \n\t" // low part of line 0
962
                "punpckhbw %%mm7, %%mm1                                \n\t" // high part of line 0
963

    
964
                "movq (%%eax), %%mm2                                \n\t"
965
                "movq %%mm2, %%mm3                                \n\t"
966
                "punpcklbw %%mm7, %%mm2                                \n\t" // low part of line 1
967
                "punpckhbw %%mm7, %%mm3                                \n\t" // high part of line 1
968

    
969
                "movq (%%eax, %1), %%mm4                        \n\t"
970
                "movq %%mm4, %%mm5                                \n\t"
971
                "punpcklbw %%mm7, %%mm4                                \n\t" // low part of line 2
972
                "punpckhbw %%mm7, %%mm5                                \n\t" // high part of line 2
973

    
974
                "paddw %%mm0, %%mm0                                \n\t" // 2L0
975
                "paddw %%mm1, %%mm1                                \n\t" // 2H0
976
                "psubw %%mm4, %%mm2                                \n\t" // L1 - L2
977
                "psubw %%mm5, %%mm3                                \n\t" // H1 - H2
978
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - L1 + L2
979
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - H1 + H2
980

    
981
                "psllw $2, %%mm2                                \n\t" // 4L1 - 4L2
982
                "psllw $2, %%mm3                                \n\t" // 4H1 - 4H2
983
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - 5L1 + 5L2
984
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - 5H1 + 5H2
985

    
986
                "movq (%%eax, %1, 2), %%mm2                        \n\t"
987
                "movq %%mm2, %%mm3                                \n\t"
988
                "punpcklbw %%mm7, %%mm2                                \n\t" // L3
989
                "punpckhbw %%mm7, %%mm3                                \n\t" // H3
990

    
991
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - 5L1 + 5L2 - L3
992
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - 5H1 + 5H2 - H3
993
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
994
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - 5H1 + 5H2 - 2H3
995
                "movq %%mm0, "MANGLE(temp0)"                        \n\t" // 2L0 - 5L1 + 5L2 - 2L3
996
                "movq %%mm1, "MANGLE(temp1)"                        \n\t" // 2H0 - 5H1 + 5H2 - 2H3
997

    
998
                "movq (%0, %1, 4), %%mm0                        \n\t"
999
                "movq %%mm0, %%mm1                                \n\t"
1000
                "punpcklbw %%mm7, %%mm0                                \n\t" // L4
1001
                "punpckhbw %%mm7, %%mm1                                \n\t" // H4
1002

    
1003
                "psubw %%mm0, %%mm2                                \n\t" // L3 - L4
1004
                "psubw %%mm1, %%mm3                                \n\t" // H3 - H4
1005
                "movq %%mm2, "MANGLE(temp2)"                        \n\t" // L3 - L4
1006
                "movq %%mm3, "MANGLE(temp3)"                        \n\t" // H3 - H4
1007
                "paddw %%mm4, %%mm4                                \n\t" // 2L2
1008
                "paddw %%mm5, %%mm5                                \n\t" // 2H2
1009
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - L3 + L4
1010
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - H3 + H4
1011

    
1012
                "psllw $2, %%mm2                                \n\t" // 4L3 - 4L4
1013
                "psllw $2, %%mm3                                \n\t" // 4H3 - 4H4
1014
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - 5L3 + 5L4
1015
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - 5H3 + 5H4
1016
//50 opcodes so far
1017
                "movq (%%ebx), %%mm2                                \n\t"
1018
                "movq %%mm2, %%mm3                                \n\t"
1019
                "punpcklbw %%mm7, %%mm2                                \n\t" // L5
1020
                "punpckhbw %%mm7, %%mm3                                \n\t" // H5
1021
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - 5L3 + 5L4 - L5
1022
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - 5H3 + 5H4 - H5
1023
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - 5L3 + 5L4 - 2L5
1024
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - 5H3 + 5H4 - 2H5
1025

    
1026
                "movq (%%ebx, %1), %%mm6                        \n\t"
1027
                "punpcklbw %%mm7, %%mm6                                \n\t" // L6
1028
                "psubw %%mm6, %%mm2                                \n\t" // L5 - L6
1029
                "movq (%%ebx, %1), %%mm6                        \n\t"
1030
                "punpckhbw %%mm7, %%mm6                                \n\t" // H6
1031
                "psubw %%mm6, %%mm3                                \n\t" // H5 - H6
1032

    
1033
                "paddw %%mm0, %%mm0                                \n\t" // 2L4
1034
                "paddw %%mm1, %%mm1                                \n\t" // 2H4
1035
                "psubw %%mm2, %%mm0                                \n\t" // 2L4 - L5 + L6
1036
                "psubw %%mm3, %%mm1                                \n\t" // 2H4 - H5 + H6
1037

    
1038
                "psllw $2, %%mm2                                \n\t" // 4L5 - 4L6
1039
                "psllw $2, %%mm3                                \n\t" // 4H5 - 4H6
1040
                "psubw %%mm2, %%mm0                                \n\t" // 2L4 - 5L5 + 5L6
1041
                "psubw %%mm3, %%mm1                                \n\t" // 2H4 - 5H5 + 5H6
1042

    
1043
                "movq (%%ebx, %1, 2), %%mm2                        \n\t"
1044
                "movq %%mm2, %%mm3                                \n\t"
1045
                "punpcklbw %%mm7, %%mm2                                \n\t" // L7
1046
                "punpckhbw %%mm7, %%mm3                                \n\t" // H7
1047

    
1048
                "paddw %%mm2, %%mm2                                \n\t" // 2L7
1049
                "paddw %%mm3, %%mm3                                \n\t" // 2H7
1050
                "psubw %%mm2, %%mm0                                \n\t" // 2L4 - 5L5 + 5L6 - 2L7
1051
                "psubw %%mm3, %%mm1                                \n\t" // 2H4 - 5H5 + 5H6 - 2H7
1052

    
1053
                "movq "MANGLE(temp0)", %%mm2                        \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1054
                "movq "MANGLE(temp1)", %%mm3                        \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1055

    
1056
#ifdef HAVE_MMX2
1057
                "movq %%mm7, %%mm6                                \n\t" // 0
1058
                "psubw %%mm0, %%mm6                                \n\t"
1059
                "pmaxsw %%mm6, %%mm0                                \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1060
                "movq %%mm7, %%mm6                                \n\t" // 0
1061
                "psubw %%mm1, %%mm6                                \n\t"
1062
                "pmaxsw %%mm6, %%mm1                                \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1063
                "movq %%mm7, %%mm6                                \n\t" // 0
1064
                "psubw %%mm2, %%mm6                                \n\t"
1065
                "pmaxsw %%mm6, %%mm2                                \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1066
                "movq %%mm7, %%mm6                                \n\t" // 0
1067
                "psubw %%mm3, %%mm6                                \n\t"
1068
                "pmaxsw %%mm6, %%mm3                                \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1069
#else
1070
                "movq %%mm7, %%mm6                                \n\t" // 0
1071
                "pcmpgtw %%mm0, %%mm6                                \n\t"
1072
                "pxor %%mm6, %%mm0                                \n\t"
1073
                "psubw %%mm6, %%mm0                                \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1074
                "movq %%mm7, %%mm6                                \n\t" // 0
1075
                "pcmpgtw %%mm1, %%mm6                                \n\t"
1076
                "pxor %%mm6, %%mm1                                \n\t"
1077
                "psubw %%mm6, %%mm1                                \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1078
                "movq %%mm7, %%mm6                                \n\t" // 0
1079
                "pcmpgtw %%mm2, %%mm6                                \n\t"
1080
                "pxor %%mm6, %%mm2                                \n\t"
1081
                "psubw %%mm6, %%mm2                                \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1082
                "movq %%mm7, %%mm6                                \n\t" // 0
1083
                "pcmpgtw %%mm3, %%mm6                                \n\t"
1084
                "pxor %%mm6, %%mm3                                \n\t"
1085
                "psubw %%mm6, %%mm3                                \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1086
#endif
1087

    
1088
#ifdef HAVE_MMX2
1089
                "pminsw %%mm2, %%mm0                                \n\t"
1090
                "pminsw %%mm3, %%mm1                                \n\t"
1091
#else
1092
                "movq %%mm0, %%mm6                                \n\t"
1093
                "psubusw %%mm2, %%mm6                                \n\t"
1094
                "psubw %%mm6, %%mm0                                \n\t"
1095
                "movq %%mm1, %%mm6                                \n\t"
1096
                "psubusw %%mm3, %%mm6                                \n\t"
1097
                "psubw %%mm6, %%mm1                                \n\t"
1098
#endif
1099

    
1100
                "movq %%mm7, %%mm6                                \n\t" // 0
1101
                "pcmpgtw %%mm4, %%mm6                                \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1102
                "pxor %%mm6, %%mm4                                \n\t"
1103
                "psubw %%mm6, %%mm4                                \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1104
                "pcmpgtw %%mm5, %%mm7                                \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1105
                "pxor %%mm7, %%mm5                                \n\t"
1106
                "psubw %%mm7, %%mm5                                \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1107
// 100 opcodes
1108
                "movd %2, %%mm2                                        \n\t" // QP
1109
                "punpcklwd %%mm2, %%mm2                                \n\t"
1110
                "punpcklwd %%mm2, %%mm2                                \n\t"
1111
                "psllw $3, %%mm2                                \n\t" // 8QP
1112
                "movq %%mm2, %%mm3                                \n\t" // 8QP
1113
                "pcmpgtw %%mm4, %%mm2                                \n\t"
1114
                "pcmpgtw %%mm5, %%mm3                                \n\t"
1115
                "pand %%mm2, %%mm4                                \n\t"
1116
                "pand %%mm3, %%mm5                                \n\t"
1117

    
1118

    
1119
                "psubusw %%mm0, %%mm4                                \n\t" // hd
1120
                "psubusw %%mm1, %%mm5                                \n\t" // ld
1121

    
1122

    
1123
                "movq "MANGLE(w05)", %%mm2                        \n\t" // 5
1124
                "pmullw %%mm2, %%mm4                                \n\t"
1125
                "pmullw %%mm2, %%mm5                                \n\t"
1126
                "movq "MANGLE(w20)", %%mm2                        \n\t" // 32
1127
                "paddw %%mm2, %%mm4                                \n\t"
1128
                "paddw %%mm2, %%mm5                                \n\t"
1129
                "psrlw $6, %%mm4                                \n\t"
1130
                "psrlw $6, %%mm5                                \n\t"
1131

    
1132
/*
1133
                "movq w06, %%mm2                                \n\t" // 6
1134
                "paddw %%mm2, %%mm4                                \n\t"
1135
                "paddw %%mm2, %%mm5                                \n\t"
1136
                "movq w1400, %%mm2                                \n\t" // 1400h = 5120 = 5/64*2^16
1137
//FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
1138
                "pmulhw %%mm2, %%mm4                                \n\t" // hd/13
1139
                "pmulhw %%mm2, %%mm5                                \n\t" // ld/13
1140
*/
1141

    
1142
                "movq "MANGLE(temp2)", %%mm0                        \n\t" // L3 - L4
1143
                "movq "MANGLE(temp3)", %%mm1                        \n\t" // H3 - H4
1144

    
1145
                "pxor %%mm2, %%mm2                                \n\t"
1146
                "pxor %%mm3, %%mm3                                \n\t"
1147

    
1148
                "pcmpgtw %%mm0, %%mm2                                \n\t" // sign (L3-L4)
1149
                "pcmpgtw %%mm1, %%mm3                                \n\t" // sign (H3-H4)
1150
                "pxor %%mm2, %%mm0                                \n\t"
1151
                "pxor %%mm3, %%mm1                                \n\t"
1152
                "psubw %%mm2, %%mm0                                \n\t" // |L3-L4|
1153
                "psubw %%mm3, %%mm1                                \n\t" // |H3-H4|
1154
                "psrlw $1, %%mm0                                \n\t" // |L3 - L4|/2
1155
                "psrlw $1, %%mm1                                \n\t" // |H3 - H4|/2
1156

    
1157
                "pxor %%mm6, %%mm2                                \n\t"
1158
                "pxor %%mm7, %%mm3                                \n\t"
1159
                "pand %%mm2, %%mm4                                \n\t"
1160
                "pand %%mm3, %%mm5                                \n\t"
1161

    
1162
#ifdef HAVE_MMX2
1163
                "pminsw %%mm0, %%mm4                                \n\t"
1164
                "pminsw %%mm1, %%mm5                                \n\t"
1165
#else
1166
                "movq %%mm4, %%mm2                                \n\t"
1167
                "psubusw %%mm0, %%mm2                                \n\t"
1168
                "psubw %%mm2, %%mm4                                \n\t"
1169
                "movq %%mm5, %%mm2                                \n\t"
1170
                "psubusw %%mm1, %%mm2                                \n\t"
1171
                "psubw %%mm2, %%mm5                                \n\t"
1172
#endif
1173
                "pxor %%mm6, %%mm4                                \n\t"
1174
                "pxor %%mm7, %%mm5                                \n\t"
1175
                "psubw %%mm6, %%mm4                                \n\t"
1176
                "psubw %%mm7, %%mm5                                \n\t"
1177
                "packsswb %%mm5, %%mm4                                \n\t"
1178
                "movq (%%eax, %1, 2), %%mm0                        \n\t"
1179
                "paddb   %%mm4, %%mm0                                \n\t"
1180
                "movq %%mm0, (%%eax, %1, 2)                         \n\t"
1181
                "movq (%0, %1, 4), %%mm0                        \n\t"
1182
                "psubb %%mm4, %%mm0                                \n\t"
1183
                "movq %%mm0, (%0, %1, 4)                         \n\t"
1184

    
1185
                :
1186
                : "r" (src), "r" (stride), "r" (QP)
1187
                : "%eax", "%ebx"
1188
        );
1189
#else
1190
        const int l1= stride;
1191
        const int l2= stride + l1;
1192
        const int l3= stride + l2;
1193
        const int l4= stride + l3;
1194
        const int l5= stride + l4;
1195
        const int l6= stride + l5;
1196
        const int l7= stride + l6;
1197
        const int l8= stride + l7;
1198
//        const int l9= stride + l8;
1199
        int x;
1200
        src+= stride*3;
1201
        for(x=0; x<BLOCK_SIZE; x++)
1202
        {
1203
                const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1204
                if(ABS(middleEnergy) < 8*QP)
1205
                {
1206
                        const int q=(src[l4] - src[l5])/2;
1207
                        const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1208
                        const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1209

    
1210
                        int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1211
                        d= MAX(d, 0);
1212

    
1213
                        d= (5*d + 32) >> 6;
1214
                        d*= SIGN(-middleEnergy);
1215

    
1216
                        if(q>0)
1217
                        {
1218
                                d= d<0 ? 0 : d;
1219
                                d= d>q ? q : d;
1220
                        }
1221
                        else
1222
                        {
1223
                                d= d>0 ? 0 : d;
1224
                                d= d<q ? q : d;
1225
                        }
1226

    
1227
                        src[l4]-= d;
1228
                        src[l5]+= d;
1229
                }
1230
                src++;
1231
        }
1232
#endif
1233
}
1234

    
1235
static inline void RENAME(dering)(uint8_t src[], int stride, int QP)
1236
{
1237
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1238
        asm volatile(
1239
                "movq "MANGLE(pQPb)", %%mm0                        \n\t"
1240
                "paddusb %%mm0, %%mm0                                \n\t"
1241
                "movq %%mm0, "MANGLE(pQPb2)"                        \n\t"
1242

    
1243
                "leal (%0, %1), %%eax                                \n\t"
1244
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
1245
//        0        1        2        3        4        5        6        7        8        9
1246
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
1247

    
1248
                "pcmpeqb %%mm7, %%mm7                                \n\t"
1249
                "pxor %%mm6, %%mm6                                \n\t"
1250
#undef FIND_MIN_MAX
1251
#ifdef HAVE_MMX2
1252
#define FIND_MIN_MAX(addr)\
1253
                "movq " #addr ", %%mm0                                \n\t"\
1254
                "pminub %%mm0, %%mm7                                \n\t"\
1255
                "pmaxub %%mm0, %%mm6                                \n\t"
1256
#else
1257
#define FIND_MIN_MAX(addr)\
1258
                "movq " #addr ", %%mm0                                \n\t"\
1259
                "movq %%mm7, %%mm1                                \n\t"\
1260
                "psubusb %%mm0, %%mm6                                \n\t"\
1261
                "paddb %%mm0, %%mm6                                \n\t"\
1262
                "psubusb %%mm0, %%mm1                                \n\t"\
1263
                "psubb %%mm1, %%mm7                                \n\t"
1264
#endif
1265

    
1266
FIND_MIN_MAX((%%eax))
1267
FIND_MIN_MAX((%%eax, %1))
1268
FIND_MIN_MAX((%%eax, %1, 2))
1269
FIND_MIN_MAX((%0, %1, 4))
1270
FIND_MIN_MAX((%%ebx))
1271
FIND_MIN_MAX((%%ebx, %1))
1272
FIND_MIN_MAX((%%ebx, %1, 2))
1273
FIND_MIN_MAX((%0, %1, 8))
1274

    
1275
                "movq %%mm7, %%mm4                                \n\t"
1276
                "psrlq $8, %%mm7                                \n\t"
1277
#ifdef HAVE_MMX2
1278
                "pminub %%mm4, %%mm7                                \n\t" // min of pixels
1279
                "pshufw $0xF9, %%mm7, %%mm4                        \n\t"
1280
                "pminub %%mm4, %%mm7                                \n\t" // min of pixels
1281
                "pshufw $0xFE, %%mm7, %%mm4                        \n\t"
1282
                "pminub %%mm4, %%mm7                                \n\t"
1283
#else
1284
                "movq %%mm7, %%mm1                                \n\t"
1285
                "psubusb %%mm4, %%mm1                                \n\t"
1286
                "psubb %%mm1, %%mm7                                \n\t"
1287
                "movq %%mm7, %%mm4                                \n\t"
1288
                "psrlq $16, %%mm7                                \n\t"
1289
                "movq %%mm7, %%mm1                                \n\t"
1290
                "psubusb %%mm4, %%mm1                                \n\t"
1291
                "psubb %%mm1, %%mm7                                \n\t"
1292
                "movq %%mm7, %%mm4                                \n\t"
1293
                "psrlq $32, %%mm7                                \n\t"
1294
                "movq %%mm7, %%mm1                                \n\t"
1295
                "psubusb %%mm4, %%mm1                                \n\t"
1296
                "psubb %%mm1, %%mm7                                \n\t"
1297
#endif
1298

    
1299

    
1300
                "movq %%mm6, %%mm4                                \n\t"
1301
                "psrlq $8, %%mm6                                \n\t"
1302
#ifdef HAVE_MMX2
1303
                "pmaxub %%mm4, %%mm6                                \n\t" // max of pixels
1304
                "pshufw $0xF9, %%mm6, %%mm4                        \n\t"
1305
                "pmaxub %%mm4, %%mm6                                \n\t"
1306
                "pshufw $0xFE, %%mm6, %%mm4                        \n\t"
1307
                "pmaxub %%mm4, %%mm6                                \n\t"
1308
#else
1309
                "psubusb %%mm4, %%mm6                                \n\t"
1310
                "paddb %%mm4, %%mm6                                \n\t"
1311
                "movq %%mm6, %%mm4                                \n\t"
1312
                "psrlq $16, %%mm6                                \n\t"
1313
                "psubusb %%mm4, %%mm6                                \n\t"
1314
                "paddb %%mm4, %%mm6                                \n\t"
1315
                "movq %%mm6, %%mm4                                \n\t"
1316
                "psrlq $32, %%mm6                                \n\t"
1317
                "psubusb %%mm4, %%mm6                                \n\t"
1318
                "paddb %%mm4, %%mm6                                \n\t"
1319
#endif
1320
                "movq %%mm6, %%mm0                                \n\t" // max
1321
                "psubb %%mm7, %%mm6                                \n\t" // max - min
1322
                "movd %%mm6, %%ecx                                \n\t"
1323
                "cmpb "MANGLE(deringThreshold)", %%cl                \n\t"
1324
                " jb 1f                                                \n\t"
1325
                PAVGB(%%mm0, %%mm7)                                      // a=(max + min)/2
1326
                "punpcklbw %%mm7, %%mm7                                \n\t"
1327
                "punpcklbw %%mm7, %%mm7                                \n\t"
1328
                "punpcklbw %%mm7, %%mm7                                \n\t"
1329
                "movq %%mm7, "MANGLE(temp0)"                        \n\t"
1330

    
1331
                "movq (%0), %%mm0                                \n\t" // L10
1332
                "movq %%mm0, %%mm1                                \n\t" // L10
1333
                "movq %%mm0, %%mm2                                \n\t" // L10
1334
                "psllq $8, %%mm1                                \n\t"
1335
                "psrlq $8, %%mm2                                \n\t"
1336
                "movd -4(%0), %%mm3                                \n\t"
1337
                "movd 8(%0), %%mm4                                \n\t"
1338
                "psrlq $24, %%mm3                                \n\t"
1339
                "psllq $56, %%mm4                                \n\t"
1340
                "por %%mm3, %%mm1                                \n\t" // L00
1341
                "por %%mm4, %%mm2                                \n\t" // L20
1342
                "movq %%mm1, %%mm3                                \n\t" // L00
1343
                PAVGB(%%mm2, %%mm1)                                      // (L20 + L00)/2
1344
                PAVGB(%%mm0, %%mm1)                                      // (L20 + L00 + 2L10)/4
1345
                "psubusb %%mm7, %%mm0                                \n\t"
1346
                "psubusb %%mm7, %%mm2                                \n\t"
1347
                "psubusb %%mm7, %%mm3                                \n\t"
1348
                "pcmpeqb "MANGLE(b00)", %%mm0                        \n\t" // L10 > a ? 0 : -1
1349
                "pcmpeqb "MANGLE(b00)", %%mm2                        \n\t" // L20 > a ? 0 : -1
1350
                "pcmpeqb "MANGLE(b00)", %%mm3                        \n\t" // L00 > a ? 0 : -1
1351
                "paddb %%mm2, %%mm0                                \n\t"
1352
                "paddb %%mm3, %%mm0                                \n\t"
1353

    
1354
                "movq (%%eax), %%mm2                                \n\t" // L11
1355
                "movq %%mm2, %%mm3                                \n\t" // L11
1356
                "movq %%mm2, %%mm4                                \n\t" // L11
1357
                "psllq $8, %%mm3                                \n\t"
1358
                "psrlq $8, %%mm4                                \n\t"
1359
                "movd -4(%%eax), %%mm5                                \n\t"
1360
                "movd 8(%%eax), %%mm6                                \n\t"
1361
                "psrlq $24, %%mm5                                \n\t"
1362
                "psllq $56, %%mm6                                \n\t"
1363
                "por %%mm5, %%mm3                                \n\t" // L01
1364
                "por %%mm6, %%mm4                                \n\t" // L21
1365
                "movq %%mm3, %%mm5                                \n\t" // L01
1366
                PAVGB(%%mm4, %%mm3)                                      // (L21 + L01)/2
1367
                PAVGB(%%mm2, %%mm3)                                      // (L21 + L01 + 2L11)/4
1368
                "psubusb %%mm7, %%mm2                                \n\t"
1369
                "psubusb %%mm7, %%mm4                                \n\t"
1370
                "psubusb %%mm7, %%mm5                                \n\t"
1371
                "pcmpeqb "MANGLE(b00)", %%mm2                        \n\t" // L11 > a ? 0 : -1
1372
                "pcmpeqb "MANGLE(b00)", %%mm4                        \n\t" // L21 > a ? 0 : -1
1373
                "pcmpeqb "MANGLE(b00)", %%mm5                        \n\t" // L01 > a ? 0 : -1
1374
                "paddb %%mm4, %%mm2                                \n\t"
1375
                "paddb %%mm5, %%mm2                                \n\t"
1376
// 0, 2, 3, 1
1377
#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1378
                "movq " #src ", " #sx "                                \n\t" /* src[0] */\
1379
                "movq " #sx ", " #lx "                                \n\t" /* src[0] */\
1380
                "movq " #sx ", " #t0 "                                \n\t" /* src[0] */\
1381
                "psllq $8, " #lx "                                \n\t"\
1382
                "psrlq $8, " #t0 "                                \n\t"\
1383
                "movd -4" #src ", " #t1 "                        \n\t"\
1384
                "psrlq $24, " #t1 "                                \n\t"\
1385
                "por " #t1 ", " #lx "                                \n\t" /* src[-1] */\
1386
                "movd 8" #src ", " #t1 "                        \n\t"\
1387
                "psllq $56, " #t1 "                                \n\t"\
1388
                "por " #t1 ", " #t0 "                                \n\t" /* src[+1] */\
1389
                "movq " #lx ", " #t1 "                                \n\t" /* src[-1] */\
1390
                PAVGB(t0, lx)                                              /* (src[-1] + src[+1])/2 */\
1391
                PAVGB(sx, lx)                                      /* (src[-1] + 2src[0] + src[+1])/4 */\
1392
                PAVGB(lx, pplx)                                             \
1393
                "movq " #lx ", "MANGLE(temp1)"                        \n\t"\
1394
                "movq "MANGLE(temp0)", " #lx "                        \n\t"\
1395
                "psubusb " #lx ", " #t1 "                        \n\t"\
1396
                "psubusb " #lx ", " #t0 "                        \n\t"\
1397
                "psubusb " #lx ", " #sx "                        \n\t"\
1398
                "movq "MANGLE(b00)", " #lx "                        \n\t"\
1399
                "pcmpeqb " #lx ", " #t1 "                        \n\t" /* src[-1] > a ? 0 : -1*/\
1400
                "pcmpeqb " #lx ", " #t0 "                        \n\t" /* src[+1] > a ? 0 : -1*/\
1401
                "pcmpeqb " #lx ", " #sx "                        \n\t" /* src[0]  > a ? 0 : -1*/\
1402
                "paddb " #t1 ", " #t0 "                                \n\t"\
1403
                "paddb " #t0 ", " #sx "                                \n\t"\
1404
\
1405
                PAVGB(plx, pplx)                                      /* filtered */\
1406
                "movq " #dst ", " #t0 "                                \n\t" /* dst */\
1407
                "movq " #t0 ", " #t1 "                                \n\t" /* dst */\
1408
                "psubusb "MANGLE(pQPb2)", " #t0 "                \n\t"\
1409
                "paddusb "MANGLE(pQPb2)", " #t1 "                \n\t"\
1410
                PMAXUB(t0, pplx)\
1411
                PMINUB(t1, pplx, t0)\
1412
                "paddb " #sx ", " #ppsx "                        \n\t"\
1413
                "paddb " #psx ", " #ppsx "                        \n\t"\
1414
                "#paddb "MANGLE(b02)", " #ppsx "                \n\t"\
1415
                "pand "MANGLE(b08)", " #ppsx "                        \n\t"\
1416
                "pcmpeqb " #lx ", " #ppsx "                        \n\t"\
1417
                "pand " #ppsx ", " #pplx "                        \n\t"\
1418
                "pandn " #dst ", " #ppsx "                        \n\t"\
1419
                "por " #pplx ", " #ppsx "                        \n\t"\
1420
                "movq " #ppsx ", " #dst "                        \n\t"\
1421
                "movq "MANGLE(temp1)", " #lx "                        \n\t"
1422

    
1423
/*
1424
0000000
1425
1111111
1426

1427
1111110
1428
1111101
1429
1111100
1430
1111011
1431
1111010
1432
1111001
1433

1434
1111000
1435
1110111
1436

1437
*/
1438
//DERING_CORE(dst,src                  ,ppsx ,psx  ,sx   ,pplx ,plx  ,lx   ,t0   ,t1)
1439
DERING_CORE((%%eax),(%%eax, %1)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1440
DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1441
DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1442
DERING_CORE((%0, %1, 4),(%%ebx)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1443
DERING_CORE((%%ebx),(%%ebx, %1)        ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1444
DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1445
DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1446
DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1447

    
1448
                "1:                        \n\t"
1449
                : : "r" (src), "r" (stride), "r" (QP)
1450
                : "%eax", "%ebx", "%ecx"
1451
        );
1452
#else
1453
        int y;
1454
        int min=255;
1455
        int max=0;
1456
        int avg;
1457
        uint8_t *p;
1458
        int s[10];
1459

    
1460
        for(y=1; y<9; y++)
1461
        {
1462
                int x;
1463
                p= src + stride*y;
1464
                for(x=1; x<9; x++)
1465
                {
1466
                        p++;
1467
                        if(*p > max) max= *p;
1468
                        if(*p < min) min= *p;
1469
                }
1470
        }
1471
        avg= (min + max + 1)/2;
1472

    
1473
        if(max - min <deringThreshold) return;
1474

    
1475
        for(y=0; y<10; y++)
1476
        {
1477
                int x;
1478
                int t = 0;
1479
                p= src + stride*y;
1480
                for(x=0; x<10; x++)
1481
                {
1482
                        if(*p > avg) t |= (1<<x);
1483
                        p++;
1484
                }
1485
                t |= (~t)<<16;
1486
                t &= (t<<1) & (t>>1);
1487
                s[y] = t;
1488
        }
1489

    
1490
        for(y=1; y<9; y++)
1491
        {
1492
                int x;
1493
                int t = s[y-1] & s[y] & s[y+1];
1494
                t|= t>>16;
1495

    
1496
                p= src + stride*y;
1497
                for(x=1; x<9; x++)
1498
                {
1499
                        p++;
1500
                        if(t & (1<<x))
1501
                        {
1502
                                int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1503
                                      +2*(*(p     -1)) + 4*(*p         ) + 2*(*(p     +1))
1504
                                      +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
1505
                                f= (f + 8)>>4;
1506

    
1507
#ifdef DEBUG_DERING_THRESHOLD
1508
                                asm volatile("emms\n\t":);
1509
                                {
1510
                                static long long numPixels=0;
1511
                                if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1512
//                                if((max-min)<20 || (max-min)*QP<200)
1513
//                                if((max-min)*QP < 500)
1514
//                                if(max-min<QP/2)
1515
                                if(max-min < 20)
1516
                                {
1517
                                        static int numSkiped=0;
1518
                                        static int errorSum=0;
1519
                                        static int worstQP=0;
1520
                                        static int worstRange=0;
1521
                                        static int worstDiff=0;
1522
                                        int diff= (f - *p);
1523
                                        int absDiff= ABS(diff);
1524
                                        int error= diff*diff;
1525

    
1526
                                        if(x==1 || x==8 || y==1 || y==8) continue;
1527

    
1528
                                        numSkiped++;
1529
                                        if(absDiff > worstDiff)
1530
                                        {
1531
                                                worstDiff= absDiff;
1532
                                                worstQP= QP;
1533
                                                worstRange= max-min;
1534
                                        }
1535
                                        errorSum+= error;
1536

    
1537
                                        if(1024LL*1024LL*1024LL % numSkiped == 0)
1538
                                        {
1539
                                                printf( "sum:%1.3f, skip:%d, wQP:%d, "
1540
                                                        "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1541
                                                        (float)errorSum/numSkiped, numSkiped, worstQP, worstRange,
1542
                                                        worstDiff, (float)numSkiped/numPixels);
1543
                                        }
1544
                                }
1545
                                }
1546
#endif
1547
                                if     (*p + 2*QP < f) *p= *p + 2*QP;
1548
                                else if(*p - 2*QP > f) *p= *p - 2*QP;
1549
                                else *p=f;
1550
                        }
1551
                }
1552
        }
1553
#ifdef DEBUG_DERING_THRESHOLD
1554
        if(max-min < 20)
1555
        {
1556
                for(y=1; y<9; y++)
1557
                {
1558
                        int x;
1559
                        int t = 0;
1560
                        p= src + stride*y;
1561
                        for(x=1; x<9; x++)
1562
                        {
1563
                                p++;
1564
                                *p = MIN(*p + 20, 255);
1565
                        }
1566
                }
1567
//                src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1568
        }
1569
#endif
1570
#endif
1571
}
1572

    
1573
/**
1574
 * Deinterlaces the given block
1575
 * will be called for every 8x8 block and can read & write from line 4-15
1576
 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1577
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1578
 */
1579
static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
1580
{
1581
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1582
        src+= 4*stride;
1583
        asm volatile(
1584
                "leal (%0, %1), %%eax                                \n\t"
1585
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
1586
//        0        1        2        3        4        5        6        7        8        9
1587
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
1588

    
1589
                "movq (%0), %%mm0                                \n\t"
1590
                "movq (%%eax, %1), %%mm1                        \n\t"
1591
                PAVGB(%%mm1, %%mm0)
1592
                "movq %%mm0, (%%eax)                                \n\t"
1593
                "movq (%0, %1, 4), %%mm0                        \n\t"
1594
                PAVGB(%%mm0, %%mm1)
1595
                "movq %%mm1, (%%eax, %1, 2)                        \n\t"
1596
                "movq (%%ebx, %1), %%mm1                        \n\t"
1597
                PAVGB(%%mm1, %%mm0)
1598
                "movq %%mm0, (%%ebx)                                \n\t"
1599
                "movq (%0, %1, 8), %%mm0                        \n\t"
1600
                PAVGB(%%mm0, %%mm1)
1601
                "movq %%mm1, (%%ebx, %1, 2)                        \n\t"
1602

    
1603
                : : "r" (src), "r" (stride)
1604
                : "%eax", "%ebx"
1605
        );
1606
#else
1607
        int x;
1608
        src+= 4*stride;
1609
        for(x=0; x<8; x++)
1610
        {
1611
                src[stride]   = (src[0]        + src[stride*2])>>1;
1612
                src[stride*3] = (src[stride*2] + src[stride*4])>>1;
1613
                src[stride*5] = (src[stride*4] + src[stride*6])>>1;
1614
                src[stride*7] = (src[stride*6] + src[stride*8])>>1;
1615
                src++;
1616
        }
1617
#endif
1618
}
1619

    
1620
/**
1621
 * Deinterlaces the given block
1622
 * will be called for every 8x8 block and can read & write from line 4-15
1623
 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1624
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1625
 * this filter will read lines 3-15 and write 7-13
1626
 * no cliping in C version
1627
 */
1628
static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
1629
{
1630
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1631
        src+= stride*3;
1632
        asm volatile(
1633
                "leal (%0, %1), %%eax                                \n\t"
1634
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
1635
                "leal (%%ebx, %1, 4), %%ecx                        \n\t"
1636
                "addl %1, %%ecx                                        \n\t"
1637
                "pxor %%mm7, %%mm7                                \n\t"
1638
//        0        1        2        3        4        5        6        7        8        9        10
1639
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1 ecx
1640

    
1641
#define DEINT_CUBIC(a,b,c,d,e)\
1642
                "movq " #a ", %%mm0                                \n\t"\
1643
                "movq " #b ", %%mm1                                \n\t"\
1644
                "movq " #d ", %%mm2                                \n\t"\
1645
                "movq " #e ", %%mm3                                \n\t"\
1646
                PAVGB(%%mm2, %%mm1)                                        /* (b+d) /2 */\
1647
                PAVGB(%%mm3, %%mm0)                                        /* a(a+e) /2 */\
1648
                "movq %%mm0, %%mm2                                \n\t"\
1649
                "punpcklbw %%mm7, %%mm0                                \n\t"\
1650
                "punpckhbw %%mm7, %%mm2                                \n\t"\
1651
                "movq %%mm1, %%mm3                                \n\t"\
1652
                "punpcklbw %%mm7, %%mm1                                \n\t"\
1653
                "punpckhbw %%mm7, %%mm3                                \n\t"\
1654
                "psubw %%mm1, %%mm0                                \n\t"        /* L(a+e - (b+d))/2 */\
1655
                "psubw %%mm3, %%mm2                                \n\t"        /* H(a+e - (b+d))/2 */\
1656
                "psraw $3, %%mm0                                \n\t"        /* L(a+e - (b+d))/16 */\
1657
                "psraw $3, %%mm2                                \n\t"        /* H(a+e - (b+d))/16 */\
1658
                "psubw %%mm0, %%mm1                                \n\t"        /* L(9b + 9d - a - e)/16 */\
1659
                "psubw %%mm2, %%mm3                                \n\t"        /* H(9b + 9d - a - e)/16 */\
1660
                "packuswb %%mm3, %%mm1                                \n\t"\
1661
                "movq %%mm1, " #c "                                \n\t"
1662

    
1663
DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
1664
DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
1665
DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
1666
DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))
1667

    
1668
                : : "r" (src), "r" (stride)
1669
                : "%eax", "%ebx", "ecx"
1670
        );
1671
#else
1672
        int x;
1673
        src+= stride*3;
1674
        for(x=0; x<8; x++)
1675
        {
1676
                src[stride*3] = (-src[0]        + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4;
1677
                src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4;
1678
                src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4;
1679
                src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4;
1680
                src++;
1681
        }
1682
#endif
1683
}
1684

    
1685
/**
1686
 * Deinterlaces the given block
1687
 * will be called for every 8x8 block and can read & write from line 4-15
1688
 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1689
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1690
 * will shift the image up by 1 line (FIXME if this is a problem)
1691
 * this filter will read lines 4-13 and write 4-11
1692
 */
1693
static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride)
1694
{
1695
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1696
        src+= 4*stride;
1697
        asm volatile(
1698
                "leal (%0, %1), %%eax                                \n\t"
1699
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
1700
//        0        1        2        3        4        5        6        7        8        9
1701
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
1702

    
1703
                "movq (%0), %%mm0                                \n\t" // L0
1704
                "movq (%%eax, %1), %%mm1                        \n\t" // L2
1705
                PAVGB(%%mm1, %%mm0)                                      // L0+L2
1706
                "movq (%%eax), %%mm2                                \n\t" // L1
1707
                PAVGB(%%mm2, %%mm0)
1708
                "movq %%mm0, (%0)                                \n\t"
1709
                "movq (%%eax, %1, 2), %%mm0                        \n\t" // L3
1710
                PAVGB(%%mm0, %%mm2)                                      // L1+L3
1711
                PAVGB(%%mm1, %%mm2)                                      // 2L2 + L1 + L3
1712
                "movq %%mm2, (%%eax)                                \n\t"
1713
                "movq (%0, %1, 4), %%mm2                        \n\t" // L4
1714
                PAVGB(%%mm2, %%mm1)                                      // L2+L4
1715
                PAVGB(%%mm0, %%mm1)                                      // 2L3 + L2 + L4
1716
                "movq %%mm1, (%%eax, %1)                        \n\t"
1717
                "movq (%%ebx), %%mm1                                \n\t" // L5
1718
                PAVGB(%%mm1, %%mm0)                                      // L3+L5
1719
                PAVGB(%%mm2, %%mm0)                                      // 2L4 + L3 + L5
1720
                "movq %%mm0, (%%eax, %1, 2)                        \n\t"
1721
                "movq (%%ebx, %1), %%mm0                        \n\t" // L6
1722
                PAVGB(%%mm0, %%mm2)                                      // L4+L6
1723
                PAVGB(%%mm1, %%mm2)                                      // 2L5 + L4 + L6
1724
                "movq %%mm2, (%0, %1, 4)                        \n\t"
1725
                "movq (%%ebx, %1, 2), %%mm2                        \n\t" // L7
1726
                PAVGB(%%mm2, %%mm1)                                      // L5+L7
1727
                PAVGB(%%mm0, %%mm1)                                      // 2L6 + L5 + L7
1728
                "movq %%mm1, (%%ebx)                                \n\t"
1729
                "movq (%0, %1, 8), %%mm1                        \n\t" // L8
1730
                PAVGB(%%mm1, %%mm0)                                      // L6+L8
1731
                PAVGB(%%mm2, %%mm0)                                      // 2L7 + L6 + L8
1732
                "movq %%mm0, (%%ebx, %1)                        \n\t"
1733
                "movq (%%ebx, %1, 4), %%mm0                        \n\t" // L9
1734
                PAVGB(%%mm0, %%mm2)                                      // L7+L9
1735
                PAVGB(%%mm1, %%mm2)                                      // 2L8 + L7 + L9
1736
                "movq %%mm2, (%%ebx, %1, 2)                        \n\t"
1737

    
1738

    
1739
                : : "r" (src), "r" (stride)
1740
                : "%eax", "%ebx"
1741
        );
1742
#else
1743
        int x;
1744
        src+= 4*stride;
1745
        for(x=0; x<8; x++)
1746
        {
1747
                src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2;
1748
                src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
1749
                src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
1750
                src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
1751
                src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
1752
                src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
1753
                src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
1754
                src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
1755
                src++;
1756
        }
1757
#endif
1758
}
1759

    
1760
/**
1761
 * Deinterlaces the given block
1762
 * will be called for every 8x8 block and can read & write from line 4-15,
1763
 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1764
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1765
 */
1766
static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
1767
{
1768
#ifdef HAVE_MMX
1769
        src+= 4*stride;
1770
#ifdef HAVE_MMX2
1771
        asm volatile(
1772
                "leal (%0, %1), %%eax                                \n\t"
1773
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
1774
//        0        1        2        3        4        5        6        7        8        9
1775
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
1776

    
1777
                "movq (%0), %%mm0                                \n\t" //
1778
                "movq (%%eax, %1), %%mm2                        \n\t" //
1779
                "movq (%%eax), %%mm1                                \n\t" //
1780
                "movq %%mm0, %%mm3                                \n\t"
1781
                "pmaxub %%mm1, %%mm0                                \n\t" //
1782
                "pminub %%mm3, %%mm1                                \n\t" //
1783
                "pmaxub %%mm2, %%mm1                                \n\t" //
1784
                "pminub %%mm1, %%mm0                                \n\t"
1785
                "movq %%mm0, (%%eax)                                \n\t"
1786

    
1787
                "movq (%0, %1, 4), %%mm0                        \n\t" //
1788
                "movq (%%eax, %1, 2), %%mm1                        \n\t" //
1789
                "movq %%mm2, %%mm3                                \n\t"
1790
                "pmaxub %%mm1, %%mm2                                \n\t" //
1791
                "pminub %%mm3, %%mm1                                \n\t" //
1792
                "pmaxub %%mm0, %%mm1                                \n\t" //
1793
                "pminub %%mm1, %%mm2                                \n\t"
1794
                "movq %%mm2, (%%eax, %1, 2)                        \n\t"
1795

    
1796
                "movq (%%ebx), %%mm2                                \n\t" //
1797
                "movq (%%ebx, %1), %%mm1                        \n\t" //
1798
                "movq %%mm2, %%mm3                                \n\t"
1799
                "pmaxub %%mm0, %%mm2                                \n\t" //
1800
                "pminub %%mm3, %%mm0                                \n\t" //
1801
                "pmaxub %%mm1, %%mm0                                \n\t" //
1802
                "pminub %%mm0, %%mm2                                \n\t"
1803
                "movq %%mm2, (%%ebx)                                \n\t"
1804

    
1805
                "movq (%%ebx, %1, 2), %%mm2                        \n\t" //
1806
                "movq (%0, %1, 8), %%mm0                        \n\t" //
1807
                "movq %%mm2, %%mm3                                \n\t"
1808
                "pmaxub %%mm0, %%mm2                                \n\t" //
1809
                "pminub %%mm3, %%mm0                                \n\t" //
1810
                "pmaxub %%mm1, %%mm0                                \n\t" //
1811
                "pminub %%mm0, %%mm2                                \n\t"
1812
                "movq %%mm2, (%%ebx, %1, 2)                        \n\t"
1813

    
1814

    
1815
                : : "r" (src), "r" (stride)
1816
                : "%eax", "%ebx"
1817
        );
1818

    
1819
#else // MMX without MMX2
1820
        asm volatile(
1821
                "leal (%0, %1), %%eax                                \n\t"
1822
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
1823
//        0        1        2        3        4        5        6        7        8        9
1824
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
1825
                "pxor %%mm7, %%mm7                                \n\t"
1826

    
1827
#define MEDIAN(a,b,c)\
1828
                "movq " #a ", %%mm0                                \n\t"\
1829
                "movq " #b ", %%mm2                                \n\t"\
1830
                "movq " #c ", %%mm1                                \n\t"\
1831
                "movq %%mm0, %%mm3                                \n\t"\
1832
                "movq %%mm1, %%mm4                                \n\t"\
1833
                "movq %%mm2, %%mm5                                \n\t"\
1834
                "psubusb %%mm1, %%mm3                                \n\t"\
1835
                "psubusb %%mm2, %%mm4                                \n\t"\
1836
                "psubusb %%mm0, %%mm5                                \n\t"\
1837
                "pcmpeqb %%mm7, %%mm3                                \n\t"\
1838
                "pcmpeqb %%mm7, %%mm4                                \n\t"\
1839
                "pcmpeqb %%mm7, %%mm5                                \n\t"\
1840
                "movq %%mm3, %%mm6                                \n\t"\
1841
                "pxor %%mm4, %%mm3                                \n\t"\
1842
                "pxor %%mm5, %%mm4                                \n\t"\
1843
                "pxor %%mm6, %%mm5                                \n\t"\
1844
                "por %%mm3, %%mm1                                \n\t"\
1845
                "por %%mm4, %%mm2                                \n\t"\
1846
                "por %%mm5, %%mm0                                \n\t"\
1847
                "pand %%mm2, %%mm0                                \n\t"\
1848
                "pand %%mm1, %%mm0                                \n\t"\
1849
                "movq %%mm0, " #b "                                \n\t"
1850

    
1851
MEDIAN((%0), (%%eax), (%%eax, %1))
1852
MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
1853
MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
1854
MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8))
1855

    
1856
                : : "r" (src), "r" (stride)
1857
                : "%eax", "%ebx"
1858
        );
1859
#endif // MMX
1860
#else
1861
        //FIXME
1862
        int x;
1863
        src+= 4*stride;
1864
        for(x=0; x<8; x++)
1865
        {
1866
                src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2;
1867
                src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
1868
                src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
1869
                src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
1870
                src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
1871
                src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
1872
                src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
1873
                src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
1874
                src++;
1875
        }
1876
#endif
1877
}
1878

    
1879
#ifdef HAVE_MMX
1880
/**
1881
 * transposes and shift the given 8x8 Block into dst1 and dst2
1882
 */
1883
static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
1884
{
1885
        asm(
1886
                "leal (%0, %1), %%eax                                \n\t"
1887
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
1888
//        0        1        2        3        4        5        6        7        8        9
1889
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
1890
                "movq (%0), %%mm0                \n\t" // 12345678
1891
                "movq (%%eax), %%mm1                \n\t" // abcdefgh
1892
                "movq %%mm0, %%mm2                \n\t" // 12345678
1893
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
1894
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
1895

    
1896
                "movq (%%eax, %1), %%mm1        \n\t"
1897
                "movq (%%eax, %1, 2), %%mm3        \n\t"
1898
                "movq %%mm1, %%mm4                \n\t"
1899
                "punpcklbw %%mm3, %%mm1                \n\t"
1900
                "punpckhbw %%mm3, %%mm4                \n\t"
1901

    
1902
                "movq %%mm0, %%mm3                \n\t"
1903
                "punpcklwd %%mm1, %%mm0                \n\t"
1904
                "punpckhwd %%mm1, %%mm3                \n\t"
1905
                "movq %%mm2, %%mm1                \n\t"
1906
                "punpcklwd %%mm4, %%mm2                \n\t"
1907
                "punpckhwd %%mm4, %%mm1                \n\t"
1908

    
1909
                "movd %%mm0, 128(%2)                \n\t"
1910
                "psrlq $32, %%mm0                \n\t"
1911
                "movd %%mm0, 144(%2)                \n\t"
1912
                "movd %%mm3, 160(%2)                \n\t"
1913
                "psrlq $32, %%mm3                \n\t"
1914
                "movd %%mm3, 176(%2)                \n\t"
1915
                "movd %%mm3, 48(%3)                \n\t"
1916
                "movd %%mm2, 192(%2)                \n\t"
1917
                "movd %%mm2, 64(%3)                \n\t"
1918
                "psrlq $32, %%mm2                \n\t"
1919
                "movd %%mm2, 80(%3)                \n\t"
1920
                "movd %%mm1, 96(%3)                \n\t"
1921
                "psrlq $32, %%mm1                \n\t"
1922
                "movd %%mm1, 112(%3)                \n\t"
1923

    
1924
                "movq (%0, %1, 4), %%mm0        \n\t" // 12345678
1925
                "movq (%%ebx), %%mm1                \n\t" // abcdefgh
1926
                "movq %%mm0, %%mm2                \n\t" // 12345678
1927
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
1928
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
1929

    
1930
                "movq (%%ebx, %1), %%mm1        \n\t"
1931
                "movq (%%ebx, %1, 2), %%mm3        \n\t"
1932
                "movq %%mm1, %%mm4                \n\t"
1933
                "punpcklbw %%mm3, %%mm1                \n\t"
1934
                "punpckhbw %%mm3, %%mm4                \n\t"
1935

    
1936
                "movq %%mm0, %%mm3                \n\t"
1937
                "punpcklwd %%mm1, %%mm0                \n\t"
1938
                "punpckhwd %%mm1, %%mm3                \n\t"
1939
                "movq %%mm2, %%mm1                \n\t"
1940
                "punpcklwd %%mm4, %%mm2                \n\t"
1941
                "punpckhwd %%mm4, %%mm1                \n\t"
1942

    
1943
                "movd %%mm0, 132(%2)                \n\t"
1944
                "psrlq $32, %%mm0                \n\t"
1945
                "movd %%mm0, 148(%2)                \n\t"
1946
                "movd %%mm3, 164(%2)                \n\t"
1947
                "psrlq $32, %%mm3                \n\t"
1948
                "movd %%mm3, 180(%2)                \n\t"
1949
                "movd %%mm3, 52(%3)                \n\t"
1950
                "movd %%mm2, 196(%2)                \n\t"
1951
                "movd %%mm2, 68(%3)                \n\t"
1952
                "psrlq $32, %%mm2                \n\t"
1953
                "movd %%mm2, 84(%3)                \n\t"
1954
                "movd %%mm1, 100(%3)                \n\t"
1955
                "psrlq $32, %%mm1                \n\t"
1956
                "movd %%mm1, 116(%3)                \n\t"
1957

    
1958

    
1959
        :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
1960
        : "%eax", "%ebx"
1961
        );
1962
}
1963

    
1964
/**
1965
 * transposes the given 8x8 block
1966
 */
1967
static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
1968
{
1969
        asm(
1970
                "leal (%0, %1), %%eax                                \n\t"
1971
                "leal (%%eax, %1, 4), %%ebx                        \n\t"
1972
//        0        1        2        3        4        5        6        7        8        9
1973
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ebx        ebx+%1        ebx+2%1        %0+8%1        ebx+4%1
1974
                "movq (%2), %%mm0                \n\t" // 12345678
1975
                "movq 16(%2), %%mm1                \n\t" // abcdefgh
1976
                "movq %%mm0, %%mm2                \n\t" // 12345678
1977
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
1978
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
1979

    
1980
                "movq 32(%2), %%mm1                \n\t"
1981
                "movq 48(%2), %%mm3                \n\t"
1982
                "movq %%mm1, %%mm4                \n\t"
1983
                "punpcklbw %%mm3, %%mm1                \n\t"
1984
                "punpckhbw %%mm3, %%mm4                \n\t"
1985

    
1986
                "movq %%mm0, %%mm3                \n\t"
1987
                "punpcklwd %%mm1, %%mm0                \n\t"
1988
                "punpckhwd %%mm1, %%mm3                \n\t"
1989
                "movq %%mm2, %%mm1                \n\t"
1990
                "punpcklwd %%mm4, %%mm2                \n\t"
1991
                "punpckhwd %%mm4, %%mm1                \n\t"
1992

    
1993
                "movd %%mm0, (%0)                \n\t"
1994
                "psrlq $32, %%mm0                \n\t"
1995
                "movd %%mm0, (%%eax)                \n\t"
1996
                "movd %%mm3, (%%eax, %1)        \n\t"
1997
                "psrlq $32, %%mm3                \n\t"
1998
                "movd %%mm3, (%%eax, %1, 2)        \n\t"
1999
                "movd %%mm2, (%0, %1, 4)        \n\t"
2000
                "psrlq $32, %%mm2                \n\t"
2001
                "movd %%mm2, (%%ebx)                \n\t"
2002
                "movd %%mm1, (%%ebx, %1)        \n\t"
2003
                "psrlq $32, %%mm1                \n\t"
2004
                "movd %%mm1, (%%ebx, %1, 2)        \n\t"
2005

    
2006

    
2007
                "movq 64(%2), %%mm0                \n\t" // 12345678
2008
                "movq 80(%2), %%mm1                \n\t" // abcdefgh
2009
                "movq %%mm0, %%mm2                \n\t" // 12345678
2010
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
2011
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
2012

    
2013
                "movq 96(%2), %%mm1                \n\t"
2014
                "movq 112(%2), %%mm3                \n\t"
2015
                "movq %%mm1, %%mm4                \n\t"
2016
                "punpcklbw %%mm3, %%mm1                \n\t"
2017
                "punpckhbw %%mm3, %%mm4                \n\t"
2018

    
2019
                "movq %%mm0, %%mm3                \n\t"
2020
                "punpcklwd %%mm1, %%mm0                \n\t"
2021
                "punpckhwd %%mm1, %%mm3                \n\t"
2022
                "movq %%mm2, %%mm1                \n\t"
2023
                "punpcklwd %%mm4, %%mm2                \n\t"
2024
                "punpckhwd %%mm4, %%mm1                \n\t"
2025

    
2026
                "movd %%mm0, 4(%0)                \n\t"
2027
                "psrlq $32, %%mm0                \n\t"
2028
                "movd %%mm0, 4(%%eax)                \n\t"
2029
                "movd %%mm3, 4(%%eax, %1)        \n\t"
2030
                "psrlq $32, %%mm3                \n\t"
2031
                "movd %%mm3, 4(%%eax, %1, 2)        \n\t"
2032
                "movd %%mm2, 4(%0, %1, 4)        \n\t"
2033
                "psrlq $32, %%mm2                \n\t"
2034
                "movd %%mm2, 4(%%ebx)                \n\t"
2035
                "movd %%mm1, 4(%%ebx, %1)        \n\t"
2036
                "psrlq $32, %%mm1                \n\t"
2037
                "movd %%mm1, 4(%%ebx, %1, 2)        \n\t"
2038

    
2039
        :: "r" (dst), "r" (dstStride), "r" (src)
2040
        : "%eax", "%ebx"
2041
        );
2042
}
2043
#endif
2044
//static int test=0;
2045

    
2046
static void inline RENAME(tempNoiseReducer)(uint8_t *src, int stride,
2047
                                    uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
2048
{
2049
#define FAST_L2_DIFF
2050
//#define L1_DIFF //u should change the thresholds too if u try that one
2051
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2052
        asm volatile(
2053
                "leal (%2, %2, 2), %%eax                        \n\t" // 3*stride
2054
                "leal (%2, %2, 4), %%ebx                        \n\t" // 5*stride
2055
                "leal (%%ebx, %2, 2), %%ecx                        \n\t" // 7*stride
2056
//        0        1        2        3        4        5        6        7        8        9
2057
//        %x        %x+%2        %x+2%2        %x+eax        %x+4%2        %x+ebx        %x+2eax        %x+ecx        %x+8%2
2058
//FIXME reorder?
2059
#ifdef L1_DIFF //needs mmx2
2060
                "movq (%0), %%mm0                                \n\t" // L0
2061
                "psadbw (%1), %%mm0                                \n\t" // |L0-R0|
2062
                "movq (%0, %2), %%mm1                                \n\t" // L1
2063
                "psadbw (%1, %2), %%mm1                                \n\t" // |L1-R1|
2064
                "movq (%0, %2, 2), %%mm2                        \n\t" // L2
2065
                "psadbw (%1, %2, 2), %%mm2                        \n\t" // |L2-R2|
2066
                "movq (%0, %%eax), %%mm3                        \n\t" // L3
2067
                "psadbw (%1, %%eax), %%mm3                        \n\t" // |L3-R3|
2068

    
2069
                "movq (%0, %2, 4), %%mm4                        \n\t" // L4
2070
                "paddw %%mm1, %%mm0                                \n\t"
2071
                "psadbw (%1, %2, 4), %%mm4                        \n\t" // |L4-R4|
2072
                "movq (%0, %%ebx), %%mm5                        \n\t" // L5
2073
                "paddw %%mm2, %%mm0                                \n\t"
2074
                "psadbw (%1, %%ebx), %%mm5                        \n\t" // |L5-R5|
2075
                "movq (%0, %%eax, 2), %%mm6                        \n\t" // L6
2076
                "paddw %%mm3, %%mm0                                \n\t"
2077
                "psadbw (%1, %%eax, 2), %%mm6                        \n\t" // |L6-R6|
2078
                "movq (%0, %%ecx), %%mm7                        \n\t" // L7
2079
                "paddw %%mm4, %%mm0                                \n\t"
2080
                "psadbw (%1, %%ecx), %%mm7                        \n\t" // |L7-R7|
2081
                "paddw %%mm5, %%mm6                                \n\t"
2082
                "paddw %%mm7, %%mm6                                \n\t"
2083
                "paddw %%mm6, %%mm0                                \n\t"
2084
#elif defined (FAST_L2_DIFF)
2085
                "pcmpeqb %%mm7, %%mm7                                \n\t"
2086
                "movq "MANGLE(b80)", %%mm6                        \n\t"
2087
                "pxor %%mm0, %%mm0                                \n\t"
2088
#define L2_DIFF_CORE(a, b)\
2089
                "movq " #a ", %%mm5                                \n\t"\
2090
                "movq " #b ", %%mm2                                \n\t"\
2091
                "pxor %%mm7, %%mm2                                \n\t"\
2092
                PAVGB(%%mm2, %%mm5)\
2093
                "paddb %%mm6, %%mm5                                \n\t"\
2094
                "movq %%mm5, %%mm2                                \n\t"\
2095
                "psllw $8, %%mm5                                \n\t"\
2096
                "pmaddwd %%mm5, %%mm5                                \n\t"\
2097
                "pmaddwd %%mm2, %%mm2                                \n\t"\
2098
                "paddd %%mm2, %%mm5                                \n\t"\
2099
                "psrld $14, %%mm5                                \n\t"\
2100
                "paddd %%mm5, %%mm0                                \n\t"
2101

    
2102
L2_DIFF_CORE((%0), (%1))
2103
L2_DIFF_CORE((%0, %2), (%1, %2))
2104
L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
2105
L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
2106
L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
2107
L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx))
2108
L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
2109
L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
2110

    
2111
#else
2112
                "pxor %%mm7, %%mm7                                \n\t"
2113
                "pxor %%mm0, %%mm0                                \n\t"
2114
#define L2_DIFF_CORE(a, b)\
2115
                "movq " #a ", %%mm5                                \n\t"\
2116
                "movq " #b ", %%mm2                                \n\t"\
2117
                "movq %%mm5, %%mm1                                \n\t"\
2118
                "movq %%mm2, %%mm3                                \n\t"\
2119
                "punpcklbw %%mm7, %%mm5                                \n\t"\
2120
                "punpckhbw %%mm7, %%mm1                                \n\t"\
2121
                "punpcklbw %%mm7, %%mm2                                \n\t"\
2122
                "punpckhbw %%mm7, %%mm3                                \n\t"\
2123
                "psubw %%mm2, %%mm5                                \n\t"\
2124
                "psubw %%mm3, %%mm1                                \n\t"\
2125
                "pmaddwd %%mm5, %%mm5                                \n\t"\
2126
                "pmaddwd %%mm1, %%mm1                                \n\t"\
2127
                "paddd %%mm1, %%mm5                                \n\t"\
2128
                "paddd %%mm5, %%mm0                                \n\t"
2129

    
2130
L2_DIFF_CORE((%0), (%1))
2131
L2_DIFF_CORE((%0, %2), (%1, %2))
2132
L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
2133
L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
2134
L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
2135
L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx))
2136
L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
2137
L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
2138

    
2139
#endif
2140

    
2141
                "movq %%mm0, %%mm4                                \n\t"
2142
                "psrlq $32, %%mm0                                \n\t"
2143
                "paddd %%mm0, %%mm4                                \n\t"
2144
                "movd %%mm4, %%ecx                                \n\t"
2145
                "shll $2, %%ecx                                        \n\t"
2146
                "movl %3, %%ebx                                        \n\t"
2147
                "addl -4(%%ebx), %%ecx                                \n\t"
2148
                "addl 4(%%ebx), %%ecx                                \n\t"
2149
                "addl -1024(%%ebx), %%ecx                        \n\t"
2150
                "addl $4, %%ecx                                        \n\t"
2151
                "addl 1024(%%ebx), %%ecx                        \n\t"
2152
                "shrl $3, %%ecx                                        \n\t"
2153
                "movl %%ecx, (%%ebx)                                \n\t"
2154
                "leal (%%eax, %2, 2), %%ebx                        \n\t" // 5*stride
2155

    
2156
//                "movl %3, %%ecx                                        \n\t"
2157
//                "movl %%ecx, test                                \n\t"
2158
//                "jmp 4f \n\t"
2159
                "cmpl 4+"MANGLE(maxTmpNoise)", %%ecx                \n\t"
2160
                " jb 2f                                                \n\t"
2161
                "cmpl 8+"MANGLE(maxTmpNoise)", %%ecx                \n\t"
2162
                " jb 1f                                                \n\t"
2163

    
2164
                "leal (%%ebx, %2, 2), %%ecx                        \n\t" // 7*stride
2165
                "movq (%0), %%mm0                                \n\t" // L0
2166
                "movq (%0, %2), %%mm1                                \n\t" // L1
2167
                "movq (%0, %2, 2), %%mm2                        \n\t" // L2
2168
                "movq (%0, %%eax), %%mm3                        \n\t" // L3
2169
                "movq (%0, %2, 4), %%mm4                        \n\t" // L4
2170
                "movq (%0, %%ebx), %%mm5                        \n\t" // L5
2171
                "movq (%0, %%eax, 2), %%mm6                        \n\t" // L6
2172
                "movq (%0, %%ecx), %%mm7                        \n\t" // L7
2173
                "movq %%mm0, (%1)                                \n\t" // L0
2174
                "movq %%mm1, (%1, %2)                                \n\t" // L1
2175
                "movq %%mm2, (%1, %2, 2)                        \n\t" // L2
2176
                "movq %%mm3, (%1, %%eax)                        \n\t" // L3
2177
                "movq %%mm4, (%1, %2, 4)                        \n\t" // L4
2178
                "movq %%mm5, (%1, %%ebx)                        \n\t" // L5
2179
                "movq %%mm6, (%1, %%eax, 2)                        \n\t" // L6
2180
                "movq %%mm7, (%1, %%ecx)                        \n\t" // L7
2181
                "jmp 4f                                                \n\t"
2182

    
2183
                "1:                                                \n\t"
2184
                "leal (%%ebx, %2, 2), %%ecx                        \n\t" // 7*stride
2185
                "movq (%0), %%mm0                                \n\t" // L0
2186
                PAVGB((%1), %%mm0)                                      // L0
2187
                "movq (%0, %2), %%mm1                                \n\t" // L1
2188
                PAVGB((%1, %2), %%mm1)                                      // L1
2189
                "movq (%0, %2, 2), %%mm2                        \n\t" // L2
2190
                PAVGB((%1, %2, 2), %%mm2)                              // L2
2191
                "movq (%0, %%eax), %%mm3                        \n\t" // L3
2192
                PAVGB((%1, %%eax), %%mm3)                              // L3
2193
                "movq (%0, %2, 4), %%mm4                        \n\t" // L4
2194
                PAVGB((%1, %2, 4), %%mm4)                              // L4
2195
                "movq (%0, %%ebx), %%mm5                        \n\t" // L5
2196
                PAVGB((%1, %%ebx), %%mm5)                              // L5
2197
                "movq (%0, %%eax, 2), %%mm6                        \n\t" // L6
2198
                PAVGB((%1, %%eax, 2), %%mm6)                              // L6
2199
                "movq (%0, %%ecx), %%mm7                        \n\t" // L7
2200
                PAVGB((%1, %%ecx), %%mm7)                              // L7
2201
                "movq %%mm0, (%1)                                \n\t" // R0
2202
                "movq %%mm1, (%1, %2)                                \n\t" // R1
2203
                "movq %%mm2, (%1, %2, 2)                        \n\t" // R2
2204
                "movq %%mm3, (%1, %%eax)                        \n\t" // R3
2205
                "movq %%mm4, (%1, %2, 4)                        \n\t" // R4
2206
                "movq %%mm5, (%1, %%ebx)                        \n\t" // R5
2207
                "movq %%mm6, (%1, %%eax, 2)                        \n\t" // R6
2208
                "movq %%mm7, (%1, %%ecx)                        \n\t" // R7
2209
                "movq %%mm0, (%0)                                \n\t" // L0
2210
                "movq %%mm1, (%0, %2)                                \n\t" // L1
2211
                "movq %%mm2, (%0, %2, 2)                        \n\t" // L2
2212
                "movq %%mm3, (%0, %%eax)                        \n\t" // L3
2213
                "movq %%mm4, (%0, %2, 4)                        \n\t" // L4
2214
                "movq %%mm5, (%0, %%ebx)                        \n\t" // L5
2215
                "movq %%mm6, (%0, %%eax, 2)                        \n\t" // L6
2216
                "movq %%mm7, (%0, %%ecx)                        \n\t" // L7
2217
                "jmp 4f                                                \n\t"
2218

    
2219
                "2:                                                \n\t"
2220
                "cmpl "MANGLE(maxTmpNoise)", %%ecx                \n\t"
2221
                " jb 3f                                                \n\t"
2222

    
2223
                "leal (%%ebx, %2, 2), %%ecx                        \n\t" // 7*stride
2224
                "movq (%0), %%mm0                                \n\t" // L0
2225
                "movq (%0, %2), %%mm1                                \n\t" // L1
2226
                "movq (%0, %2, 2), %%mm2                        \n\t" // L2
2227
                "movq (%0, %%eax), %%mm3                        \n\t" // L3
2228
                "movq (%1), %%mm4                                \n\t" // R0
2229
                "movq (%1, %2), %%mm5                                \n\t" // R1
2230
                "movq (%1, %2, 2), %%mm6                        \n\t" // R2
2231
                "movq (%1, %%eax), %%mm7                        \n\t" // R3
2232
                PAVGB(%%mm4, %%mm0)
2233
                PAVGB(%%mm5, %%mm1)
2234
                PAVGB(%%mm6, %%mm2)
2235
                PAVGB(%%mm7, %%mm3)
2236
                PAVGB(%%mm4, %%mm0)
2237
                PAVGB(%%mm5, %%mm1)
2238
                PAVGB(%%mm6, %%mm2)
2239
                PAVGB(%%mm7, %%mm3)
2240
                "movq %%mm0, (%1)                                \n\t" // R0
2241
                "movq %%mm1, (%1, %2)                                \n\t" // R1
2242
                "movq %%mm2, (%1, %2, 2)                        \n\t" // R2
2243
                "movq %%mm3, (%1, %%eax)                        \n\t" // R3
2244
                "movq %%mm0, (%0)                                \n\t" // L0
2245
                "movq %%mm1, (%0, %2)                                \n\t" // L1
2246
                "movq %%mm2, (%0, %2, 2)                        \n\t" // L2
2247
                "movq %%mm3, (%0, %%eax)                        \n\t" // L3
2248

    
2249
                "movq (%0, %2, 4), %%mm0                        \n\t" // L4
2250
                "movq (%0, %%ebx), %%mm1                        \n\t" // L5
2251
                "movq (%0, %%eax, 2), %%mm2                        \n\t" // L6
2252
                "movq (%0, %%ecx), %%mm3                        \n\t" // L7
2253
                "movq (%1, %2, 4), %%mm4                        \n\t" // R4
2254
                "movq (%1, %%ebx), %%mm5                        \n\t" // R5
2255
                "movq (%1, %%eax, 2), %%mm6                        \n\t" // R6
2256
                "movq (%1, %%ecx), %%mm7                        \n\t" // R7
2257
                PAVGB(%%mm4, %%mm0)
2258
                PAVGB(%%mm5, %%mm1)
2259
                PAVGB(%%mm6, %%mm2)
2260
                PAVGB(%%mm7, %%mm3)
2261
                PAVGB(%%mm4, %%mm0)
2262
                PAVGB(%%mm5, %%mm1)
2263
                PAVGB(%%mm6, %%mm2)
2264
                PAVGB(%%mm7, %%mm3)
2265
                "movq %%mm0, (%1, %2, 4)                        \n\t" // R4
2266
                "movq %%mm1, (%1, %%ebx)                        \n\t" // R5
2267
                "movq %%mm2, (%1, %%eax, 2)                        \n\t" // R6
2268
                "movq %%mm3, (%1, %%ecx)                        \n\t" // R7
2269
                "movq %%mm0, (%0, %2, 4)                        \n\t" // L4
2270
                "movq %%mm1, (%0, %%ebx)                        \n\t" // L5
2271
                "movq %%mm2, (%0, %%eax, 2)                        \n\t" // L6
2272
                "movq %%mm3, (%0, %%ecx)                        \n\t" // L7
2273
                "jmp 4f                                                \n\t"
2274

    
2275
                "3:                                                \n\t"
2276
                "leal (%%ebx, %2, 2), %%ecx                        \n\t" // 7*stride
2277
                "movq (%0), %%mm0                                \n\t" // L0
2278
                "movq (%0, %2), %%mm1                                \n\t" // L1
2279
                "movq (%0, %2, 2), %%mm2                        \n\t" // L2
2280
                "movq (%0, %%eax), %%mm3                        \n\t" // L3
2281
                "movq (%1), %%mm4                                \n\t" // R0
2282
                "movq (%1, %2), %%mm5                                \n\t" // R1
2283
                "movq (%1, %2, 2), %%mm6                        \n\t" // R2
2284
                "movq (%1, %%eax), %%mm7                        \n\t" // R3
2285
                PAVGB(%%mm4, %%mm0)
2286
                PAVGB(%%mm5, %%mm1)
2287
                PAVGB(%%mm6, %%mm2)
2288
                PAVGB(%%mm7, %%mm3)
2289
                PAVGB(%%mm4, %%mm0)
2290
                PAVGB(%%mm5, %%mm1)
2291
                PAVGB(%%mm6, %%mm2)
2292
                PAVGB(%%mm7, %%mm3)
2293
                PAVGB(%%mm4, %%mm0)
2294
                PAVGB(%%mm5, %%mm1)
2295
                PAVGB(%%mm6, %%mm2)
2296
                PAVGB(%%mm7, %%mm3)
2297
                "movq %%mm0, (%1)                                \n\t" // R0
2298
                "movq %%mm1, (%1, %2)                                \n\t" // R1
2299
                "movq %%mm2, (%1, %2, 2)                        \n\t" // R2
2300
                "movq %%mm3, (%1, %%eax)                        \n\t" // R3
2301
                "movq %%mm0, (%0)                                \n\t" // L0
2302
                "movq %%mm1, (%0, %2)                                \n\t" // L1
2303
                "movq %%mm2, (%0, %2, 2)                        \n\t" // L2
2304
                "movq %%mm3, (%0, %%eax)                        \n\t" // L3
2305

    
2306
                "movq (%0, %2, 4), %%mm0                        \n\t" // L4
2307
                "movq (%0, %%ebx), %%mm1                        \n\t" // L5
2308
                "movq (%0, %%eax, 2), %%mm2                        \n\t" // L6
2309
                "movq (%0, %%ecx), %%mm3                        \n\t" // L7
2310
                "movq (%1, %2, 4), %%mm4                        \n\t" // R4
2311
                "movq (%1, %%ebx), %%mm5                        \n\t" // R5
2312
                "movq (%1, %%eax, 2), %%mm6                        \n\t" // R6
2313
                "movq (%1, %%ecx), %%mm7                        \n\t" // R7
2314
                PAVGB(%%mm4, %%mm0)
2315
                PAVGB(%%mm5, %%mm1)
2316
                PAVGB(%%mm6, %%mm2)
2317
                PAVGB(%%mm7, %%mm3)
2318
                PAVGB(%%mm4, %%mm0)
2319
                PAVGB(%%mm5, %%mm1)
2320
                PAVGB(%%mm6, %%mm2)
2321
                PAVGB(%%mm7, %%mm3)
2322
                PAVGB(%%mm4, %%mm0)
2323
                PAVGB(%%mm5, %%mm1)
2324
                PAVGB(%%mm6, %%mm2)
2325
                PAVGB(%%mm7, %%mm3)
2326
                "movq %%mm0, (%1, %2, 4)                        \n\t" // R4
2327
                "movq %%mm1, (%1, %%ebx)                        \n\t" // R5
2328
                "movq %%mm2, (%1, %%eax, 2)                        \n\t" // R6
2329
                "movq %%mm3, (%1, %%ecx)                        \n\t" // R7
2330
                "movq %%mm0, (%0, %2, 4)                        \n\t" // L4
2331
                "movq %%mm1, (%0, %%ebx)                        \n\t" // L5
2332
                "movq %%mm2, (%0, %%eax, 2)                        \n\t" // L6
2333
                "movq %%mm3, (%0, %%ecx)                        \n\t" // L7
2334

    
2335
                "4:                                                \n\t"
2336

    
2337
                :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast)
2338
                : "%eax", "%ebx", "%ecx", "memory"
2339
                );
2340
//printf("%d\n", test);
2341
#else
2342
        int y;
2343
        int d=0;
2344
        int sysd=0;
2345
        int i;
2346

    
2347
        for(y=0; y<8; y++)
2348
        {
2349
                int x;
2350
                for(x=0; x<8; x++)
2351
                {
2352
                        int ref= tempBlured[ x + y*stride ];
2353
                        int cur= src[ x + y*stride ];
2354
                        int d1=ref - cur;
2355
//                        if(x==0 || x==7) d1+= d1>>1;
2356
//                        if(y==0 || y==7) d1+= d1>>1;
2357
//                        d+= ABS(d1);
2358
                        d+= d1*d1;
2359
                        sysd+= d1;
2360
                }
2361
        }
2362
        i=d;
2363
        d=         (
2364
                4*d
2365
                +(*(tempBluredPast-256))
2366
                +(*(tempBluredPast-1))+ (*(tempBluredPast+1))
2367
                +(*(tempBluredPast+256))
2368
                +4)>>3;
2369
        *tempBluredPast=i;
2370
//        ((*tempBluredPast)*3 + d + 2)>>2;
2371

    
2372
//printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]);
2373
/*
2374
Switch between
2375
 1  0  0  0  0  0  0  (0)
2376
64 32 16  8  4  2  1  (1)
2377
64 48 36 27 20 15 11 (33) (approx)
2378
64 56 49 43 37 33 29 (200) (approx)
2379
*/
2380
        if(d > maxNoise[1])
2381
        {
2382
                if(d < maxNoise[2])
2383
                {
2384
                        for(y=0; y<8; y++)
2385
                        {
2386
                                int x;
2387
                                for(x=0; x<8; x++)
2388
                                {
2389
                                        int ref= tempBlured[ x + y*stride ];
2390
                                        int cur= src[ x + y*stride ];
2391
                                        tempBlured[ x + y*stride ]=
2392
                                        src[ x + y*stride ]=
2393
                                                (ref + cur + 1)>>1;
2394
                                }
2395
                        }
2396
                }
2397
                else
2398
                {
2399
                        for(y=0; y<8; y++)
2400
                        {
2401
                                int x;
2402
                                for(x=0; x<8; x++)
2403
                                {
2404
                                        tempBlured[ x + y*stride ]= src[ x + y*stride ];
2405
                                }
2406
                        }
2407
                }
2408
        }
2409
        else
2410
        {
2411
                if(d < maxNoise[0])
2412
                {
2413
                        for(y=0; y<8; y++)
2414
                        {
2415
                                int x;
2416
                                for(x=0; x<8; x++)
2417
                                {
2418
                                        int ref= tempBlured[ x + y*stride ];
2419
                                        int cur= src[ x + y*stride ];
2420
                                        tempBlured[ x + y*stride ]=
2421
                                        src[ x + y*stride ]=
2422
                                                (ref*7 + cur + 4)>>3;
2423
                                }
2424
                        }
2425
                }
2426
                else
2427
                {
2428
                        for(y=0; y<8; y++)
2429
                        {
2430
                                int x;
2431
                                for(x=0; x<8; x++)
2432
                                {
2433
                                        int ref= tempBlured[ x + y*stride ];
2434
                                        int cur= src[ x + y*stride ];
2435
                                        tempBlured[ x + y*stride ]=
2436
                                        src[ x + y*stride ]=
2437
                                                (ref*3 + cur + 2)>>2;
2438
                                }
2439
                        }
2440
                }
2441
        }
2442
#endif
2443
}
2444

    
2445
static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2446
        QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
2447

    
2448
/**
2449
 * Copies a block from src to dst and fixes the blacklevel
2450
 * levelFix == 0 -> dont touch the brighness & contrast
2451
 */
2452
#undef SCALED_CPY
2453

    
2454
static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
2455
        int levelFix)
2456
{
2457
#ifndef HAVE_MMX
2458
        int i;
2459
#endif
2460
        if(levelFix)
2461
        {
2462
#ifdef HAVE_MMX
2463
                                        asm volatile(
2464
                                                "leal (%0,%2), %%eax        \n\t"
2465
                                                "leal (%1,%3), %%ebx        \n\t"
2466
                                                "movq "MANGLE(packedYOffset)", %%mm2\n\t"
2467
                                                "movq "MANGLE(packedYScale)", %%mm3\n\t"
2468
                                                "pxor %%mm4, %%mm4        \n\t"
2469
#ifdef HAVE_MMX2
2470
#define SCALED_CPY(src1, src2, dst1, dst2)                                        \
2471
                                                "movq " #src1 ", %%mm0        \n\t"\
2472
                                                "movq " #src1 ", %%mm5        \n\t"\
2473
                                                "movq " #src2 ", %%mm1        \n\t"\
2474
                                                "movq " #src2 ", %%mm6        \n\t"\
2475
                                                "punpcklbw %%mm0, %%mm0 \n\t"\
2476
                                                "punpckhbw %%mm5, %%mm5 \n\t"\
2477
                                                "punpcklbw %%mm1, %%mm1 \n\t"\
2478
                                                "punpckhbw %%mm6, %%mm6 \n\t"\
2479
                                                "pmulhuw %%mm3, %%mm0        \n\t"\
2480
                                                "pmulhuw %%mm3, %%mm5        \n\t"\
2481
                                                "pmulhuw %%mm3, %%mm1        \n\t"\
2482
                                                "pmulhuw %%mm3, %%mm6        \n\t"\
2483
                                                "psubw %%mm2, %%mm0        \n\t"\
2484
                                                "psubw %%mm2, %%mm5        \n\t"\
2485
                                                "psubw %%mm2, %%mm1        \n\t"\
2486
                                                "psubw %%mm2, %%mm6        \n\t"\
2487
                                                "packuswb %%mm5, %%mm0        \n\t"\
2488
                                                "packuswb %%mm6, %%mm1        \n\t"\
2489
                                                "movq %%mm0, " #dst1 "        \n\t"\
2490
                                                "movq %%mm1, " #dst2 "        \n\t"\
2491

    
2492
#else //HAVE_MMX2
2493
#define SCALED_CPY(src1, src2, dst1, dst2)                                        \
2494
                                                "movq " #src1 ", %%mm0        \n\t"\
2495
                                                "movq " #src1 ", %%mm5        \n\t"\
2496
                                                "punpcklbw %%mm4, %%mm0 \n\t"\
2497
                                                "punpckhbw %%mm4, %%mm5 \n\t"\
2498
                                                "psubw %%mm2, %%mm0        \n\t"\
2499
                                                "psubw %%mm2, %%mm5        \n\t"\
2500
                                                "movq " #src2 ", %%mm1        \n\t"\
2501
                                                "psllw $6, %%mm0        \n\t"\
2502
                                                "psllw $6, %%mm5        \n\t"\
2503
                                                "pmulhw %%mm3, %%mm0        \n\t"\
2504
                                                "movq " #src2 ", %%mm6        \n\t"\
2505
                                                "pmulhw %%mm3, %%mm5        \n\t"\
2506
                                                "punpcklbw %%mm4, %%mm1 \n\t"\
2507
                                                "punpckhbw %%mm4, %%mm6 \n\t"\
2508
                                                "psubw %%mm2, %%mm1        \n\t"\
2509
                                                "psubw %%mm2, %%mm6        \n\t"\
2510
                                                "psllw $6, %%mm1        \n\t"\
2511
                                                "psllw $6, %%mm6        \n\t"\
2512
                                                "pmulhw %%mm3, %%mm1        \n\t"\
2513
                                                "pmulhw %%mm3, %%mm6        \n\t"\
2514
                                                "packuswb %%mm5, %%mm0        \n\t"\
2515
                                                "packuswb %%mm6, %%mm1        \n\t"\
2516
                                                "movq %%mm0, " #dst1 "        \n\t"\
2517
                                                "movq %%mm1, " #dst2 "        \n\t"\
2518

    
2519
#endif //!HAVE_MMX2
2520

    
2521
SCALED_CPY((%0)       , (%0, %2)      , (%1)       , (%1, %3))
2522
SCALED_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%ebx, %3, 2))
2523
SCALED_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%ebx, %3, 4))
2524
                                                "leal (%%eax,%2,4), %%eax        \n\t"
2525
                                                "leal (%%ebx,%3,4), %%ebx        \n\t"
2526
SCALED_CPY((%%eax, %2), (%%eax, %2, 2), (%%ebx, %3), (%%ebx, %3, 2))
2527

    
2528

    
2529
                                                : : "r"(src),
2530
                                                "r"(dst),
2531
                                                "r" (srcStride),
2532
                                                "r" (dstStride)
2533
                                                : "%eax", "%ebx"
2534
                                        );
2535
#else
2536
                                for(i=0; i<8; i++)
2537
                                        memcpy(        &(dst[dstStride*i]),
2538
                                                &(src[srcStride*i]), BLOCK_SIZE);
2539
#endif
2540
        }
2541
        else
2542
        {
2543
#ifdef HAVE_MMX
2544
                                        asm volatile(
2545
                                                "leal (%0,%2), %%eax        \n\t"
2546
                                                "leal (%1,%3), %%ebx        \n\t"
2547

    
2548
#define SIMPLE_CPY(src1, src2, dst1, dst2)                                \
2549
                                                "movq " #src1 ", %%mm0        \n\t"\
2550
                                                "movq " #src2 ", %%mm1        \n\t"\
2551
                                                "movq %%mm0, " #dst1 "        \n\t"\
2552
                                                "movq %%mm1, " #dst2 "        \n\t"\
2553

    
2554
SIMPLE_CPY((%0)       , (%0, %2)      , (%1)       , (%1, %3))
2555
SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%ebx, %3, 2))
2556
SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%ebx, %3, 4))
2557
                                                "leal (%%eax,%2,4), %%eax        \n\t"
2558
                                                "leal (%%ebx,%3,4), %%ebx        \n\t"
2559
SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%ebx, %3), (%%ebx, %3, 2))
2560

    
2561
                                                : : "r" (src),
2562
                                                "r" (dst),
2563
                                                "r" (srcStride),
2564
                                                "r" (dstStride)
2565
                                                : "%eax", "%ebx"
2566
                                        );
2567
#else
2568
                                for(i=0; i<8; i++)
2569
                                        memcpy(        &(dst[dstStride*i]),
2570
                                                &(src[srcStride*i]), BLOCK_SIZE);
2571
#endif
2572
        }
2573
}
2574

    
2575
/**
2576
 * Duplicates the given 8 src pixels ? times upward
2577
 */
2578
static inline void RENAME(duplicate)(uint8_t src[], int stride)
2579
{
2580
#ifdef HAVE_MMX
2581
        asm volatile(
2582
                "movq (%0), %%mm0                \n\t"
2583
                "addl %1, %0                        \n\t"
2584
                "movq %%mm0, (%0)                \n\t"
2585
                "movq %%mm0, (%0, %1)                \n\t"
2586
                "movq %%mm0, (%0, %1, 2)        \n\t"
2587
                : "+r" (src)
2588
                : "r" (-stride)
2589
        );
2590
#else
2591
        int i;
2592
        uint8_t *p=src;
2593
        for(i=0; i<3; i++)
2594
        {
2595
                p-= stride;
2596
                memcpy(p, src, 8);
2597
        }
2598
#endif
2599
}
2600

    
2601
/**
2602
 * Filters array of bytes (Y or U or V values)
2603
 */
2604
static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2605
        QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode)
2606
{
2607
        int x,y;
2608
#ifdef COMPILE_TIME_MODE
2609
        const int mode= COMPILE_TIME_MODE;
2610
#else
2611
        const int mode= isColor ? ppMode->chromMode : ppMode->lumMode;
2612
#endif
2613
        /* we need 64bit here, otherwise we'll have a problem
           after watching a black picture for 5 hours */
2615
        static uint64_t *yHistogram= NULL;
2616
        int black=0, white=255; // blackest black and whitest white in the picture
2617
        int QPCorrecture= 256*256;
2618

    
2619
        /* Temporary buffers for handling the last row(s) */
2620
        static uint8_t *tempDst= NULL;
2621
        static uint8_t *tempSrc= NULL;
2622

    
2623
        /* Temporary buffers for handling the last block */
2624
        static uint8_t *tempDstBlock= NULL;
2625
        static uint8_t *tempSrcBlock= NULL;
2626

    
2627
        /* Temporal noise reducing buffers */
2628
        static uint8_t *tempBlured[3]= {NULL,NULL,NULL};
2629
        static uint32_t *tempBluredPast[3]= {NULL,NULL,NULL};
2630

    
2631
        int copyAhead;
2632

    
2633
#ifdef PP_FUNNY_STRIDE
2634
        uint8_t *dstBlockPtrBackup;
2635
        uint8_t *srcBlockPtrBackup;
2636
#endif
2637

    
2638
#ifdef MORE_TIMING
2639
        long long T0, T1, diffTime=0;
2640
#endif
2641
#ifdef TIMING
2642
        long long memcpyTime=0, vertTime=0, horizTime=0, sumTime;
2643
        sumTime= rdtsc();
2644
#endif
2645
        dcOffset= ppMode->maxDcDiff;
2646
        dcThreshold= ppMode->maxDcDiff*2 + 1;
2647

    
2648
#ifdef HAVE_MMX
2649
        maxTmpNoise[0]= ppMode->maxTmpNoise[0];
2650
        maxTmpNoise[1]= ppMode->maxTmpNoise[1];
2651
        maxTmpNoise[2]= ppMode->maxTmpNoise[2];
2652
        
2653
        mmxDCOffset= 0x7F - dcOffset;
2654
        mmxDCThreshold= 0x7F - dcThreshold;
2655

    
2656
        mmxDCOffset*= 0x0101010101010101LL;
2657
        mmxDCThreshold*= 0x0101010101010101LL;
2658
#endif
2659

    
2660
        if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
2661
        else if(mode & LINEAR_BLEND_DEINT_FILTER) copyAhead=14;
2662
        else if(   (mode & V_DEBLOCK)
2663
                || (mode & LINEAR_IPOL_DEINT_FILTER)
2664
                || (mode & MEDIAN_DEINT_FILTER)) copyAhead=13;
2665
        else if(mode & V_X1_FILTER) copyAhead=11;
2666
        else if(mode & V_RK1_FILTER) copyAhead=10;
2667
        else if(mode & DERING) copyAhead=9;
2668
        else copyAhead=8;
2669

    
2670
        copyAhead-= 8;
2671

    
2672
        if(tempDst==NULL)
2673
        {
2674
                tempDst= (uint8_t*)memalign(8, 1024*24);
2675
                tempSrc= (uint8_t*)memalign(8, 1024*24);
2676
                tempDstBlock= (uint8_t*)memalign(8, 1024*24);
2677
                tempSrcBlock= (uint8_t*)memalign(8, 1024*24);
2678
        }
2679

    
2680
        if(tempBlured[isColor]==NULL && (mode & TEMP_NOISE_FILTER))
2681
        {
2682
//                printf("%d %d %d\n", isColor, dstStride, height);
2683
                //FIXME works only as long as the size doesnt increase
2684
                //Note:the +17*1024 is just there so i dont have to worry about r/w over te end
2685
                tempBlured[isColor]= (uint8_t*)memalign(8, dstStride*((height+7)&(~7)) + 17*1024);
2686
                tempBluredPast[isColor]= (uint32_t*)memalign(8, 256*((height+7)&(~7))/2 + 17*1024);
2687

    
2688
                memset(tempBlured[isColor], 0, dstStride*((height+7)&(~7)) + 17*1024);
2689
                memset(tempBluredPast[isColor], 0, 256*((height+7)&(~7))/2 + 17*1024);
2690
        }
2691

    
2692
        if(!yHistogram)
2693
        {
2694
                int i;
2695
                yHistogram= (uint64_t*)malloc(8*256);
2696
                for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
2697

    
2698
                if(mode & FULL_Y_RANGE)
2699
                {
2700
                        ppMode->maxAllowedY=255;
2701
                        ppMode->minAllowedY=0;
2702
                }
2703
        }
2704

    
2705
        if(!isColor)
2706
        {
2707
                uint64_t sum= 0;
2708
                int i;
2709
                static int framenum= -1;
2710
                uint64_t maxClipped;
2711
                uint64_t clipped;
2712
                double scale;
2713

    
2714
                framenum++;
2715
                if(framenum == 1) yHistogram[0]= width*height/64*15/256;
2716

    
2717
                for(i=0; i<256; i++)
2718
                {
2719
                        sum+= yHistogram[i];
2720
//                        printf("%d ", yHistogram[i]);
2721
                }
2722
//                printf("\n\n");
2723

    
2724
                /* we allways get a completly black picture first */
2725
                maxClipped= (uint64_t)(sum * maxClippedThreshold);
2726

    
2727
                clipped= sum;
2728
                for(black=255; black>0; black--)
2729
                {
2730
                        if(clipped < maxClipped) break;
2731
                        clipped-= yHistogram[black];
2732
                }
2733

    
2734
                clipped= sum;
2735
                for(white=0; white<256; white++)
2736
                {
2737
                        if(clipped < maxClipped) break;
2738
                        clipped-= yHistogram[white];
2739
                }
2740

    
2741
                scale= (double)(ppMode->maxAllowedY - ppMode->minAllowedY) / (double)(white-black);
2742

    
2743
#ifdef HAVE_MMX2
2744
                packedYScale= (uint16_t)(scale*256.0 + 0.5);
2745
                packedYOffset= (((black*packedYScale)>>8) - ppMode->minAllowedY) & 0xFFFF;
2746
#else
2747
                packedYScale= (uint16_t)(scale*1024.0 + 0.5);
2748
                packedYOffset= (black - ppMode->minAllowedY) & 0xFFFF;
2749
#endif
2750

    
2751
                packedYOffset|= packedYOffset<<32;
2752
                packedYOffset|= packedYOffset<<16;
2753

    
2754
                packedYScale|= packedYScale<<32;
2755
                packedYScale|= packedYScale<<16;
2756
                
2757
                if(mode & LEVEL_FIX)        QPCorrecture= (int)(scale*256*256 + 0.5);
2758
                else                        QPCorrecture= 256*256;
2759
        }
2760
        else
2761
        {
2762
                packedYScale= 0x0100010001000100LL;
2763
                packedYOffset= 0;
2764
                QPCorrecture= 256*256;
2765
        }
2766

    
2767
        /* copy & deinterlace first row of blocks */
2768
        y=-BLOCK_SIZE;
2769
        {
2770
                uint8_t *srcBlock= &(src[y*srcStride]);
2771
                uint8_t *dstBlock= tempDst + dstStride;
2772

    
2773
                // From this point on it is guranteed that we can read and write 16 lines downward
2774
                // finish 1 block before the next otherwise we?ll might have a problem
2775
                // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
2776
                for(x=0; x<width; x+=BLOCK_SIZE)
2777
                {
2778

    
2779
#ifdef HAVE_MMX2
2780
/*
2781
                        prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
2782
                        prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
2783
                        prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
2784
                        prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
2785
*/
2786

    
2787
                        asm(
2788
                                "movl %4, %%eax                        \n\t"
2789
                                "shrl $2, %%eax                        \n\t"
2790
                                "andl $6, %%eax                        \n\t"
2791
                                "addl %5, %%eax                        \n\t"
2792
                                "movl %%eax, %%ebx                \n\t"
2793
                                "imul %1, %%eax                        \n\t"
2794
                                "imul %3, %%ebx                        \n\t"
2795
                                "prefetchnta 32(%%eax, %0)        \n\t"
2796
                                "prefetcht0 32(%%ebx, %2)        \n\t"
2797
                                "addl %1, %%eax                        \n\t"
2798
                                "addl %3, %%ebx                        \n\t"
2799
                                "prefetchnta 32(%%eax, %0)        \n\t"
2800
                                "prefetcht0 32(%%ebx, %2)        \n\t"
2801
                        :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
2802
                        "m" (x), "m" (copyAhead)
2803
                        : "%eax", "%ebx"
2804
                        );
2805

    
2806
#elif defined(HAVE_3DNOW)
2807
//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
2808
/*                        prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
2809
                        prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
2810
                        prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
2811
                        prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
2812
*/
2813
#endif
2814

    
2815
                        RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
2816
                                srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX);
2817

    
2818
                        RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
2819

    
2820
                        if(mode & LINEAR_IPOL_DEINT_FILTER)
2821
                                RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
2822
                        else if(mode & LINEAR_BLEND_DEINT_FILTER)
2823
                                RENAME(deInterlaceBlendLinear)(dstBlock, dstStride);
2824
                        else if(mode & MEDIAN_DEINT_FILTER)
2825
                                RENAME(deInterlaceMedian)(dstBlock, dstStride);
2826
                        else if(mode & CUBIC_IPOL_DEINT_FILTER)
2827
                                RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
2828
/*                        else if(mode & CUBIC_BLEND_DEINT_FILTER)
2829
                                RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
2830
*/
2831
                        dstBlock+=8;
2832
                        srcBlock+=8;
2833
                }
2834
                memcpy(dst, tempDst + 9*dstStride, copyAhead*dstStride );
2835
        }
2836

    
2837
        for(y=0; y<height; y+=BLOCK_SIZE)
2838
        {
2839
                //1% speedup if these are here instead of the inner loop
2840
                uint8_t *srcBlock= &(src[y*srcStride]);
2841
                uint8_t *dstBlock= &(dst[y*dstStride]);
2842
#ifdef HAVE_MMX
2843
                uint8_t *tempBlock1= tempBlocks;
2844
                uint8_t *tempBlock2= tempBlocks + 8;
2845
#endif
2846
#ifdef ARCH_X86
2847
                int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
2848
                int QPDelta= isColor ? (-1) : 1<<31;
2849
                int QPFrac= 1<<30;
2850
#endif
2851
                int QP=0;
2852
                /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
2853
                   if not than use a temporary buffer */
2854
                if(y+15 >= height)
2855
                {
2856
                        int i;
2857
                        /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
2858
                           blockcopy to dst later */
2859
                        memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
2860
                                srcStride*MAX(height-y-copyAhead, 0) );
2861

    
2862
                        /* duplicate last line of src to fill the void upto line (copyAhead+7) */
2863
                        for(i=MAX(height-y, 8); i<copyAhead+8; i++)
2864
                                memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride);
2865

    
2866
                        /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
2867
                        memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) );
2868

    
2869
                        /* duplicate last line of dst to fill the void upto line (copyAhead) */
2870
                        for(i=height-y+1; i<=copyAhead; i++)
2871
                                memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride);
2872

    
2873
                        dstBlock= tempDst + dstStride;
2874
                        srcBlock= tempSrc;
2875
                }
2876

    
2877
                // From this point on it is guranteed that we can read and write 16 lines downward
2878
                // finish 1 block before the next otherwise we?ll might have a problem
2879
                // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
2880
                for(x=0; x<width; x+=BLOCK_SIZE)
2881
                {
2882
                        const int stride= dstStride;
2883
#ifdef HAVE_MMX
2884
                        uint8_t *tmpXchg;
2885
#endif
2886
#ifdef ARCH_X86
2887
                        QP= *QPptr;
2888
                        asm volatile(
2889
                                "addl %2, %1                \n\t"
2890
                                "sbbl %%eax, %%eax        \n\t"
2891
                                "shll $2, %%eax                \n\t"
2892
                                "subl %%eax, %0                \n\t"
2893
                                : "+r" (QPptr), "+m" (QPFrac)
2894
                                : "r" (QPDelta)
2895
                                : "%eax"
2896
                        );
2897
#else
2898
                        QP= isColor ?
2899
                                QPs[(y>>3)*QPStride + (x>>3)]:
2900
                                QPs[(y>>4)*QPStride + (x>>4)];
2901
#endif
2902
                        if(!isColor)
2903
                        {
2904
                                QP= (QP* QPCorrecture + 256*128)>>16;
2905
                                yHistogram[ srcBlock[srcStride*12 + 4] ]++;
2906
                        }
2907
#ifdef HAVE_MMX
2908
                        asm volatile(
2909
                                "movd %0, %%mm7                                        \n\t"
2910
                                "packuswb %%mm7, %%mm7                                \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
2911
                                "packuswb %%mm7, %%mm7                                \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
2912
                                "packuswb %%mm7, %%mm7                                \n\t" // QP,..., QP
2913
                                "movq %%mm7, "MANGLE(pQPb)"                        \n\t"
2914
                                : : "r" (QP)
2915
                        );
2916
#endif
2917

    
2918
#ifdef MORE_TIMING
2919
                        T0= rdtsc();
2920
#endif
2921

    
2922
#ifdef HAVE_MMX2
2923
/*
2924
                        prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
2925
                        prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
2926
                        prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
2927
                        prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
2928
*/
2929

    
2930
                        asm(
2931
                                "movl %4, %%eax                        \n\t"
2932
                                "shrl $2, %%eax                        \n\t"
2933
                                "andl $6, %%eax                        \n\t"
2934
                                "addl %5, %%eax                        \n\t"
2935
                                "movl %%eax, %%ebx                \n\t"
2936
                                "imul %1, %%eax                        \n\t"
2937
                                "imul %3, %%ebx                        \n\t"
2938
                                "prefetchnta 32(%%eax, %0)        \n\t"
2939
                                "prefetcht0 32(%%ebx, %2)        \n\t"
2940
                                "addl %1, %%eax                        \n\t"
2941
                                "addl %3, %%ebx                        \n\t"
2942
                                "prefetchnta 32(%%eax, %0)        \n\t"
2943
                                "prefetcht0 32(%%ebx, %2)        \n\t"
2944
                        :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
2945
                        "m" (x), "m" (copyAhead)
2946
                        : "%eax", "%ebx"
2947
                        );
2948

    
2949
#elif defined(HAVE_3DNOW)
2950
//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
2951
/*                        prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
2952
                        prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
2953
                        prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
2954
                        prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
2955
*/
2956
#endif
2957

    
2958
#ifdef PP_FUNNY_STRIDE
2959
                        //can we mess with a 8x16 block, if not use a temp buffer, yes again
2960
                        if(x+7 >= width)
2961
                        {
2962
                                int i;
2963
                                dstBlockPtrBackup= dstBlock;
2964
                                srcBlockPtrBackup= srcBlock;
2965

    
2966
                                for(i=0;i<BLOCK_SIZE*2; i++)
2967
                                {
2968
                                        memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x);
2969
                                        memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x);
2970
                                }
2971

    
2972
                                dstBlock= tempDstBlock;
2973
                                srcBlock= tempSrcBlock;
2974
                        }
2975
#endif
2976

    
2977
                        RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
2978
                                srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX);
2979

    
2980
                        if(mode & LINEAR_IPOL_DEINT_FILTER)
2981
                                RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
2982
                        else if(mode & LINEAR_BLEND_DEINT_FILTER)
2983
                                RENAME(deInterlaceBlendLinear)(dstBlock, dstStride);
2984
                        else if(mode & MEDIAN_DEINT_FILTER)
2985
                                RENAME(deInterlaceMedian)(dstBlock, dstStride);
2986
                        else if(mode & CUBIC_IPOL_DEINT_FILTER)
2987
                                RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
2988
/*                        else if(mode & CUBIC_BLEND_DEINT_FILTER)
2989
                                RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
2990
*/
2991

    
2992
                        /* only deblock if we have 2 blocks */
2993
                        if(y + 8 < height)
2994
                        {
2995
#ifdef MORE_TIMING
2996
                                T1= rdtsc();
2997
                                memcpyTime+= T1-T0;
2998
                                T0=T1;
2999
#endif
3000
                                if(mode & V_RK1_FILTER)
3001
                                        RENAME(vertRK1Filter)(dstBlock, stride, QP);
3002
                                else if(mode & V_X1_FILTER)
3003
                                        RENAME(vertX1Filter)(dstBlock, stride, QP);
3004
                                else if(mode & V_DEBLOCK)
3005
                                {
3006
                                        if( RENAME(isVertDC)(dstBlock, stride))
3007
                                        {
3008
                                                if(RENAME(isVertMinMaxOk)(dstBlock, stride, QP))
3009
                                                        RENAME(doVertLowPass)(dstBlock, stride, QP);
3010
                                        }
3011
                                        else
3012
                                                RENAME(doVertDefFilter)(dstBlock, stride, QP);
3013
                                }
3014
#ifdef MORE_TIMING
3015
                                T1= rdtsc();
3016
                                vertTime+= T1-T0;
3017
                                T0=T1;
3018
#endif
3019
                        }
3020

    
3021
#ifdef HAVE_MMX
3022
                        RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
3023
#endif
3024
                        /* check if we have a previous block to deblock it with dstBlock */
3025
                        if(x - 8 >= 0)
3026
                        {
3027
#ifdef MORE_TIMING
3028
                                T0= rdtsc();
3029
#endif
3030
#ifdef HAVE_MMX
3031
                                if(mode & H_RK1_FILTER)
3032
                                        RENAME(vertRK1Filter)(tempBlock1, 16, QP);
3033
                                else if(mode & H_X1_FILTER)
3034
                                        RENAME(vertX1Filter)(tempBlock1, 16, QP);
3035
                                else if(mode & H_DEBLOCK)
3036
                                {
3037
                                        if( RENAME(isVertDC)(tempBlock1, 16) )
3038
                                        {
3039
                                                if(RENAME(isVertMinMaxOk)(tempBlock1, 16, QP))
3040
                                                        RENAME(doVertLowPass)(tempBlock1, 16, QP);
3041
                                        }
3042
                                        else
3043
                                                RENAME(doVertDefFilter)(tempBlock1, 16, QP);
3044
                                }
3045

    
3046
                                RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
3047

    
3048
#else
3049
                                if(mode & H_X1_FILTER)
3050
                                        horizX1Filter(dstBlock-4, stride, QP);
3051
                                else if(mode & H_DEBLOCK)
3052
                                {
3053
                                        if( isHorizDC(dstBlock-4, stride))
3054
                                        {
3055
                                                if(isHorizMinMaxOk(dstBlock-4, stride, QP))
3056
                                                        doHorizLowPass(dstBlock-4, stride, QP);
3057
                                        }
3058
                                        else
3059
                                                doHorizDefFilter(dstBlock-4, stride, QP);
3060
                                }
3061
#endif
3062
#ifdef MORE_TIMING
3063
                                T1= rdtsc();
3064
                                horizTime+= T1-T0;
3065
                                T0=T1;
3066
#endif
3067
                                if(mode & DERING)
3068
                                {
3069
                                //FIXME filter first line
3070
                                        if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, QP);
3071
                                }
3072

    
3073
                                if(mode & TEMP_NOISE_FILTER)
3074
                                {
3075
                                        RENAME(tempNoiseReducer)(dstBlock-8, stride,
3076
                                                tempBlured[isColor] + y*dstStride + x,
3077
                                                tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3078
                                                ppMode->maxTmpNoise);
3079
                                }
3080
                        }
3081

    
3082
#ifdef PP_FUNNY_STRIDE
3083
                        /* did we use a tmp-block buffer */
3084
                        if(x+7 >= width)
3085
                        {
3086
                                int i;
3087
                                dstBlock= dstBlockPtrBackup;
3088
                                srcBlock= srcBlockPtrBackup;
3089

    
3090
                                for(i=0;i<BLOCK_SIZE*2; i++)
3091
                                {
3092
                                        memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x);
3093
                                }
3094
                        }
3095
#endif
3096

    
3097
                        dstBlock+=8;
3098
                        srcBlock+=8;
3099

    
3100
#ifdef HAVE_MMX
3101
                        tmpXchg= tempBlock1;
3102
                        tempBlock1= tempBlock2;
3103
                        tempBlock2 = tmpXchg;
3104
#endif
3105
                }
3106

    
3107
                if(mode & DERING)
3108
                {
3109
                                if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, QP);
3110
                }
3111

    
3112
                if((mode & TEMP_NOISE_FILTER))
3113
                {
3114
                        RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
3115
                                tempBlured[isColor] + y*dstStride + x,
3116
                                tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3117
                                ppMode->maxTmpNoise);
3118
                }
3119

    
3120
                /* did we use a tmp buffer for the last lines*/
3121
                if(y+15 >= height)
3122
                {
3123
                        uint8_t *dstBlock= &(dst[y*dstStride]);
3124
                        memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
3125
                }
3126
/*
3127
                for(x=0; x<width; x+=32)
3128
                {
3129
                        volatile int i;
3130
                        i+=        + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
3131
                                + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
3132
                                + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
3133
//                                + dstBlock[x +13*dstStride]
3134
//                                + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
3135
                }*/
3136
        }
3137
#ifdef HAVE_3DNOW
3138
        asm volatile("femms");
3139
#elif defined (HAVE_MMX)
3140
        asm volatile("emms");
3141
#endif
3142

    
3143
#ifdef TIMING
3144
        // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
3145
        sumTime= rdtsc() - sumTime;
3146
        if(!isColor)
3147
                printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d    \r",
3148
                        (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
3149
                        (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
3150
                        , black, white);
3151
#endif
3152
#ifdef DEBUG_BRIGHTNESS
3153
        if(!isColor)
3154
        {
3155
                int max=1;
3156
                int i;
3157
                for(i=0; i<256; i++)
3158
                        if(yHistogram[i] > max) max=yHistogram[i];
3159

    
3160
                for(i=1; i<256; i++)
3161
                {
3162
                        int x;
3163
                        int start=yHistogram[i-1]/(max/256+1);
3164
                        int end=yHistogram[i]/(max/256+1);
3165
                        int inc= end > start ? 1 : -1;
3166
                        for(x=start; x!=end+inc; x+=inc)
3167
                                dst[ i*dstStride + x]+=128;
3168
                }
3169

    
3170
                for(i=0; i<100; i+=2)
3171
                {
3172
                        dst[ (white)*dstStride + i]+=128;
3173
                        dst[ (black)*dstStride + i]+=128;
3174
                }
3175

    
3176
        }
3177
#endif
3178

    
3179
}