Statistics
| Branch: | Revision:

ffmpeg / libavcodec / libpostproc / postprocess_template.c @ 20646267

History | View | Annotate | Download (112 KB)

1
/*
2
    Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
3

4
    This program is free software; you can redistribute it and/or modify
5
    it under the terms of the GNU General Public License as published by
6
    the Free Software Foundation; either version 2 of the License, or
7
    (at your option) any later version.
8

9
    This program is distributed in the hope that it will be useful,
10
    but WITHOUT ANY WARRANTY; without even the implied warranty of
11
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
    GNU General Public License for more details.
13

14
    You should have received a copy of the GNU General Public License
15
    along with this program; if not, write to the Free Software
16
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17
*/
18

    
19
/**
20
 * @file postprocess_template.c
21
 * mmx/mmx2/3dnow postprocess code.
22
 */
23

    
24

    
25
#undef PAVGB
#undef PMINUB
#undef PMAXUB

/*
 * Helper macros that expand to inline-asm text (AT&T operand order:
 * source first, destination last). The template is compiled several times
 * with different HAVE_* settings, hence the #undef's above.
 */

/*
 * PAVGB(a,b): b = per-byte unsigned average of a and b.
 * Only available with MMX2 (pavgb) or 3DNow! (pavgusb); left undefined for
 * plain MMX, so code using it must be guarded accordingly.
 */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/*
 * PMINUB(a,b,t): b = per-byte unsigned minimum of a and b.
 * The plain MMX fallback computes b - max(b - a, 0) and clobbers the
 * scratch register t; the MMX2 variant ignores t.
 */
#ifdef HAVE_MMX2
#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
#define PMINUB(a,b,t) \
        "movq " #b ", " #t " \n\t"\
        "psubusb " #a ", " #t " \n\t"\
        "psubb " #t ", " #b " \n\t"
#endif

/*
 * PMAXUB(a,b): b = per-byte unsigned maximum of a and b.
 * The plain MMX fallback computes max(b - a, 0) + a.
 */
#ifdef HAVE_MMX2
#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
#define PMAXUB(a,b) \
        "psubusb " #a ", " #b " \n\t"\
        "paddb " #a ", " #b " \n\t"
#endif
51

    
52
//FIXME? |255-0| = 1 (shouldn't be a problem ...)
#ifdef HAVE_MMX
/**
 * Check if the middle 8x8 Block in the given 8x16 block is flat
 *
 * Adjacent lines of the 8x8 block are compared byte-wise: for each of the
 * 7 line pairs, (difference + c->mmxDcOffset[c->nonBQP]) is tested against
 * c->mmxDcThreshold[c->nonBQP] and the hits are counted. In parallel the
 * per-column minimum and maximum over the 8 lines are tracked.
 *
 * @param src    start of the 8x16 block; the 8x8 block starts 4 lines below
 * @param stride distance in bytes between two lines
 * @param c      context providing the thresholds and the packed QP (pQPb)
 * @return 2 if the count does not exceed c->ppMode.flatnessThreshold;
 *         otherwise 0 if some column has (max - min) > 2*QP, else 1
 */
static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){
        int numEq= 0, dcOk;
        src+= stride*4; // src points to begin of the 8x8 Block
// mm7 / mm6 are loaded here and consumed by the asm statement that follows
asm volatile(
                "movq %0, %%mm7                                        \n\t"
                "movq %1, %%mm6                                        \n\t"
                : : "m" (c->mmxDcOffset[c->nonBQP]),  "m" (c->mmxDcThreshold[c->nonBQP])
                );

asm volatile(
                "leal (%2, %3), %%eax                                \n\t"
// line:     0      1      2        3         4        5      6        7
// address:  %2     eax    eax+%3   eax+2%3   %2+4%3   eax'   eax'+%3  eax'+2%3
// (eax = %2 + %3 at first, eax' = eax + 4*%3 after the second leal)
// mm0 = running sum of compare masks, mm3 = column min, mm4 = column max

                "movq (%2), %%mm0                                \n\t"
                "movq (%%eax), %%mm1                                \n\t"
                "movq %%mm0, %%mm3                                \n\t"
                "movq %%mm0, %%mm4                                \n\t"
                PMAXUB(%%mm1, %%mm4)
                PMINUB(%%mm1, %%mm3, %%mm5)
                "psubb %%mm1, %%mm0                                \n\t" // mm0 = difference
                "paddb %%mm7, %%mm0                                \n\t"
                "pcmpgtb %%mm6, %%mm0                                \n\t"

                "movq (%%eax,%3), %%mm2                                \n\t"
                PMAXUB(%%mm2, %%mm4)
                PMINUB(%%mm2, %%mm3, %%mm5)
                "psubb %%mm2, %%mm1                                \n\t"
                "paddb %%mm7, %%mm1                                \n\t"
                "pcmpgtb %%mm6, %%mm1                                \n\t"
                "paddb %%mm1, %%mm0                                \n\t"

                "movq (%%eax, %3, 2), %%mm1                        \n\t"
                PMAXUB(%%mm1, %%mm4)
                PMINUB(%%mm1, %%mm3, %%mm5)
                "psubb %%mm1, %%mm2                                \n\t"
                "paddb %%mm7, %%mm2                                \n\t"
                "pcmpgtb %%mm6, %%mm2                                \n\t"
                "paddb %%mm2, %%mm0                                \n\t"

                "leal (%%eax, %3, 4), %%eax                        \n\t"

                "movq (%2, %3, 4), %%mm2                        \n\t"
                PMAXUB(%%mm2, %%mm4)
                PMINUB(%%mm2, %%mm3, %%mm5)
                "psubb %%mm2, %%mm1                                \n\t"
                "paddb %%mm7, %%mm1                                \n\t"
                "pcmpgtb %%mm6, %%mm1                                \n\t"
                "paddb %%mm1, %%mm0                                \n\t"

                "movq (%%eax), %%mm1                                \n\t"
                PMAXUB(%%mm1, %%mm4)
                PMINUB(%%mm1, %%mm3, %%mm5)
                "psubb %%mm1, %%mm2                                \n\t"
                "paddb %%mm7, %%mm2                                \n\t"
                "pcmpgtb %%mm6, %%mm2                                \n\t"
                "paddb %%mm2, %%mm0                                \n\t"

                "movq (%%eax, %3), %%mm2                        \n\t"
                PMAXUB(%%mm2, %%mm4)
                PMINUB(%%mm2, %%mm3, %%mm5)
                "psubb %%mm2, %%mm1                                \n\t"
                "paddb %%mm7, %%mm1                                \n\t"
                "pcmpgtb %%mm6, %%mm1                                \n\t"
                "paddb %%mm1, %%mm0                                \n\t"

                "movq (%%eax, %3, 2), %%mm1                        \n\t"
                PMAXUB(%%mm1, %%mm4)
                PMINUB(%%mm1, %%mm3, %%mm5)
                "psubb %%mm1, %%mm2                                \n\t"
                "paddb %%mm7, %%mm2                                \n\t"
                "pcmpgtb %%mm6, %%mm2                                \n\t"
                "paddb %%mm2, %%mm0                                \n\t"
                "psubusb %%mm3, %%mm4                                \n\t" // mm4 = max - min per column

                "                                                \n\t"
                // horizontal sum of the 8 per-column counters into the low byte of mm0
#ifdef HAVE_MMX2
                "pxor %%mm7, %%mm7                                \n\t"
                "psadbw %%mm7, %%mm0                                \n\t"
#else
                "movq %%mm0, %%mm1                                \n\t"
                "psrlw $8, %%mm0                                \n\t"
                "paddb %%mm1, %%mm0                                \n\t"
                "movq %%mm0, %%mm1                                \n\t"
                "psrlq $16, %%mm0                                \n\t"
                "paddb %%mm1, %%mm0                                \n\t"
                "movq %%mm0, %%mm1                                \n\t"
                "psrlq $32, %%mm0                                \n\t"
                "paddb %%mm1, %%mm0                                \n\t"
#endif
                "movq %4, %%mm7                                        \n\t" // QP,..., QP
                "paddusb %%mm7, %%mm7                                \n\t" // 2QP ... 2QP
                "psubusb %%mm7, %%mm4                                \n\t" // Diff <= 2QP -> 0
                "packssdw %%mm4, %%mm4                                \n\t"
                "movd %%mm0, %0                                        \n\t"
                "movd %%mm4, %1                                        \n\t"

                : "=r" (numEq), "=r" (dcOk)
                : "r" (src), "r" (stride), "m" (c->pQPb)
                // "memory": the pixel data read through src is not an asm operand
                : "%eax", "memory"
                );

        // each compare hit contributed -1, so negate and keep the low byte
        numEq= (-numEq) &0xFF;
        if(numEq > c->ppMode.flatnessThreshold){
            // dcOk is non-zero when some column's (max - min) exceeded 2*QP
            if(dcOk) return 0;
            else     return 1;
        }else{
            return 2;
        }
}
#endif
168

    
169
/**
170
 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
171
 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
172
 */
173
#ifndef HAVE_ALTIVEC
174
static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
175
{
176
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
177
        src+= stride*3;
178
        asm volatile(        //"movv %0 %1 %2\n\t"
179
                "movq %2, %%mm0                        \n\t"  // QP,..., QP
180
                "pxor %%mm4, %%mm4                                \n\t"
181

    
182
                "movq (%0), %%mm6                                \n\t"
183
                "movq (%0, %1), %%mm5                                \n\t"
184
                "movq %%mm5, %%mm1                                \n\t"
185
                "movq %%mm6, %%mm2                                \n\t"
186
                "psubusb %%mm6, %%mm5                                \n\t"
187
                "psubusb %%mm1, %%mm2                                \n\t"
188
                "por %%mm5, %%mm2                                \n\t" // ABS Diff of lines
189
                "psubusb %%mm0, %%mm2                                \n\t" // diff <= QP -> 0
190
                "pcmpeqb %%mm4, %%mm2                        \n\t" // diff <= QP -> FF
191

    
192
                "pand %%mm2, %%mm6                                \n\t"
193
                "pandn %%mm1, %%mm2                                \n\t"
194
                "por %%mm2, %%mm6                                \n\t"// First Line to Filter
195

    
196
                "movq (%0, %1, 8), %%mm5                        \n\t"
197
                "leal (%0, %1, 4), %%eax                        \n\t"
198
                "leal (%0, %1, 8), %%ecx                        \n\t"
199
                "subl %1, %%ecx                                        \n\t"
200
                "addl %1, %0                                        \n\t" // %0 points to line 1 not 0
201
                "movq (%0, %1, 8), %%mm7                        \n\t"
202
                "movq %%mm5, %%mm1                                \n\t"
203
                "movq %%mm7, %%mm2                                \n\t"
204
                "psubusb %%mm7, %%mm5                                \n\t"
205
                "psubusb %%mm1, %%mm2                                \n\t"
206
                "por %%mm5, %%mm2                                \n\t" // ABS Diff of lines
207
                "psubusb %%mm0, %%mm2                                \n\t" // diff <= QP -> 0
208
                "pcmpeqb %%mm4, %%mm2                        \n\t" // diff <= QP -> FF
209

    
210
                "pand %%mm2, %%mm7                                \n\t"
211
                "pandn %%mm1, %%mm2                                \n\t"
212
                "por %%mm2, %%mm7                                \n\t" // First Line to Filter
213

    
214

    
215
                //         1        2        3        4        5        6        7        8
216
                //        %0        %0+%1        %0+2%1        eax        %0+4%1        eax+2%1        ecx        eax+4%1
217
                // 6 4 2 2 1 1
218
                // 6 4 4 2
219
                // 6 8 2
220

    
221
                "movq (%0, %1), %%mm0                                \n\t" //  1
222
                "movq %%mm0, %%mm1                                \n\t" //  1
223
                PAVGB(%%mm6, %%mm0)                                      //1 1        /2
224
                PAVGB(%%mm6, %%mm0)                                      //3 1        /4
225

    
226
                "movq (%0, %1, 4), %%mm2                        \n\t" //     1
227
                "movq %%mm2, %%mm5                                \n\t" //     1
228
                PAVGB((%%eax), %%mm2)                                      //    11        /2
229
                PAVGB((%0, %1, 2), %%mm2)                              //   211        /4
230
                "movq %%mm2, %%mm3                                \n\t" //   211        /4
231
                "movq (%0), %%mm4                                \n\t" // 1
232
                PAVGB(%%mm4, %%mm3)                                      // 4 211        /8
233
                PAVGB(%%mm0, %%mm3)                                      //642211        /16
234
                "movq %%mm3, (%0)                                \n\t" // X
235
                // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
236
                "movq %%mm1, %%mm0                                \n\t" //  1
237
                PAVGB(%%mm6, %%mm0)                                      //1 1        /2
238
                "movq %%mm4, %%mm3                                \n\t" // 1
239
                PAVGB((%0,%1,2), %%mm3)                                      // 1 1        /2
240
                PAVGB((%%eax,%1,2), %%mm5)                              //     11        /2
241
                PAVGB((%%eax), %%mm5)                                      //    211 /4
242
                PAVGB(%%mm5, %%mm3)                                      // 2 2211 /8
243
                PAVGB(%%mm0, %%mm3)                                      //4242211 /16
244
                "movq %%mm3, (%0,%1)                                \n\t" //  X
245
                // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
246
                PAVGB(%%mm4, %%mm6)                                      //11        /2
247
                "movq (%%ecx), %%mm0                                \n\t" //       1
248
                PAVGB((%%eax, %1, 2), %%mm0)                              //      11/2
249
                "movq %%mm0, %%mm3                                \n\t" //      11/2
250
                PAVGB(%%mm1, %%mm0)                                      //  2   11/4
251
                PAVGB(%%mm6, %%mm0)                                      //222   11/8
252
                PAVGB(%%mm2, %%mm0)                                      //22242211/16
253
                "movq (%0, %1, 2), %%mm2                        \n\t" //   1
254
                "movq %%mm0, (%0, %1, 2)                        \n\t" //   X
255
                // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
256
                "movq (%%eax, %1, 4), %%mm0                        \n\t" //        1
257
                PAVGB((%%ecx), %%mm0)                                      //       11        /2
258
                PAVGB(%%mm0, %%mm6)                                      //11     11        /4
259
                PAVGB(%%mm1, %%mm4)                                      // 11                /2
260
                PAVGB(%%mm2, %%mm1)                                      //  11                /2
261
                PAVGB(%%mm1, %%mm6)                                      //1122   11        /8
262
                PAVGB(%%mm5, %%mm6)                                      //112242211        /16
263
                "movq (%%eax), %%mm5                                \n\t" //    1
264
                "movq %%mm6, (%%eax)                                \n\t" //    X
265
                // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
266
                "movq (%%eax, %1, 4), %%mm6                        \n\t" //        1
267
                PAVGB(%%mm7, %%mm6)                                      //        11        /2
268
                PAVGB(%%mm4, %%mm6)                                      // 11     11        /4
269
                PAVGB(%%mm3, %%mm6)                                      // 11   2211        /8
270
                PAVGB(%%mm5, %%mm2)                                      //   11                /2
271
                "movq (%0, %1, 4), %%mm4                        \n\t" //     1
272
                PAVGB(%%mm4, %%mm2)                                      //   112                /4
273
                PAVGB(%%mm2, %%mm6)                                      // 112242211        /16
274
                "movq %%mm6, (%0, %1, 4)                        \n\t" //     X
275
                // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
276
                PAVGB(%%mm7, %%mm1)                                      //  11     2        /4
277
                PAVGB(%%mm4, %%mm5)                                      //    11                /2
278
                PAVGB(%%mm5, %%mm0)                                      //    11 11        /4
279
                "movq (%%eax, %1, 2), %%mm6                        \n\t" //      1
280
                PAVGB(%%mm6, %%mm1)                                      //  11  4  2        /8
281
                PAVGB(%%mm0, %%mm1)                                      //  11224222        /16
282
                "movq %%mm1, (%%eax, %1, 2)                        \n\t" //      X
283
                // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
284
                PAVGB((%%ecx), %%mm2)                                      //   112 4        /8
285
                "movq (%%eax, %1, 4), %%mm0                        \n\t" //        1
286
                PAVGB(%%mm0, %%mm6)                                      //      1 1        /2
287
                PAVGB(%%mm7, %%mm6)                                      //      1 12        /4
288
                PAVGB(%%mm2, %%mm6)                                      //   1122424        /4
289
                "movq %%mm6, (%%ecx)                                \n\t" //       X
290
                // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
291
                PAVGB(%%mm7, %%mm5)                                      //    11   2        /4
292
                PAVGB(%%mm7, %%mm5)                                      //    11   6        /8
293

    
294
                PAVGB(%%mm3, %%mm0)                                      //      112        /4
295
                PAVGB(%%mm0, %%mm5)                                      //    112246        /16
296
                "movq %%mm5, (%%eax, %1, 4)                        \n\t" //        X
297
                "subl %1, %0                                        \n\t"
298

    
299
                :
300
                : "r" (src), "r" (stride), "m" (c->pQPb)
301
                : "%eax", "%ecx"
302
        );
303
#else
304
        const int l1= stride;
305
        const int l2= stride + l1;
306
        const int l3= stride + l2;
307
        const int l4= stride + l3;
308
        const int l5= stride + l4;
309
        const int l6= stride + l5;
310
        const int l7= stride + l6;
311
        const int l8= stride + l7;
312
        const int l9= stride + l8;
313
        int x;
314
        src+= stride*3;
315
        for(x=0; x<BLOCK_SIZE; x++)
316
        {
317
                const int first= ABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
318
                const int last= ABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
319

    
320
                int sums[10];
321
                sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
322
                sums[1] = sums[0] - first  + src[l4];
323
                sums[2] = sums[1] - first  + src[l5];
324
                sums[3] = sums[2] - first  + src[l6];
325
                sums[4] = sums[3] - first  + src[l7];
326
                sums[5] = sums[4] - src[l1] + src[l8];
327
                sums[6] = sums[5] - src[l2] + last;
328
                sums[7] = sums[6] - src[l3] + last;
329
                sums[8] = sums[7] - src[l4] + last;
330
                sums[9] = sums[8] - src[l5] + last;
331

    
332
                src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
333
                src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
334
                src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
335
                src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
336
                src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
337
                src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
338
                src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
339
                src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;
340

    
341
                src++;
342
        }
343
#endif
344
}
345
#endif //HAVE_ALTIVEC
346

    
347
#if 0
/**
 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
 * values are correctly clipped (MMX2)
 * values are wraparound (C)
 * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient
        0 8 16 24
        x = 8
        x/2 = 4
        x/8 = 1
        1 12 12 23
 *
 * Currently disabled (#if 0). The asm path reads pQPb through MANGLE(),
 * i.e. as a global symbol, and ignores the QP parameter.
 *
 * @param src    start of the block; filtering starts 3 lines below
 * @param stride distance in bytes between two lines
 * @param QP     quantizer, used by the C version as threshold (QP*1.25)
 */
static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        src+= stride*3;
// FIXME rounding
        asm volatile(
                "pxor %%mm7, %%mm7                                \n\t" // 0
                "movq "MANGLE(b80)", %%mm6                        \n\t" // MIN_SIGNED_BYTE
                "leal (%0, %1), %%eax                                \n\t"
                "leal (%%eax, %1, 4), %%ecx                        \n\t"
//        0        1        2        3        4        5        6        7        8        9
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ecx        ecx+%1        ecx+2%1        %0+8%1        ecx+4%1
                "movq "MANGLE(pQPb)", %%mm0                        \n\t" // QP,..., QP
                "movq %%mm0, %%mm1                                \n\t" // QP,..., QP
                "paddusb "MANGLE(b02)", %%mm0                        \n\t"
                "psrlw $2, %%mm0                                \n\t"
                "pand "MANGLE(b3F)", %%mm0                        \n\t" // QP/4,..., QP/4
                "paddusb %%mm1, %%mm0                                \n\t" // QP*1.25 ...
                "movq (%0, %1, 4), %%mm2                        \n\t" // line 4
                "movq (%%ecx), %%mm3                                \n\t" // line 5
                "movq %%mm2, %%mm4                                \n\t" // line 4
                "pcmpeqb %%mm5, %%mm5                                \n\t" // -1
                "pxor %%mm2, %%mm5                                \n\t" // -line 4 - 1
                PAVGB(%%mm3, %%mm5)
                "paddb %%mm6, %%mm5                                \n\t" // (l5-l4)/2
                "psubusb %%mm3, %%mm4                                \n\t"
                "psubusb %%mm2, %%mm3                                \n\t"
                "por %%mm3, %%mm4                                \n\t" // |l4 - l5|
                "psubusb %%mm0, %%mm4                                \n\t"
                "pcmpeqb %%mm7, %%mm4                                \n\t"
                "pand %%mm4, %%mm5                                \n\t" // d/2

//                "paddb %%mm6, %%mm2                                \n\t" // line 4 + 0x80
                "paddb %%mm5, %%mm2                                \n\t"
//                "psubb %%mm6, %%mm2                                \n\t"
                "movq %%mm2, (%0,%1, 4)                                \n\t"

                "movq (%%ecx), %%mm2                                \n\t"
//                "paddb %%mm6, %%mm2                                \n\t" // line 5 + 0x80
                "psubb %%mm5, %%mm2                                \n\t"
//                "psubb %%mm6, %%mm2                                \n\t"
                "movq %%mm2, (%%ecx)                                \n\t"

                "paddb %%mm6, %%mm5                                \n\t"
                "psrlw $2, %%mm5                                \n\t"
                "pand "MANGLE(b3F)", %%mm5                        \n\t"
                "psubb "MANGLE(b20)", %%mm5                        \n\t" // (l5-l4)/8

                "movq (%%eax, %1, 2), %%mm2                        \n\t"
                "paddb %%mm6, %%mm2                                \n\t" // line 3 + 0x80
                "paddsb %%mm5, %%mm2                                \n\t"
                "psubb %%mm6, %%mm2                                \n\t"
                "movq %%mm2, (%%eax, %1, 2)                        \n\t"

                "movq (%%ecx, %1), %%mm2                        \n\t"
                "paddb %%mm6, %%mm2                                \n\t" // line 6 + 0x80
                "psubsb %%mm5, %%mm2                                \n\t"
                "psubb %%mm6, %%mm2                                \n\t"
                "movq %%mm2, (%%ecx, %1)                        \n\t"

                :
                : "r" (src), "r" (stride)
                // "memory": pixels are read and written through src
                : "%eax", "%ecx", "memory"
        );
#else
        const int l1= stride;
        const int l2= stride + l1;
        const int l3= stride + l2;
        const int l4= stride + l3;
        const int l5= stride + l4;
        const int l6= stride + l5;
//        const int l7= stride + l6;
//        const int l8= stride + l7;
//        const int l9= stride + l8;
        int x;
        const int QP15= QP + (QP>>2); // QP*1.25
        src+= stride*3;
        for(x=0; x<BLOCK_SIZE; x++)
        {
                // step across the block edge between line 4 and line 5
                const int v = (src[x+l5] - src[x+l4]);
                if(ABS(v) < QP15)
                {
                        src[x+l3] +=v>>3;
                        src[x+l4] +=v>>1;
                        src[x+l5] -=v>>1;
                        src[x+l6] -=v>>3;
                }
        }

#endif
}
#endif
452

    
453
/**
454
 * Experimental Filter 1
455
 * will not damage linear gradients
456
 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
457
 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
458
 * MMX2 version does correct clipping C version doesnt
459
 */
460
static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
461
{
462
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
463
        src+= stride*3;
464

    
465
        asm volatile(
466
                "pxor %%mm7, %%mm7                                \n\t" // 0
467
                "leal (%0, %1), %%eax                                \n\t"
468
                "leal (%%eax, %1, 4), %%ecx                        \n\t"
469
//        0        1        2        3        4        5        6        7        8        9
470
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ecx        ecx+%1        ecx+2%1        %0+8%1        ecx+4%1
471
                "movq (%%eax, %1, 2), %%mm0                        \n\t" // line 3
472
                "movq (%0, %1, 4), %%mm1                        \n\t" // line 4
473
                "movq %%mm1, %%mm2                                \n\t" // line 4
474
                "psubusb %%mm0, %%mm1                                \n\t"
475
                "psubusb %%mm2, %%mm0                                \n\t"
476
                "por %%mm1, %%mm0                                \n\t" // |l2 - l3|
477
                "movq (%%ecx), %%mm3                                \n\t" // line 5
478
                "movq (%%ecx, %1), %%mm4                        \n\t" // line 6
479
                "movq %%mm3, %%mm5                                \n\t" // line 5
480
                "psubusb %%mm4, %%mm3                                \n\t"
481
                "psubusb %%mm5, %%mm4                                \n\t"
482
                "por %%mm4, %%mm3                                \n\t" // |l5 - l6|
483
                PAVGB(%%mm3, %%mm0)                                      // (|l2 - l3| + |l5 - l6|)/2
484
                "movq %%mm2, %%mm1                                \n\t" // line 4
485
                "psubusb %%mm5, %%mm2                                \n\t"
486
                "movq %%mm2, %%mm4                                \n\t"
487
                "pcmpeqb %%mm7, %%mm2                                \n\t" // (l4 - l5) <= 0 ? -1 : 0
488
                "psubusb %%mm1, %%mm5                                \n\t"
489
                "por %%mm5, %%mm4                                \n\t" // |l4 - l5|
490
                "psubusb %%mm0, %%mm4                \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
491
                "movq %%mm4, %%mm3                                \n\t" // d
492
                "movq %2, %%mm0                        \n\t"
493
                "paddusb %%mm0, %%mm0                                \n\t"
494
                "psubusb %%mm0, %%mm4                                \n\t"
495
                "pcmpeqb %%mm7, %%mm4                                \n\t" // d <= QP ? -1 : 0
496
                "psubusb "MANGLE(b01)", %%mm3                        \n\t"
497
                "pand %%mm4, %%mm3                                \n\t" // d <= QP ? d : 0
498

    
499
                PAVGB(%%mm7, %%mm3)                                      // d/2
500
                "movq %%mm3, %%mm1                                \n\t" // d/2
501
                PAVGB(%%mm7, %%mm3)                                      // d/4
502
                PAVGB(%%mm1, %%mm3)                                      // 3*d/8
503

    
504
                "movq (%0, %1, 4), %%mm0                        \n\t" // line 4
505
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
506
                "psubusb %%mm3, %%mm0                                \n\t"
507
                "pxor %%mm2, %%mm0                                \n\t"
508
                "movq %%mm0, (%0, %1, 4)                        \n\t" // line 4
509

    
510
                "movq (%%ecx), %%mm0                                \n\t" // line 5
511
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
512
                "paddusb %%mm3, %%mm0                                \n\t"
513
                "pxor %%mm2, %%mm0                                \n\t"
514
                "movq %%mm0, (%%ecx)                                \n\t" // line 5
515

    
516
                PAVGB(%%mm7, %%mm1)                                      // d/4
517

    
518
                "movq (%%eax, %1, 2), %%mm0                        \n\t" // line 3
519
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
520
                "psubusb %%mm1, %%mm0                                \n\t"
521
                "pxor %%mm2, %%mm0                                \n\t"
522
                "movq %%mm0, (%%eax, %1, 2)                        \n\t" // line 3
523

    
524
                "movq (%%ecx, %1), %%mm0                        \n\t" // line 6
525
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
526
                "paddusb %%mm1, %%mm0                                \n\t"
527
                "pxor %%mm2, %%mm0                                \n\t"
528
                "movq %%mm0, (%%ecx, %1)                        \n\t" // line 6
529

    
530
                PAVGB(%%mm7, %%mm1)                                      // d/8
531

    
532
                "movq (%%eax, %1), %%mm0                        \n\t" // line 2
533
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
534
                "psubusb %%mm1, %%mm0                                \n\t"
535
                "pxor %%mm2, %%mm0                                \n\t"
536
                "movq %%mm0, (%%eax, %1)                        \n\t" // line 2
537

    
538
                "movq (%%ecx, %1, 2), %%mm0                        \n\t" // line 7
539
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
540
                "paddusb %%mm1, %%mm0                                \n\t"
541
                "pxor %%mm2, %%mm0                                \n\t"
542
                "movq %%mm0, (%%ecx, %1, 2)                        \n\t" // line 7
543

    
544
                :
545
                : "r" (src), "r" (stride), "m" (co->pQPb)
546
                : "%eax", "%ecx"
547
        );
548
#else
549

    
550
         const int l1= stride;
551
        const int l2= stride + l1;
552
        const int l3= stride + l2;
553
        const int l4= stride + l3;
554
        const int l5= stride + l4;
555
        const int l6= stride + l5;
556
        const int l7= stride + l6;
557
//        const int l8= stride + l7;
558
//        const int l9= stride + l8;
559
        int x;
560

    
561
        src+= stride*3;
562
        for(x=0; x<BLOCK_SIZE; x++)
563
        {
564
                int a= src[l3] - src[l4];
565
                int b= src[l4] - src[l5];
566
                int c= src[l5] - src[l6];
567

    
568
                int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
569
                d= MAX(d, 0);
570

    
571
                if(d < co->QP*2)
572
                {
573
                        int v = d * SIGN(-b);
574

    
575
                        src[l2] +=v>>3;
576
                        src[l3] +=v>>2;
577
                        src[l4] +=(3*v)>>3;
578
                        src[l5] -=(3*v)>>3;
579
                        src[l6] -=v>>2;
580
                        src[l7] -=v>>3;
581

    
582
                }
583
                src++;
584
        }
585
#endif
586
}
587

    
588
#ifndef HAVE_ALTIVEC
589
static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
590
{
591
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
592
/*
593
        uint8_t tmp[16];
594
        const int l1= stride;
595
        const int l2= stride + l1;
596
        const int l3= stride + l2;
597
        const int l4= (int)tmp - (int)src - stride*3;
598
        const int l5= (int)tmp - (int)src - stride*3 + 8;
599
        const int l6= stride*3 + l3;
600
        const int l7= stride + l6;
601
        const int l8= stride + l7;
602

603
        memcpy(tmp, src+stride*7, 8);
604
        memcpy(tmp+8, src+stride*8, 8);
605
*/
606
        src+= stride*4;
607
        asm volatile(
608

    
609
#if 0 //slightly more accurate and slightly slower
610
                "pxor %%mm7, %%mm7                                \n\t" // 0
611
                "leal (%0, %1), %%eax                                \n\t"
612
                "leal (%%eax, %1, 4), %%ecx                        \n\t"
613
//        0        1        2        3        4        5        6        7
614
//        %0        %0+%1        %0+2%1        eax+2%1        %0+4%1        eax+4%1        ecx+%1        ecx+2%1
615
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ecx        ecx+%1        ecx+2%1
616

617

618
                "movq (%0, %1, 2), %%mm0                        \n\t" // l2
619
                "movq (%0), %%mm1                                \n\t" // l0
620
                "movq %%mm0, %%mm2                                \n\t" // l2
621
                PAVGB(%%mm7, %%mm0)                                      // ~l2/2
622
                PAVGB(%%mm1, %%mm0)                                      // ~(l2 + 2l0)/4
623
                PAVGB(%%mm2, %%mm0)                                      // ~(5l2 + 2l0)/8
624

625
                "movq (%%eax), %%mm1                                \n\t" // l1
626
                "movq (%%eax, %1, 2), %%mm3                        \n\t" // l3
627
                "movq %%mm1, %%mm4                                \n\t" // l1
628
                PAVGB(%%mm7, %%mm1)                                      // ~l1/2
629
                PAVGB(%%mm3, %%mm1)                                      // ~(l1 + 2l3)/4
630
                PAVGB(%%mm4, %%mm1)                                      // ~(5l1 + 2l3)/8
631

632
                "movq %%mm0, %%mm4                                \n\t" // ~(5l2 + 2l0)/8
633
                "psubusb %%mm1, %%mm0                                \n\t"
634
                "psubusb %%mm4, %%mm1                                \n\t"
635
                "por %%mm0, %%mm1                                \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
636
// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
637

638
                "movq (%0, %1, 4), %%mm0                        \n\t" // l4
639
                "movq %%mm0, %%mm4                                \n\t" // l4
640
                PAVGB(%%mm7, %%mm0)                                      // ~l4/2
641
                PAVGB(%%mm2, %%mm0)                                      // ~(l4 + 2l2)/4
642
                PAVGB(%%mm4, %%mm0)                                      // ~(5l4 + 2l2)/8
643

644
                "movq (%%ecx), %%mm2                                \n\t" // l5
645
                "movq %%mm3, %%mm5                                \n\t" // l3
646
                PAVGB(%%mm7, %%mm3)                                      // ~l3/2
647
                PAVGB(%%mm2, %%mm3)                                      // ~(l3 + 2l5)/4
648
                PAVGB(%%mm5, %%mm3)                                      // ~(5l3 + 2l5)/8
649

650
                "movq %%mm0, %%mm6                                \n\t" // ~(5l4 + 2l2)/8
651
                "psubusb %%mm3, %%mm0                                \n\t"
652
                "psubusb %%mm6, %%mm3                                \n\t"
653
                "por %%mm0, %%mm3                                \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
654
                "pcmpeqb %%mm7, %%mm0                                \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
655
// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
656

657
                "movq (%%ecx, %1), %%mm6                        \n\t" // l6
658
                "movq %%mm6, %%mm5                                \n\t" // l6
659
                PAVGB(%%mm7, %%mm6)                                      // ~l6/2
660
                PAVGB(%%mm4, %%mm6)                                      // ~(l6 + 2l4)/4
661
                PAVGB(%%mm5, %%mm6)                                      // ~(5l6 + 2l4)/8
662

663
                "movq (%%ecx, %1, 2), %%mm5                        \n\t" // l7
664
                "movq %%mm2, %%mm4                                \n\t" // l5
665
                PAVGB(%%mm7, %%mm2)                                      // ~l5/2
666
                PAVGB(%%mm5, %%mm2)                                      // ~(l5 + 2l7)/4
667
                PAVGB(%%mm4, %%mm2)                                      // ~(5l5 + 2l7)/8
668

669
                "movq %%mm6, %%mm4                                \n\t" // ~(5l6 + 2l4)/8
670
                "psubusb %%mm2, %%mm6                                \n\t"
671
                "psubusb %%mm4, %%mm2                                \n\t"
672
                "por %%mm6, %%mm2                                \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
673
// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
674

675

676
                PMINUB(%%mm2, %%mm1, %%mm4)                              // MIN(|lenergy|,|renergy|)/8
677
                "movq %2, %%mm4                                        \n\t" // QP //FIXME QP+1 ?
678
                "paddusb "MANGLE(b01)", %%mm4                        \n\t"
679
                "pcmpgtb %%mm3, %%mm4                                \n\t" // |menergy|/8 < QP
680
                "psubusb %%mm1, %%mm3                                \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
681
                "pand %%mm4, %%mm3                                \n\t"
682

683
                "movq %%mm3, %%mm1                                \n\t"
684
//                "psubusb "MANGLE(b01)", %%mm3                        \n\t"
685
                PAVGB(%%mm7, %%mm3)
686
                PAVGB(%%mm7, %%mm3)
687
                "paddusb %%mm1, %%mm3                                \n\t"
688
//                "paddusb "MANGLE(b01)", %%mm3                        \n\t"
689

690
                "movq (%%eax, %1, 2), %%mm6                        \n\t" //l3
691
                "movq (%0, %1, 4), %%mm5                        \n\t" //l4
692
                "movq (%0, %1, 4), %%mm4                        \n\t" //l4
693
                "psubusb %%mm6, %%mm5                                \n\t"
694
                "psubusb %%mm4, %%mm6                                \n\t"
695
                "por %%mm6, %%mm5                                \n\t" // |l3-l4|
696
                "pcmpeqb %%mm7, %%mm6                                \n\t" // SIGN(l3-l4)
697
                "pxor %%mm6, %%mm0                                \n\t"
698
                "pand %%mm0, %%mm3                                \n\t"
699
                PMINUB(%%mm5, %%mm3, %%mm0)
700

701
                "psubusb "MANGLE(b01)", %%mm3                        \n\t"
702
                PAVGB(%%mm7, %%mm3)
703

704
                "movq (%%eax, %1, 2), %%mm0                        \n\t"
705
                "movq (%0, %1, 4), %%mm2                        \n\t"
706
                "pxor %%mm6, %%mm0                                \n\t"
707
                "pxor %%mm6, %%mm2                                \n\t"
708
                "psubb %%mm3, %%mm0                                \n\t"
709
                "paddb %%mm3, %%mm2                                \n\t"
710
                "pxor %%mm6, %%mm0                                \n\t"
711
                "pxor %%mm6, %%mm2                                \n\t"
712
                "movq %%mm0, (%%eax, %1, 2)                        \n\t"
713
                "movq %%mm2, (%0, %1, 4)                        \n\t"
714
#endif
715

    
716
                "leal (%0, %1), %%eax                                \n\t"
717
                "pcmpeqb %%mm6, %%mm6                                \n\t" // -1
718
//        0        1        2        3        4        5        6        7
719
//        %0        %0+%1        %0+2%1        eax+2%1        %0+4%1        eax+4%1        ecx+%1        ecx+2%1
720
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ecx        ecx+%1        ecx+2%1
721

    
722

    
723
                "movq (%%eax, %1, 2), %%mm1                        \n\t" // l3
724
                "movq (%0, %1, 4), %%mm0                        \n\t" // l4
725
                "pxor %%mm6, %%mm1                                \n\t" // -l3-1
726
                PAVGB(%%mm1, %%mm0)                                      // -q+128 = (l4-l3+256)/2
727
// mm1=-l3-1, mm0=128-q
728

    
729
                "movq (%%eax, %1, 4), %%mm2                        \n\t" // l5
730
                "movq (%%eax, %1), %%mm3                        \n\t" // l2
731
                "pxor %%mm6, %%mm2                                \n\t" // -l5-1
732
                "movq %%mm2, %%mm5                                \n\t" // -l5-1
733
                "movq "MANGLE(b80)", %%mm4                        \n\t" // 128
734
                "leal (%%eax, %1, 4), %%ecx                        \n\t"
735
                PAVGB(%%mm3, %%mm2)                                      // (l2-l5+256)/2
736
                PAVGB(%%mm0, %%mm4)                                      // ~(l4-l3)/4 + 128
737
                PAVGB(%%mm2, %%mm4)                                      // ~(l2-l5)/4 +(l4-l3)/8 + 128
738
                PAVGB(%%mm0, %%mm4)                                      // ~(l2-l5)/8 +5(l4-l3)/16 + 128
739
// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
740

    
741
                "movq (%%eax), %%mm2                                \n\t" // l1
742
                "pxor %%mm6, %%mm2                                \n\t" // -l1-1
743
                PAVGB(%%mm3, %%mm2)                                      // (l2-l1+256)/2
744
                PAVGB((%0), %%mm1)                                      // (l0-l3+256)/2
745
                "movq "MANGLE(b80)", %%mm3                        \n\t" // 128
746
                PAVGB(%%mm2, %%mm3)                                      // ~(l2-l1)/4 + 128
747
                PAVGB(%%mm1, %%mm3)                                      // ~(l0-l3)/4 +(l2-l1)/8 + 128
748
                PAVGB(%%mm2, %%mm3)                                      // ~(l0-l3)/8 +5(l2-l1)/16 + 128
749
// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
750

    
751
                PAVGB((%%ecx, %1), %%mm5)                              // (l6-l5+256)/2
752
                "movq (%%ecx, %1, 2), %%mm1                        \n\t" // l7
753
                "pxor %%mm6, %%mm1                                \n\t" // -l7-1
754
                PAVGB((%0, %1, 4), %%mm1)                              // (l4-l7+256)/2
755
                "movq "MANGLE(b80)", %%mm2                        \n\t" // 128
756
                PAVGB(%%mm5, %%mm2)                                      // ~(l6-l5)/4 + 128
757
                PAVGB(%%mm1, %%mm2)                                      // ~(l4-l7)/4 +(l6-l5)/8 + 128
758
                PAVGB(%%mm5, %%mm2)                                      // ~(l4-l7)/8 +5(l6-l5)/16 + 128
759
// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
760

    
761
                "movq "MANGLE(b00)", %%mm1                        \n\t" // 0
762
                "movq "MANGLE(b00)", %%mm5                        \n\t" // 0
763
                "psubb %%mm2, %%mm1                                \n\t" // 128 - renergy/16
764
                "psubb %%mm3, %%mm5                                \n\t" // 128 - lenergy/16
765
                PMAXUB(%%mm1, %%mm2)                                      // 128 + |renergy/16|
766
                 PMAXUB(%%mm5, %%mm3)                                      // 128 + |lenergy/16|
767
                PMINUB(%%mm2, %%mm3, %%mm1)                              // 128 + MIN(|lenergy|,|renergy|)/16
768

    
769
// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
770

    
771
                "movq "MANGLE(b00)", %%mm7                        \n\t" // 0
772
                "movq %2, %%mm2                                        \n\t" // QP
773
                PAVGB(%%mm6, %%mm2)                                      // 128 + QP/2
774
                "psubb %%mm6, %%mm2                                \n\t"
775

    
776
                "movq %%mm4, %%mm1                                \n\t"
777
                "pcmpgtb %%mm7, %%mm1                                \n\t" // SIGN(menergy)
778
                "pxor %%mm1, %%mm4                                \n\t"
779
                "psubb %%mm1, %%mm4                                \n\t" // 128 + |menergy|/16
780
                "pcmpgtb %%mm4, %%mm2                                \n\t" // |menergy|/16 < QP/2
781
                "psubusb %%mm3, %%mm4                                \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
782
// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
783

    
784
                "movq %%mm4, %%mm3                                \n\t" // d
785
                "psubusb "MANGLE(b01)", %%mm4                        \n\t"
786
                PAVGB(%%mm7, %%mm4)                                      // d/32
787
                PAVGB(%%mm7, %%mm4)                                      // (d + 32)/64
788
                "paddb %%mm3, %%mm4                                \n\t" // 5d/64
789
                "pand %%mm2, %%mm4                                \n\t"
790

    
791
                "movq "MANGLE(b80)", %%mm5                        \n\t" // 128
792
                "psubb %%mm0, %%mm5                                \n\t" // q
793
                "paddsb %%mm6, %%mm5                                \n\t" // fix bad rounding
794
                "pcmpgtb %%mm5, %%mm7                                \n\t" // SIGN(q)
795
                "pxor %%mm7, %%mm5                                \n\t"
796

    
797
                PMINUB(%%mm5, %%mm4, %%mm3)                              // MIN(|q|, 5d/64)
798
                "pxor %%mm1, %%mm7                                \n\t" // SIGN(d*q)
799

    
800
                "pand %%mm7, %%mm4                                \n\t"
801
                "movq (%%eax, %1, 2), %%mm0                        \n\t"
802
                "movq (%0, %1, 4), %%mm2                        \n\t"
803
                "pxor %%mm1, %%mm0                                \n\t"
804
                "pxor %%mm1, %%mm2                                \n\t"
805
                "paddb %%mm4, %%mm0                                \n\t"
806
                "psubb %%mm4, %%mm2                                \n\t"
807
                "pxor %%mm1, %%mm0                                \n\t"
808
                "pxor %%mm1, %%mm2                                \n\t"
809
                "movq %%mm0, (%%eax, %1, 2)                        \n\t"
810
                "movq %%mm2, (%0, %1, 4)                        \n\t"
811

    
812
                :
813
                : "r" (src), "r" (stride), "m" (c->pQPb)
814
                : "%eax", "%ecx"
815
        );
816

    
817
/*
818
        {
819
        int x;
820
        src-= stride;
821
        for(x=0; x<BLOCK_SIZE; x++)
822
        {
823
                const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
824
                if(ABS(middleEnergy)< 8*QP)
825
                {
826
                        const int q=(src[l4] - src[l5])/2;
827
                        const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
828
                        const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
829

830
                        int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
831
                        d= MAX(d, 0);
832

833
                        d= (5*d + 32) >> 6;
834
                        d*= SIGN(-middleEnergy);
835

836
                        if(q>0)
837
                        {
838
                                d= d<0 ? 0 : d;
839
                                d= d>q ? q : d;
840
                        }
841
                        else
842
                        {
843
                                d= d>0 ? 0 : d;
844
                                d= d<q ? q : d;
845
                        }
846

847
                        src[l4]-= d;
848
                        src[l5]+= d;
849
                }
850
                src++;
851
        }
852
src-=8;
853
        for(x=0; x<8; x++)
854
        {
855
                int y;
856
                for(y=4; y<6; y++)
857
                {
858
                        int d= src[x+y*stride] - tmp[x+(y-4)*8];
859
                        int ad= ABS(d);
860
                        static int max=0;
861
                        static int sum=0;
862
                        static int num=0;
863
                        static int bias=0;
864

865
                        if(max<ad) max=ad;
866
                        sum+= ad>3 ? 1 : 0;
867
                        if(ad>3)
868
                        {
869
                                src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
870
                        }
871
                        if(y==4) bias+=d;
872
                        num++;
873
                        if(num%1000000 == 0)
874
                        {
875
                                printf(" %d %d %d %d\n", num, sum, max, bias);
876
                        }
877
                }
878
        }
879
}
880
*/
881
#elif defined (HAVE_MMX)
882
        src+= stride*4;
883
        asm volatile(
884
                "pxor %%mm7, %%mm7                                \n\t"
885
                "leal -40(%%esp), %%ecx                                \n\t" // make space for 4 8-byte vars
886
                "andl $0xFFFFFFF8, %%ecx                        \n\t" // align
887
//        0        1        2        3        4        5        6        7
888
//        %0        %0+%1        %0+2%1        eax+2%1        %0+4%1        eax+4%1        edx+%1        edx+2%1
889
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1
890

    
891
                "movq (%0), %%mm0                                \n\t"
892
                "movq %%mm0, %%mm1                                \n\t"
893
                "punpcklbw %%mm7, %%mm0                                \n\t" // low part of line 0
894
                "punpckhbw %%mm7, %%mm1                                \n\t" // high part of line 0
895

    
896
                "movq (%0, %1), %%mm2                                \n\t"
897
                "leal (%0, %1, 2), %%eax                        \n\t"
898
                "movq %%mm2, %%mm3                                \n\t"
899
                "punpcklbw %%mm7, %%mm2                                \n\t" // low part of line 1
900
                "punpckhbw %%mm7, %%mm3                                \n\t" // high part of line 1
901

    
902
                "movq (%%eax), %%mm4                                \n\t"
903
                "movq %%mm4, %%mm5                                \n\t"
904
                "punpcklbw %%mm7, %%mm4                                \n\t" // low part of line 2
905
                "punpckhbw %%mm7, %%mm5                                \n\t" // high part of line 2
906

    
907
                "paddw %%mm0, %%mm0                                \n\t" // 2L0
908
                "paddw %%mm1, %%mm1                                \n\t" // 2H0
909
                "psubw %%mm4, %%mm2                                \n\t" // L1 - L2
910
                "psubw %%mm5, %%mm3                                \n\t" // H1 - H2
911
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - L1 + L2
912
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - H1 + H2
913

    
914
                "psllw $2, %%mm2                                \n\t" // 4L1 - 4L2
915
                "psllw $2, %%mm3                                \n\t" // 4H1 - 4H2
916
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - 5L1 + 5L2
917
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - 5H1 + 5H2
918

    
919
                "movq (%%eax, %1), %%mm2                        \n\t"
920
                "movq %%mm2, %%mm3                                \n\t"
921
                "punpcklbw %%mm7, %%mm2                                \n\t" // L3
922
                "punpckhbw %%mm7, %%mm3                                \n\t" // H3
923

    
924
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - 5L1 + 5L2 - L3
925
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - 5H1 + 5H2 - H3
926
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
927
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - 5H1 + 5H2 - 2H3
928
                "movq %%mm0, (%%ecx)                                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
929
                "movq %%mm1, 8(%%ecx)                                \n\t" // 2H0 - 5H1 + 5H2 - 2H3
930

    
931
                "movq (%%eax, %1, 2), %%mm0                        \n\t"
932
                "movq %%mm0, %%mm1                                \n\t"
933
                "punpcklbw %%mm7, %%mm0                                \n\t" // L4
934
                "punpckhbw %%mm7, %%mm1                                \n\t" // H4
935

    
936
                "psubw %%mm0, %%mm2                                \n\t" // L3 - L4
937
                "psubw %%mm1, %%mm3                                \n\t" // H3 - H4
938
                "movq %%mm2, 16(%%ecx)                                \n\t" // L3 - L4
939
                "movq %%mm3, 24(%%ecx)                                \n\t" // H3 - H4
940
                "paddw %%mm4, %%mm4                                \n\t" // 2L2
941
                "paddw %%mm5, %%mm5                                \n\t" // 2H2
942
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - L3 + L4
943
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - H3 + H4
944

    
945
                "leal (%%eax, %1), %0                                \n\t"
946
                "psllw $2, %%mm2                                \n\t" // 4L3 - 4L4
947
                "psllw $2, %%mm3                                \n\t" // 4H3 - 4H4
948
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - 5L3 + 5L4
949
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - 5H3 + 5H4
950
//50 opcodes so far
951
                "movq (%0, %1, 2), %%mm2                        \n\t"
952
                "movq %%mm2, %%mm3                                \n\t"
953
                "punpcklbw %%mm7, %%mm2                                \n\t" // L5
954
                "punpckhbw %%mm7, %%mm3                                \n\t" // H5
955
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - 5L3 + 5L4 - L5
956
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - 5H3 + 5H4 - H5
957
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - 5L3 + 5L4 - 2L5
958
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - 5H3 + 5H4 - 2H5
959

    
960
                "movq (%%eax, %1, 4), %%mm6                        \n\t"
961
                "punpcklbw %%mm7, %%mm6                                \n\t" // L6
962
                "psubw %%mm6, %%mm2                                \n\t" // L5 - L6
963
                "movq (%%eax, %1, 4), %%mm6                        \n\t"
964
                "punpckhbw %%mm7, %%mm6                                \n\t" // H6
965
                "psubw %%mm6, %%mm3                                \n\t" // H5 - H6
966

    
967
                "paddw %%mm0, %%mm0                                \n\t" // 2L4
968
                "paddw %%mm1, %%mm1                                \n\t" // 2H4
969
                "psubw %%mm2, %%mm0                                \n\t" // 2L4 - L5 + L6
970
                "psubw %%mm3, %%mm1                                \n\t" // 2H4 - H5 + H6
971

    
972
                "psllw $2, %%mm2                                \n\t" // 4L5 - 4L6
973
                "psllw $2, %%mm3                                \n\t" // 4H5 - 4H6
974
                "psubw %%mm2, %%mm0                                \n\t" // 2L4 - 5L5 + 5L6
975
                "psubw %%mm3, %%mm1                                \n\t" // 2H4 - 5H5 + 5H6
976

    
977
                "movq (%0, %1, 4), %%mm2                        \n\t"
978
                "movq %%mm2, %%mm3                                \n\t"
979
                "punpcklbw %%mm7, %%mm2                                \n\t" // L7
980
                "punpckhbw %%mm7, %%mm3                                \n\t" // H7
981

    
982
                "paddw %%mm2, %%mm2                                \n\t" // 2L7
983
                "paddw %%mm3, %%mm3                                \n\t" // 2H7
984
                "psubw %%mm2, %%mm0                                \n\t" // 2L4 - 5L5 + 5L6 - 2L7
985
                "psubw %%mm3, %%mm1                                \n\t" // 2H4 - 5H5 + 5H6 - 2H7
986

    
987
                "movq (%%ecx), %%mm2                                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
988
                "movq 8(%%ecx), %%mm3                                \n\t" // 2H0 - 5H1 + 5H2 - 2H3
989

    
990
#ifdef HAVE_MMX2
991
                "movq %%mm7, %%mm6                                \n\t" // 0
992
                "psubw %%mm0, %%mm6                                \n\t"
993
                "pmaxsw %%mm6, %%mm0                                \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
994
                "movq %%mm7, %%mm6                                \n\t" // 0
995
                "psubw %%mm1, %%mm6                                \n\t"
996
                "pmaxsw %%mm6, %%mm1                                \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
997
                "movq %%mm7, %%mm6                                \n\t" // 0
998
                "psubw %%mm2, %%mm6                                \n\t"
999
                "pmaxsw %%mm6, %%mm2                                \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1000
                "movq %%mm7, %%mm6                                \n\t" // 0
1001
                "psubw %%mm3, %%mm6                                \n\t"
1002
                "pmaxsw %%mm6, %%mm3                                \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1003
#else
1004
                "movq %%mm7, %%mm6                                \n\t" // 0
1005
                "pcmpgtw %%mm0, %%mm6                                \n\t"
1006
                "pxor %%mm6, %%mm0                                \n\t"
1007
                "psubw %%mm6, %%mm0                                \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1008
                "movq %%mm7, %%mm6                                \n\t" // 0
1009
                "pcmpgtw %%mm1, %%mm6                                \n\t"
1010
                "pxor %%mm6, %%mm1                                \n\t"
1011
                "psubw %%mm6, %%mm1                                \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1012
                "movq %%mm7, %%mm6                                \n\t" // 0
1013
                "pcmpgtw %%mm2, %%mm6                                \n\t"
1014
                "pxor %%mm6, %%mm2                                \n\t"
1015
                "psubw %%mm6, %%mm2                                \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1016
                "movq %%mm7, %%mm6                                \n\t" // 0
1017
                "pcmpgtw %%mm3, %%mm6                                \n\t"
1018
                "pxor %%mm6, %%mm3                                \n\t"
1019
                "psubw %%mm6, %%mm3                                \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1020
#endif
1021

    
1022
#ifdef HAVE_MMX2
1023
                "pminsw %%mm2, %%mm0                                \n\t"
1024
                "pminsw %%mm3, %%mm1                                \n\t"
1025
#else
1026
                "movq %%mm0, %%mm6                                \n\t"
1027
                "psubusw %%mm2, %%mm6                                \n\t"
1028
                "psubw %%mm6, %%mm0                                \n\t"
1029
                "movq %%mm1, %%mm6                                \n\t"
1030
                "psubusw %%mm3, %%mm6                                \n\t"
1031
                "psubw %%mm6, %%mm1                                \n\t"
1032
#endif
1033

    
1034
                "movd %2, %%mm2                                        \n\t" // QP
1035
                "punpcklbw %%mm7, %%mm2                                \n\t"
1036

    
1037
                "movq %%mm7, %%mm6                                \n\t" // 0
1038
                "pcmpgtw %%mm4, %%mm6                                \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1039
                "pxor %%mm6, %%mm4                                \n\t"
1040
                "psubw %%mm6, %%mm4                                \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1041
                "pcmpgtw %%mm5, %%mm7                                \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1042
                "pxor %%mm7, %%mm5                                \n\t"
1043
                "psubw %%mm7, %%mm5                                \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1044
// 100 opcodes
1045
                "psllw $3, %%mm2                                \n\t" // 8QP
1046
                "movq %%mm2, %%mm3                                \n\t" // 8QP
1047
                "pcmpgtw %%mm4, %%mm2                                \n\t"
1048
                "pcmpgtw %%mm5, %%mm3                                \n\t"
1049
                "pand %%mm2, %%mm4                                \n\t"
1050
                "pand %%mm3, %%mm5                                \n\t"
1051

    
1052

    
1053
                "psubusw %%mm0, %%mm4                                \n\t" // hd
1054
                "psubusw %%mm1, %%mm5                                \n\t" // ld
1055

    
1056

    
1057
                "movq "MANGLE(w05)", %%mm2                        \n\t" // 5
1058
                "pmullw %%mm2, %%mm4                                \n\t"
1059
                "pmullw %%mm2, %%mm5                                \n\t"
1060
                "movq "MANGLE(w20)", %%mm2                        \n\t" // 32
1061
                "paddw %%mm2, %%mm4                                \n\t"
1062
                "paddw %%mm2, %%mm5                                \n\t"
1063
                "psrlw $6, %%mm4                                \n\t"
1064
                "psrlw $6, %%mm5                                \n\t"
1065

    
1066
                "movq 16(%%ecx), %%mm0                                \n\t" // L3 - L4
1067
                "movq 24(%%ecx), %%mm1                                \n\t" // H3 - H4
1068

    
1069
                "pxor %%mm2, %%mm2                                \n\t"
1070
                "pxor %%mm3, %%mm3                                \n\t"
1071

    
1072
                "pcmpgtw %%mm0, %%mm2                                \n\t" // sign (L3-L4)
1073
                "pcmpgtw %%mm1, %%mm3                                \n\t" // sign (H3-H4)
1074
                "pxor %%mm2, %%mm0                                \n\t"
1075
                "pxor %%mm3, %%mm1                                \n\t"
1076
                "psubw %%mm2, %%mm0                                \n\t" // |L3-L4|
1077
                "psubw %%mm3, %%mm1                                \n\t" // |H3-H4|
1078
                "psrlw $1, %%mm0                                \n\t" // |L3 - L4|/2
1079
                "psrlw $1, %%mm1                                \n\t" // |H3 - H4|/2
1080

    
1081
                "pxor %%mm6, %%mm2                                \n\t"
1082
                "pxor %%mm7, %%mm3                                \n\t"
1083
                "pand %%mm2, %%mm4                                \n\t"
1084
                "pand %%mm3, %%mm5                                \n\t"
1085

    
1086
#ifdef HAVE_MMX2
1087
                "pminsw %%mm0, %%mm4                                \n\t"
1088
                "pminsw %%mm1, %%mm5                                \n\t"
1089
#else
1090
                "movq %%mm4, %%mm2                                \n\t"
1091
                "psubusw %%mm0, %%mm2                                \n\t"
1092
                "psubw %%mm2, %%mm4                                \n\t"
1093
                "movq %%mm5, %%mm2                                \n\t"
1094
                "psubusw %%mm1, %%mm2                                \n\t"
1095
                "psubw %%mm2, %%mm5                                \n\t"
1096
#endif
1097
                "pxor %%mm6, %%mm4                                \n\t"
1098
                "pxor %%mm7, %%mm5                                \n\t"
1099
                "psubw %%mm6, %%mm4                                \n\t"
1100
                "psubw %%mm7, %%mm5                                \n\t"
1101
                "packsswb %%mm5, %%mm4                                \n\t"
1102
                "movq (%0), %%mm0                                \n\t"
1103
                "paddb   %%mm4, %%mm0                                \n\t"
1104
                "movq %%mm0, (%0)                                \n\t"
1105
                "movq (%0, %1), %%mm0                                \n\t"
1106
                "psubb %%mm4, %%mm0                                \n\t"
1107
                "movq %%mm0, (%0, %1)                                \n\t"
1108

    
1109
                : "+r" (src)
1110
                : "r" (stride), "m" (c->pQPb)
1111
                : "%eax", "%ecx"
1112
        );
1113
#else
1114
        const int l1= stride;
1115
        const int l2= stride + l1;
1116
        const int l3= stride + l2;
1117
        const int l4= stride + l3;
1118
        const int l5= stride + l4;
1119
        const int l6= stride + l5;
1120
        const int l7= stride + l6;
1121
        const int l8= stride + l7;
1122
//        const int l9= stride + l8;
1123
        int x;
1124
        src+= stride*3;
1125
        for(x=0; x<BLOCK_SIZE; x++)
1126
        {
1127
                const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1128
                if(ABS(middleEnergy) < 8*c->QP)
1129
                {
1130
                        const int q=(src[l4] - src[l5])/2;
1131
                        const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1132
                        const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1133

    
1134
                        int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1135
                        d= MAX(d, 0);
1136

    
1137
                        d= (5*d + 32) >> 6;
1138
                        d*= SIGN(-middleEnergy);
1139

    
1140
                        if(q>0)
1141
                        {
1142
                                d= d<0 ? 0 : d;
1143
                                d= d>q ? q : d;
1144
                        }
1145
                        else
1146
                        {
1147
                                d= d>0 ? 0 : d;
1148
                                d= d<q ? q : d;
1149
                        }
1150

    
1151
                        src[l4]-= d;
1152
                        src[l5]+= d;
1153
                }
1154
                src++;
1155
        }
1156
#endif
1157
}
1158
#endif //HAVE_ALTIVEC
1159

    
1160
#ifndef HAVE_ALTIVEC
1161
/**
 * Deringing filter for one block.
 *
 * Both code paths do the following (the C fallback is the readable reference):
 *  - find the minimum and maximum pixel value in rows 1..8 of src; if
 *    max - min is below deringThreshold the block is left untouched
 *  - classify every pixel against avg = (min + max + 1)/2 and select the
 *    pixels whose whole 3x3 neighbourhood lies on the same side of avg
 *  - replace each selected pixel by the 3x3 low-pass
 *    [1 2 1; 2 4 2; 1 2 1]/16, limiting the change to +-(QP/2 + 1)
 *
 * The block is filtered in place, so neighbours that were already filtered
 * feed into the low-pass of later pixels.
 *
 * @param src    top-left pixel of the area; the C path reads rows 0..9 and
 *               columns 0..9 and writes rows 1..8, columns 1..8
 * @param stride distance in bytes between two consecutive rows
 * @param c      context; the C path reads c->QP, the asm path reads c->pQPb
 *               and overwrites c->pQPb2
 */
static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        /*
         * NOTE(review): the scratch area used below lives at -24(%esp), i.e.
         * below the stack pointer, so anything that pushes onto this stack
         * asynchronously (e.g. a signal handler) could overwrite it - confirm
         * this is acceptable on all targets.
         * NOTE(review): operand %3 (c->pQPb2) is declared as an input but is
         * written by the asm; the "memory" clobber below tells the compiler that
         * memory is modified, a proper "=m" output would be cleaner.
         */
        asm volatile(
                /* c->pQPb2 = pQPb/2 + 1 for every byte (mm7 = -1, so psubw adds 1) */
                "pxor %%mm6, %%mm6                                \n\t"
                "pcmpeqb %%mm7, %%mm7                                \n\t"
                "movq %2, %%mm0                                        \n\t"
                "punpcklbw %%mm6, %%mm0                                \n\t"
                "psrlw $1, %%mm0                                \n\t"
                "psubw %%mm7, %%mm0                                \n\t"
                "packuswb %%mm0, %%mm0                                \n\t"
                "movq %%mm0, %3                                        \n\t"

                "leal (%0, %1), %%eax                                \n\t"
                "leal (%%eax, %1, 4), %%edx                        \n\t"

//        0        1        2        3        4        5        6        7        8        9
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1

/* FIND_MIN_MAX: fold one row into the running bytewise min (mm7) and max (mm6) */
#undef FIND_MIN_MAX
#ifdef HAVE_MMX2
#define FIND_MIN_MAX(addr)\
                "movq " #addr ", %%mm0                                \n\t"\
                "pminub %%mm0, %%mm7                                \n\t"\
                "pmaxub %%mm0, %%mm6                                \n\t"
#else
#define FIND_MIN_MAX(addr)\
                "movq " #addr ", %%mm0                                \n\t"\
                "movq %%mm7, %%mm1                                \n\t"\
                "psubusb %%mm0, %%mm6                                \n\t"\
                "paddb %%mm0, %%mm6                                \n\t"\
                "psubusb %%mm0, %%mm1                                \n\t"\
                "psubb %%mm1, %%mm7                                \n\t"
#endif

FIND_MIN_MAX((%%eax))
FIND_MIN_MAX((%%eax, %1))
FIND_MIN_MAX((%%eax, %1, 2))
FIND_MIN_MAX((%0, %1, 4))
FIND_MIN_MAX((%%edx))
FIND_MIN_MAX((%%edx, %1))
FIND_MIN_MAX((%%edx, %1, 2))
FIND_MIN_MAX((%0, %1, 8))

                /* horizontal reduction: lowest byte of mm7 becomes the block minimum */
                "movq %%mm7, %%mm4                                \n\t"
                "psrlq $8, %%mm7                                \n\t"
#ifdef HAVE_MMX2
                "pminub %%mm4, %%mm7                                \n\t" // min of pixels
                "pshufw $0xF9, %%mm7, %%mm4                        \n\t"
                "pminub %%mm4, %%mm7                                \n\t" // min of pixels
                "pshufw $0xFE, %%mm7, %%mm4                        \n\t"
                "pminub %%mm4, %%mm7                                \n\t"
#else
                "movq %%mm7, %%mm1                                \n\t"
                "psubusb %%mm4, %%mm1                                \n\t"
                "psubb %%mm1, %%mm7                                \n\t"
                "movq %%mm7, %%mm4                                \n\t"
                "psrlq $16, %%mm7                                \n\t"
                "movq %%mm7, %%mm1                                \n\t"
                "psubusb %%mm4, %%mm1                                \n\t"
                "psubb %%mm1, %%mm7                                \n\t"
                "movq %%mm7, %%mm4                                \n\t"
                "psrlq $32, %%mm7                                \n\t"
                "movq %%mm7, %%mm1                                \n\t"
                "psubusb %%mm4, %%mm1                                \n\t"
                "psubb %%mm1, %%mm7                                \n\t"
#endif


                /* horizontal reduction: lowest byte of mm6 becomes the block maximum */
                "movq %%mm6, %%mm4                                \n\t"
                "psrlq $8, %%mm6                                \n\t"
#ifdef HAVE_MMX2
                "pmaxub %%mm4, %%mm6                                \n\t" // max of pixels
                "pshufw $0xF9, %%mm6, %%mm4                        \n\t"
                "pmaxub %%mm4, %%mm6                                \n\t"
                "pshufw $0xFE, %%mm6, %%mm4                        \n\t"
                "pmaxub %%mm4, %%mm6                                \n\t"
#else
                "psubusb %%mm4, %%mm6                                \n\t"
                "paddb %%mm4, %%mm6                                \n\t"
                "movq %%mm6, %%mm4                                \n\t"
                "psrlq $16, %%mm6                                \n\t"
                "psubusb %%mm4, %%mm6                                \n\t"
                "paddb %%mm4, %%mm6                                \n\t"
                "movq %%mm6, %%mm4                                \n\t"
                "psrlq $32, %%mm6                                \n\t"
                "psubusb %%mm4, %%mm6                                \n\t"
                "paddb %%mm4, %%mm6                                \n\t"
#endif
                "movq %%mm6, %%mm0                                \n\t" // max
                "psubb %%mm7, %%mm6                                \n\t" // max - min
                "movd %%mm6, %%ecx                                \n\t"
                "cmpb "MANGLE(deringThreshold)", %%cl                \n\t"
                " jb 1f                                                \n\t" // range too small -> skip block
                "leal -24(%%esp), %%ecx                                \n\t" // 8-byte aligned scratch below esp
                "andl $0xFFFFFFF8, %%ecx                        \n\t"
                PAVGB(%%mm0, %%mm7)                                      // a=(max + min)/2
                "punpcklbw %%mm7, %%mm7                                \n\t"
                "punpcklbw %%mm7, %%mm7                                \n\t"
                "punpcklbw %%mm7, %%mm7                                \n\t"
                "movq %%mm7, (%%ecx)                                \n\t" // a in all 8 bytes

                "movq (%0), %%mm0                                \n\t" // L10
                "movq %%mm0, %%mm1                                \n\t" // L10
                "movq %%mm0, %%mm2                                \n\t" // L10
                "psllq $8, %%mm1                                \n\t"
                "psrlq $8, %%mm2                                \n\t"
                "movd -4(%0), %%mm3                                \n\t"
                "movd 8(%0), %%mm4                                \n\t"
                "psrlq $24, %%mm3                                \n\t"
                "psllq $56, %%mm4                                \n\t"
                "por %%mm3, %%mm1                                \n\t" // L00
                "por %%mm4, %%mm2                                \n\t" // L20
                "movq %%mm1, %%mm3                                \n\t" // L00
                PAVGB(%%mm2, %%mm1)                                      // (L20 + L00)/2
                PAVGB(%%mm0, %%mm1)                                      // (L20 + L00 + 2L10)/4
                "psubusb %%mm7, %%mm0                                \n\t"
                "psubusb %%mm7, %%mm2                                \n\t"
                "psubusb %%mm7, %%mm3                                \n\t"
                "pcmpeqb "MANGLE(b00)", %%mm0                        \n\t" // L10 > a ? 0 : -1
                "pcmpeqb "MANGLE(b00)", %%mm2                        \n\t" // L20 > a ? 0 : -1
                "pcmpeqb "MANGLE(b00)", %%mm3                        \n\t" // L00 > a ? 0 : -1
                "paddb %%mm2, %%mm0                                \n\t"
                "paddb %%mm3, %%mm0                                \n\t"

                "movq (%%eax), %%mm2                                \n\t" // L11
                "movq %%mm2, %%mm3                                \n\t" // L11
                "movq %%mm2, %%mm4                                \n\t" // L11
                "psllq $8, %%mm3                                \n\t"
                "psrlq $8, %%mm4                                \n\t"
                "movd -4(%%eax), %%mm5                                \n\t"
                "movd 8(%%eax), %%mm6                                \n\t"
                "psrlq $24, %%mm5                                \n\t"
                "psllq $56, %%mm6                                \n\t"
                "por %%mm5, %%mm3                                \n\t" // L01
                "por %%mm6, %%mm4                                \n\t" // L21
                "movq %%mm3, %%mm5                                \n\t" // L01
                PAVGB(%%mm4, %%mm3)                                      // (L21 + L01)/2
                PAVGB(%%mm2, %%mm3)                                      // (L21 + L01 + 2L11)/4
                "psubusb %%mm7, %%mm2                                \n\t"
                "psubusb %%mm7, %%mm4                                \n\t"
                "psubusb %%mm7, %%mm5                                \n\t"
                "pcmpeqb "MANGLE(b00)", %%mm2                        \n\t" // L11 > a ? 0 : -1
                "pcmpeqb "MANGLE(b00)", %%mm4                        \n\t" // L21 > a ? 0 : -1
                "pcmpeqb "MANGLE(b00)", %%mm5                        \n\t" // L01 > a ? 0 : -1
                "paddb %%mm4, %%mm2                                \n\t"
                "paddb %%mm5, %%mm2                                \n\t"
// 0, 2, 3, 1
/*
 * DERING_CORE filters the row at dst using the row below it (src).
 * ppsx/psx/sx hold the per-pixel "not above a" counts of the two previous rows
 * and the new row, pplx/plx/lx the horizontally low-passed rows; t0/t1 are
 * temporaries. (%ecx) holds a, 8(%ecx) is used to spill the new low-passed row.
 */
#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
                "movq " #src ", " #sx "                                \n\t" /* src[0] */\
                "movq " #sx ", " #lx "                                \n\t" /* src[0] */\
                "movq " #sx ", " #t0 "                                \n\t" /* src[0] */\
                "psllq $8, " #lx "                                \n\t"\
                "psrlq $8, " #t0 "                                \n\t"\
                "movd -4" #src ", " #t1 "                        \n\t"\
                "psrlq $24, " #t1 "                                \n\t"\
                "por " #t1 ", " #lx "                                \n\t" /* src[-1] */\
                "movd 8" #src ", " #t1 "                        \n\t"\
                "psllq $56, " #t1 "                                \n\t"\
                "por " #t1 ", " #t0 "                                \n\t" /* src[+1] */\
                "movq " #lx ", " #t1 "                                \n\t" /* src[-1] */\
                PAVGB(t0, lx)                                              /* (src[-1] + src[+1])/2 */\
                PAVGB(sx, lx)                                      /* (src[-1] + 2src[0] + src[+1])/4 */\
                PAVGB(lx, pplx)                                             \
                "movq " #lx ", 8(%%ecx)                                \n\t"\
                "movq (%%ecx), " #lx "                                \n\t"\
                "psubusb " #lx ", " #t1 "                        \n\t"\
                "psubusb " #lx ", " #t0 "                        \n\t"\
                "psubusb " #lx ", " #sx "                        \n\t"\
                "movq "MANGLE(b00)", " #lx "                        \n\t"\
                "pcmpeqb " #lx ", " #t1 "                        \n\t" /* src[-1] > a ? 0 : -1*/\
                "pcmpeqb " #lx ", " #t0 "                        \n\t" /* src[+1] > a ? 0 : -1*/\
                "pcmpeqb " #lx ", " #sx "                        \n\t" /* src[0]  > a ? 0 : -1*/\
                "paddb " #t1 ", " #t0 "                                \n\t"\
                "paddb " #t0 ", " #sx "                                \n\t"\
\
                PAVGB(plx, pplx)                                      /* filtered */\
                "movq " #dst ", " #t0 "                                \n\t" /* dst */\
                "movq " #t0 ", " #t1 "                                \n\t" /* dst */\
                "psubusb %3, " #t0 "                                \n\t"\
                "paddusb %3, " #t1 "                                \n\t"\
                PMAXUB(t0, pplx)\
                PMINUB(t1, pplx, t0)\
                "paddb " #sx ", " #ppsx "                        \n\t"\
                "paddb " #psx ", " #ppsx "                        \n\t"\
                "#paddb "MANGLE(b02)", " #ppsx "                \n\t"\
                "pand "MANGLE(b08)", " #ppsx "                        \n\t"\
                "pcmpeqb " #lx ", " #ppsx "                        \n\t"\
                "pand " #ppsx ", " #pplx "                        \n\t"\
                "pandn " #dst ", " #ppsx "                        \n\t"\
                "por " #pplx ", " #ppsx "                        \n\t"\
                "movq " #ppsx ", " #dst "                        \n\t"\
                "movq 8(%%ecx), " #lx "                                \n\t"

/*
0000000
1111111

1111110
1111101
1111100
1111011
1111010
1111001

1111000
1110111

*/
//DERING_CORE(dst,src                  ,ppsx ,psx  ,sx   ,pplx ,plx  ,lx   ,t0   ,t1)
DERING_CORE((%%eax),(%%eax, %1)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
DERING_CORE((%0, %1, 4),(%%edx)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%%edx),(%%edx, %1)        ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
DERING_CORE((%%edx, %1), (%%edx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
DERING_CORE((%%edx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%0, %1, 8),(%%edx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)

                "1:                        \n\t"
                : : "r" (src), "r" (stride), "m" (c->pQPb), "m"(c->pQPb2)
                /* "memory": the asm stores to the pixel rows, c->pQPb2 and the stack scratch */
                : "%eax", "%edx", "%ecx", "memory"
        );
#else
        int y;
        int min=255;
        int max=0;
        int avg;
        uint8_t *p;
        int s[10];
        const int QP2= c->QP/2 + 1; // largest change allowed per pixel

        /* value range of the pixels in rows 1..8, columns 1..8 */
        for(y=1; y<9; y++)
        {
                int x;
                p= src + stride*y;
                for(x=1; x<9; x++)
                {
                        p++;
                        if(*p > max) max= *p;
                        if(*p < min) min= *p;
                }
        }
        avg= (min + max + 1)>>1;

        if(max - min <deringThreshold) return;

        /*
         * Per row: bit x of t is set if pixel x is above avg, bit x+16 if it is not.
         * After the AND with both shifted copies a bit survives only where the
         * pixel and its left and right neighbour are on the same side of avg.
         */
        for(y=0; y<10; y++)
        {
                int t = 0;

                if(src[stride*y + 0] > avg) t+= 1;
                if(src[stride*y + 1] > avg) t+= 2;
                if(src[stride*y + 2] > avg) t+= 4;
                if(src[stride*y + 3] > avg) t+= 8;
                if(src[stride*y + 4] > avg) t+= 16;
                if(src[stride*y + 5] > avg) t+= 32;
                if(src[stride*y + 6] > avg) t+= 64;
                if(src[stride*y + 7] > avg) t+= 128;
                if(src[stride*y + 8] > avg) t+= 256;
                if(src[stride*y + 9] > avg) t+= 512;

                t |= (~t)<<16;
                t &= (t<<1) & (t>>1);
                s[y] = t;
        }

        /*
         * Combine three consecutive rows so that a bit survives only if the whole
         * 3x3 neighbourhood is on one side of avg, then merge the "above" and
         * "not above" halves. The mask for row y is stored in s[y-1].
         */
        for(y=1; y<9; y++)
        {
                int t = s[y-1] & s[y] & s[y+1];
                t|= t>>16;
                s[y-1]= t;
        }

        for(y=1; y<9; y++)
        {
                int x;
                int t = s[y-1];

                p= src + stride*y;
                for(x=1; x<9; x++)
                {
                        p++;
                        if(t & (1<<x))
                        {
                                /* 3x3 low-pass [1 2 1; 2 4 2; 1 2 1]/16 with rounding */
                                int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
                                      +2*(*(p     -1)) + 4*(*p         ) + 2*(*(p     +1))
                                      +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
                                f= (f + 8)>>4;

#ifdef DEBUG_DERING_THRESHOLD
                                asm volatile("emms\n\t":);
                                {
                                static long long numPixels=0;
                                if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
//                                if((max-min)<20 || (max-min)*QP<200)
//                                if((max-min)*QP < 500)
//                                if(max-min<QP/2)
                                if(max-min < 20)
                                {
                                        static int numSkiped=0;
                                        static int errorSum=0;
                                        static int worstQP=0;
                                        static int worstRange=0;
                                        static int worstDiff=0;
                                        int diff= (f - *p);
                                        int absDiff= ABS(diff);
                                        int error= diff*diff;

                                        if(x==1 || x==8 || y==1 || y==8) continue;

                                        numSkiped++;
                                        if(absDiff > worstDiff)
                                        {
                                                worstDiff= absDiff;
                                                worstQP= c->QP; // there is no local QP in this function
                                                worstRange= max-min;
                                        }
                                        errorSum+= error;

                                        if(1024LL*1024LL*1024LL % numSkiped == 0)
                                        {
                                                printf( "sum:%1.3f, skip:%d, wQP:%d, "
                                                        "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
                                                        (float)errorSum/numSkiped, numSkiped, worstQP, worstRange,
                                                        worstDiff, (float)numSkiped/numPixels);
                                        }
                                }
                                }
#endif
                                /* move *p towards f, but by at most QP2 */
                                if     (*p + QP2 < f) *p= *p + QP2;
                                else if(*p - QP2 > f) *p= *p - QP2;
                                else *p=f;
                        }
                }
        }
#ifdef DEBUG_DERING_THRESHOLD
        /* debug aid: brighten blocks whose value range is below 20 */
        if(max-min < 20)
        {
                for(y=1; y<9; y++)
                {
                        int x;
                        p= src + stride*y;
                        for(x=1; x<9; x++)
                        {
                                p++;
                                *p = MIN(*p + 20, 255);
                        }
                }
//                src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
        }
#endif
#endif
}
1516
#endif //HAVE_ALTIVEC
1517

    
1518
/**
1519
 * Deinterlaces the given block by linearly interpolating every second line.
1520
 * will be called for every 8x8 block and can read & write from line 4-15
1521
 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1522
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1523
 */
1524
static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
1525
{
1526
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1527
        src+= 4*stride;
1528
        asm volatile(
1529
                "leal (%0, %1), %%eax                                \n\t"
1530
                "leal (%%eax, %1, 4), %%ecx                        \n\t"
1531
//        0        1        2        3        4        5        6        7        8        9
1532
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ecx        ecx+%1        ecx+2%1        %0+8%1        ecx+4%1
1533

    
1534
                "movq (%0), %%mm0                                \n\t"
1535
                "movq (%%eax, %1), %%mm1                        \n\t"
1536
                PAVGB(%%mm1, %%mm0)
1537
                "movq %%mm0, (%%eax)                                \n\t"
1538
                "movq (%0, %1, 4), %%mm0                        \n\t"
1539
                PAVGB(%%mm0, %%mm1)
1540
                "movq %%mm1, (%%eax, %1, 2)                        \n\t"
1541
                "movq (%%ecx, %1), %%mm1                        \n\t"
1542
                PAVGB(%%mm1, %%mm0)
1543
                "movq %%mm0, (%%ecx)                                \n\t"
1544
                "movq (%0, %1, 8), %%mm0                        \n\t"
1545
                PAVGB(%%mm0, %%mm1)
1546
                "movq %%mm1, (%%ecx, %1, 2)                        \n\t"
1547

    
1548
                : : "r" (src), "r" (stride)
1549
                : "%eax", "%ecx"
1550
        );
1551
#else
1552
        int a, b, x;
1553
        src+= 4*stride;
1554

    
1555
        for(x=0; x<2; x++){
1556
                a= *(uint32_t*)&src[stride*0];
1557
                b= *(uint32_t*)&src[stride*2];
1558
                *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1559
                a= *(uint32_t*)&src[stride*4];
1560
                *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1561
                b= *(uint32_t*)&src[stride*6];
1562
                *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1563
                a= *(uint32_t*)&src[stride*8];
1564
                *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1565
                src += 4;
1566
        }
1567
#endif
1568
}
1569

    
1570
/**
1571
 * Deinterlaces the given block by cubic interpolating every second line.
1572
 * will be called for every 8x8 block and can read & write from line 4-15
1573
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
1574
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1575
 * this filter will read lines 3-15 and write 7-13
1576
 */
1577
static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
1578
{
1579
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1580
        src+= stride*3;
1581
        asm volatile(
1582
                "leal (%0, %1), %%eax                                \n\t"
1583
                "leal (%%eax, %1, 4), %%edx                        \n\t"
1584
                "leal (%%edx, %1, 4), %%ecx                        \n\t"
1585
                "addl %1, %%ecx                                        \n\t"
1586
                "pxor %%mm7, %%mm7                                \n\t"
1587
//        0        1        2        3        4        5        6        7        8        9        10
1588
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1 ecx
1589

    
1590
#define DEINT_CUBIC(a,b,c,d,e)\
1591
                "movq " #a ", %%mm0                                \n\t"\
1592
                "movq " #b ", %%mm1                                \n\t"\
1593
                "movq " #d ", %%mm2                                \n\t"\
1594
                "movq " #e ", %%mm3                                \n\t"\
1595
                PAVGB(%%mm2, %%mm1)                                        /* (b+d) /2 */\
1596
                PAVGB(%%mm3, %%mm0)                                        /* a(a+e) /2 */\
1597
                "movq %%mm0, %%mm2                                \n\t"\
1598
                "punpcklbw %%mm7, %%mm0                                \n\t"\
1599
                "punpckhbw %%mm7, %%mm2                                \n\t"\
1600
                "movq %%mm1, %%mm3                                \n\t"\
1601
                "punpcklbw %%mm7, %%mm1                                \n\t"\
1602
                "punpckhbw %%mm7, %%mm3                                \n\t"\
1603
                "psubw %%mm1, %%mm0                                \n\t"        /* L(a+e - (b+d))/2 */\
1604
                "psubw %%mm3, %%mm2                                \n\t"        /* H(a+e - (b+d))/2 */\
1605
                "psraw $3, %%mm0                                \n\t"        /* L(a+e - (b+d))/16 */\
1606
                "psraw $3, %%mm2                                \n\t"        /* H(a+e - (b+d))/16 */\
1607
                "psubw %%mm0, %%mm1                                \n\t"        /* L(9b + 9d - a - e)/16 */\
1608
                "psubw %%mm2, %%mm3                                \n\t"        /* H(9b + 9d - a - e)/16 */\
1609
                "packuswb %%mm3, %%mm1                                \n\t"\
1610
                "movq %%mm1, " #c "                                \n\t"
1611

    
1612
DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx, %1))
1613
DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%edx), (%%edx, %1), (%0, %1, 8))
1614
DEINT_CUBIC((%0, %1, 4), (%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%ecx))
1615
DEINT_CUBIC((%%edx, %1), (%0, %1, 8), (%%edx, %1, 4), (%%ecx), (%%ecx, %1, 2))
1616

    
1617
                : : "r" (src), "r" (stride)
1618
                : "%eax", "%edx", "ecx"
1619
        );
1620
#else
1621
        int x;
1622
        src+= stride*3;
1623
        for(x=0; x<8; x++)
1624
        {
1625
                src[stride*3] = CLIP((-src[0]        + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4);
1626
                src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4);
1627
                src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4);
1628
                src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
1629
                src++;
1630
        }
1631
#endif
1632
}
1633

    
1634
/**
1635
 * Deinterlaces the given block by filtering every second line with a (-1 4 2 4 -1) filter.
1636
 * will be called for every 8x8 block and can read & write from line 4-15
1637
 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1638
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1639
 * this filter will read lines 4-13 and write 5-11
1640
 */
1641
static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
1642
{
1643
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1644
        src+= stride*4;
1645
        asm volatile(
1646
                "leal (%0, %1), %%eax                                \n\t"
1647
                "leal (%%eax, %1, 4), %%edx                        \n\t"
1648
                "pxor %%mm7, %%mm7                                \n\t"
1649
                "movq (%2), %%mm0                                \n\t"
1650
//        0        1        2        3        4        5        6        7        8        9        10
1651
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1 ecx
1652

    
1653
#define DEINT_FF(a,b,c,d)\
1654
                "movq " #a ", %%mm1                                \n\t"\
1655
                "movq " #b ", %%mm2                                \n\t"\
1656
                "movq " #c ", %%mm3                                \n\t"\
1657
                "movq " #d ", %%mm4                                \n\t"\
1658
                PAVGB(%%mm3, %%mm1)                                        \
1659
                PAVGB(%%mm4, %%mm0)                                        \
1660
                "movq %%mm0, %%mm3                                \n\t"\
1661
                "punpcklbw %%mm7, %%mm0                                \n\t"\
1662
                "punpckhbw %%mm7, %%mm3                                \n\t"\
1663
                "movq %%mm1, %%mm4                                \n\t"\
1664
                "punpcklbw %%mm7, %%mm1                                \n\t"\
1665
                "punpckhbw %%mm7, %%mm4                                \n\t"\
1666
                "psllw $2, %%mm1                                \n\t"\
1667
                "psllw $2, %%mm4                                \n\t"\
1668
                "psubw %%mm0, %%mm1                                \n\t"\
1669
                "psubw %%mm3, %%mm4                                \n\t"\
1670
                "movq %%mm2, %%mm5                                \n\t"\
1671
                "movq %%mm2, %%mm0                                \n\t"\
1672
                "punpcklbw %%mm7, %%mm2                                \n\t"\
1673
                "punpckhbw %%mm7, %%mm5                                \n\t"\
1674
                "paddw %%mm2, %%mm1                                \n\t"\
1675
                "paddw %%mm5, %%mm4                                \n\t"\
1676
                "psraw $2, %%mm1                                \n\t"\
1677
                "psraw $2, %%mm4                                \n\t"\
1678
                "packuswb %%mm4, %%mm1                                \n\t"\
1679
                "movq %%mm1, " #b "                                \n\t"\
1680

    
1681
DEINT_FF((%0)       , (%%eax)       , (%%eax, %1), (%%eax, %1, 2))
1682
DEINT_FF((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx)       )
1683
DEINT_FF((%0, %1, 4), (%%edx)       , (%%edx, %1), (%%edx, %1, 2))
1684
DEINT_FF((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%edx, %1, 4))
1685

    
1686
                "movq %%mm0, (%2)                                \n\t"
1687
                : : "r" (src), "r" (stride), "r"(tmp)
1688
                : "%eax", "%edx"
1689
        );
1690
#else
1691
        int x;
1692
        src+= stride*4;
1693
        for(x=0; x<8; x++)
1694
        {
1695
                int t1= tmp[x];
1696
                int t2= src[stride*1];
1697

    
1698
                src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3);
1699
                t1= src[stride*4];
1700
                src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3);
1701
                t2= src[stride*6];
1702
                src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3);
1703
                t1= src[stride*8];
1704
                src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3);
1705
                tmp[x]= t1;
1706

    
1707
                src++;
1708
        }
1709
#endif
1710
}
1711

    
1712
/**
1713
 * Deinterlaces the given block by filtering every line with a (-1 2 6 2 -1) filter.
1714
 * will be called for every 8x8 block and can read & write from line 4-15
1715
 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1716
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1717
 * this filter will read lines 4-13 and write 4-11
1718
 */
1719
static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
1720
{
1721
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1722
        src+= stride*4;
1723
        asm volatile(
1724
                "leal (%0, %1), %%eax                                \n\t"
1725
                "leal (%%eax, %1, 4), %%edx                        \n\t"
1726
                "pxor %%mm7, %%mm7                                \n\t"
1727
                "movq (%2), %%mm0                                \n\t"
1728
                "movq (%3), %%mm1                                \n\t"
1729
//        0        1        2        3        4        5        6        7        8        9        10
1730
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1 ecx
1731

    
1732
#define DEINT_L5(t1,t2,a,b,c)\
1733
                "movq " #a ", %%mm2                                \n\t"\
1734
                "movq " #b ", %%mm3                                \n\t"\
1735
                "movq " #c ", %%mm4                                \n\t"\
1736
                PAVGB(t2, %%mm3)                                        \
1737
                PAVGB(t1, %%mm4)                                        \
1738
                "movq %%mm2, %%mm5                                \n\t"\
1739
                "movq %%mm2, " #t1 "                                \n\t"\
1740
                "punpcklbw %%mm7, %%mm2                                \n\t"\
1741
                "punpckhbw %%mm7, %%mm5                                \n\t"\
1742
                "movq %%mm2, %%mm6                                \n\t"\
1743
                "paddw %%mm2, %%mm2                                \n\t"\
1744
                "paddw %%mm6, %%mm2                                \n\t"\
1745
                "movq %%mm5, %%mm6                                \n\t"\
1746
                "paddw %%mm5, %%mm5                                \n\t"\
1747
                "paddw %%mm6, %%mm5                                \n\t"\
1748
                "movq %%mm3, %%mm6                                \n\t"\
1749
                "punpcklbw %%mm7, %%mm3                                \n\t"\
1750
                "punpckhbw %%mm7, %%mm6                                \n\t"\
1751
                "paddw %%mm3, %%mm3                                \n\t"\
1752
                "paddw %%mm6, %%mm6                                \n\t"\
1753
                "paddw %%mm3, %%mm2                                \n\t"\
1754
                "paddw %%mm6, %%mm5                                \n\t"\
1755
                "movq %%mm4, %%mm6                                \n\t"\
1756
                "punpcklbw %%mm7, %%mm4                                \n\t"\
1757
                "punpckhbw %%mm7, %%mm6                                \n\t"\
1758
                "psubw %%mm4, %%mm2                                \n\t"\
1759
                "psubw %%mm6, %%mm5                                \n\t"\
1760
                "psraw $2, %%mm2                                \n\t"\
1761
                "psraw $2, %%mm5                                \n\t"\
1762
                "packuswb %%mm5, %%mm2                                \n\t"\
1763
                "movq %%mm2, " #a "                                \n\t"\
1764

    
1765
DEINT_L5(%%mm0, %%mm1, (%0)          , (%%eax)       , (%%eax, %1)   )
1766
DEINT_L5(%%mm1, %%mm0, (%%eax)       , (%%eax, %1)   , (%%eax, %1, 2))
1767
DEINT_L5(%%mm0, %%mm1, (%%eax, %1)   , (%%eax, %1, 2), (%0, %1, 4)   )
1768
DEINT_L5(%%mm1, %%mm0, (%%eax, %1, 2), (%0, %1, 4)   , (%%edx)       )
1769
DEINT_L5(%%mm0, %%mm1, (%0, %1, 4)   , (%%edx)       , (%%edx, %1)   )  
1770
DEINT_L5(%%mm1, %%mm0, (%%edx)       , (%%edx, %1)   , (%%edx, %1, 2))
1771
DEINT_L5(%%mm0, %%mm1, (%%edx, %1)   , (%%edx, %1, 2), (%0, %1, 8)   )
1772
DEINT_L5(%%mm1, %%mm0, (%%edx, %1, 2), (%0, %1, 8)   , (%%edx, %1, 4))
1773

    
1774
                "movq %%mm0, (%2)                                \n\t"
1775
                "movq %%mm1, (%3)                                \n\t"
1776
                : : "r" (src), "r" (stride), "r"(tmp), "r"(tmp2)
1777
                : "%eax", "%edx"
1778
        );
1779
#else
1780
        int x;
1781
        src+= stride*4;
1782
        for(x=0; x<8; x++)
1783
        {
1784
                int t1= tmp[x];
1785
                int t2= tmp2[x];
1786
                int t3= src[0];
1787

    
1788
                src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3);
1789
                t1= src[stride*1];
1790
                src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3);
1791
                t2= src[stride*2];
1792
                src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3);
1793
                t3= src[stride*3];
1794
                src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3);
1795
                t1= src[stride*4];
1796
                src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3);
1797
                t2= src[stride*5];
1798
                src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3);
1799
                t3= src[stride*6];
1800
                src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3);
1801
                t1= src[stride*7];
1802
                src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3);
1803

    
1804
                tmp[x]= t3;
1805
                tmp2[x]= t1;
1806

    
1807
                src++;
1808
        }
1809
#endif
1810
}
1811

    
1812
/**
1813
 * Deinterlaces the given block by filtering all lines with a (1 2 1) filter.
1814
 * will be called for every 8x8 block and can read & write from line 4-15
1815
 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1816
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1817
 * this filter will read lines 4-13 and write 4-11
1818
 */
1819
static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
1820
{
1821
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1822
        src+= 4*stride;
1823
        asm volatile(
1824
                "leal (%0, %1), %%eax                                \n\t"
1825
                "leal (%%eax, %1, 4), %%edx                        \n\t"
1826
//        0        1        2        3        4        5        6        7        8        9
1827
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1
1828

    
1829
                "movq (%2), %%mm0                                \n\t" // L0
1830
                "movq (%%eax), %%mm1                                \n\t" // L2
1831
                PAVGB(%%mm1, %%mm0)                                      // L0+L2
1832
                "movq (%0), %%mm2                                \n\t" // L1
1833
                PAVGB(%%mm2, %%mm0)
1834
                "movq %%mm0, (%0)                                \n\t"
1835
                "movq (%%eax, %1), %%mm0                        \n\t" // L3
1836
                PAVGB(%%mm0, %%mm2)                                      // L1+L3
1837
                PAVGB(%%mm1, %%mm2)                                      // 2L2 + L1 + L3
1838
                "movq %%mm2, (%%eax)                                \n\t"
1839
                "movq (%%eax, %1, 2), %%mm2                        \n\t" // L4
1840
                PAVGB(%%mm2, %%mm1)                                      // L2+L4
1841
                PAVGB(%%mm0, %%mm1)                                      // 2L3 + L2 + L4
1842
                "movq %%mm1, (%%eax, %1)                        \n\t"
1843
                "movq (%0, %1, 4), %%mm1                        \n\t" // L5
1844
                PAVGB(%%mm1, %%mm0)                                      // L3+L5
1845
                PAVGB(%%mm2, %%mm0)                                      // 2L4 + L3 + L5
1846
                "movq %%mm0, (%%eax, %1, 2)                        \n\t"
1847
                "movq (%%edx), %%mm0                                \n\t" // L6
1848
                PAVGB(%%mm0, %%mm2)                                      // L4+L6
1849
                PAVGB(%%mm1, %%mm2)                                      // 2L5 + L4 + L6
1850
                "movq %%mm2, (%0, %1, 4)                        \n\t"
1851
                "movq (%%edx, %1), %%mm2                        \n\t" // L7
1852
                PAVGB(%%mm2, %%mm1)                                      // L5+L7
1853
                PAVGB(%%mm0, %%mm1)                                      // 2L6 + L5 + L7
1854
                "movq %%mm1, (%%edx)                                \n\t"
1855
                "movq (%%edx, %1, 2), %%mm1                        \n\t" // L8
1856
                PAVGB(%%mm1, %%mm0)                                      // L6+L8
1857
                PAVGB(%%mm2, %%mm0)                                      // 2L7 + L6 + L8
1858
                "movq %%mm0, (%%edx, %1)                        \n\t"
1859
                "movq (%0, %1, 8), %%mm0                        \n\t" // L9
1860
                PAVGB(%%mm0, %%mm2)                                      // L7+L9
1861
                PAVGB(%%mm1, %%mm2)                                      // 2L8 + L7 + L9
1862
                "movq %%mm2, (%%edx, %1, 2)                        \n\t"
1863
                "movq %%mm1, (%2)                                \n\t"
1864

    
1865
                : : "r" (src), "r" (stride), "r" (tmp)
1866
                : "%eax", "%edx"
1867
        );
1868
#else
1869
        int a, b, c, x;
1870
        src+= 4*stride;
1871

    
1872
        for(x=0; x<2; x++){
1873
                a= *(uint32_t*)&tmp[stride*0];
1874
                b= *(uint32_t*)&src[stride*0];
1875
                c= *(uint32_t*)&src[stride*1];
1876
                a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1877
                *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1878

    
1879
                a= *(uint32_t*)&src[stride*2];
1880
                b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1881
                *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1882

    
1883
                b= *(uint32_t*)&src[stride*3];
1884
                c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1885
                *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1886

    
1887
                c= *(uint32_t*)&src[stride*4];
1888
                a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1889
                *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1890

    
1891
                a= *(uint32_t*)&src[stride*5];
1892
                b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1893
                *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1894

    
1895
                b= *(uint32_t*)&src[stride*6];
1896
                c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1897
                *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1898

    
1899
                c= *(uint32_t*)&src[stride*7];
1900
                a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1901
                *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1902

    
1903
                a= *(uint32_t*)&src[stride*8];
1904
                b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1905
                *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1906

    
1907
                *(uint32_t*)&tmp[stride*0]= c;
1908
                src += 4;
1909
                tmp += 4;
1910
        }
1911
#endif
1912
}
1913

    
1914
/**
1915
 * Deinterlaces the given block by applying a median filter to every second line.
1916
 * will be called for every 8x8 block and can read & write from line 4-15,
1917
 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1918
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1919
 */
1920
static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
1921
{
1922
#ifdef HAVE_MMX
1923
        src+= 4*stride;
1924
#ifdef HAVE_MMX2
1925
        asm volatile(
1926
                "leal (%0, %1), %%eax                                \n\t"
1927
                "leal (%%eax, %1, 4), %%edx                        \n\t"
1928
//        0        1        2        3        4        5        6        7        8        9
1929
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1
1930

    
1931
                "movq (%0), %%mm0                                \n\t" //
1932
                "movq (%%eax, %1), %%mm2                        \n\t" //
1933
                "movq (%%eax), %%mm1                                \n\t" //
1934
                "movq %%mm0, %%mm3                                \n\t"
1935
                "pmaxub %%mm1, %%mm0                                \n\t" //
1936
                "pminub %%mm3, %%mm1                                \n\t" //
1937
                "pmaxub %%mm2, %%mm1                                \n\t" //
1938
                "pminub %%mm1, %%mm0                                \n\t"
1939
                "movq %%mm0, (%%eax)                                \n\t"
1940

    
1941
                "movq (%0, %1, 4), %%mm0                        \n\t" //
1942
                "movq (%%eax, %1, 2), %%mm1                        \n\t" //
1943
                "movq %%mm2, %%mm3                                \n\t"
1944
                "pmaxub %%mm1, %%mm2                                \n\t" //
1945
                "pminub %%mm3, %%mm1                                \n\t" //
1946
                "pmaxub %%mm0, %%mm1                                \n\t" //
1947
                "pminub %%mm1, %%mm2                                \n\t"
1948
                "movq %%mm2, (%%eax, %1, 2)                        \n\t"
1949

    
1950
                "movq (%%edx), %%mm2                                \n\t" //
1951
                "movq (%%edx, %1), %%mm1                        \n\t" //
1952
                "movq %%mm2, %%mm3                                \n\t"
1953
                "pmaxub %%mm0, %%mm2                                \n\t" //
1954
                "pminub %%mm3, %%mm0                                \n\t" //
1955
                "pmaxub %%mm1, %%mm0                                \n\t" //
1956
                "pminub %%mm0, %%mm2                                \n\t"
1957
                "movq %%mm2, (%%edx)                                \n\t"
1958

    
1959
                "movq (%%edx, %1, 2), %%mm2                        \n\t" //
1960
                "movq (%0, %1, 8), %%mm0                        \n\t" //
1961
                "movq %%mm2, %%mm3                                \n\t"
1962
                "pmaxub %%mm0, %%mm2                                \n\t" //
1963
                "pminub %%mm3, %%mm0                                \n\t" //
1964
                "pmaxub %%mm1, %%mm0                                \n\t" //
1965
                "pminub %%mm0, %%mm2                                \n\t"
1966
                "movq %%mm2, (%%edx, %1, 2)                        \n\t"
1967

    
1968

    
1969
                : : "r" (src), "r" (stride)
1970
                : "%eax", "%edx"
1971
        );
1972

    
1973
#else // MMX without MMX2
1974
        asm volatile(
1975
                "leal (%0, %1), %%eax                                \n\t"
1976
                "leal (%%eax, %1, 4), %%edx                        \n\t"
1977
//        0        1        2        3        4        5        6        7        8        9
1978
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1
1979
                "pxor %%mm7, %%mm7                                \n\t"
1980

    
1981
#define MEDIAN(a,b,c)\
1982
                "movq " #a ", %%mm0                                \n\t"\
1983
                "movq " #b ", %%mm2                                \n\t"\
1984
                "movq " #c ", %%mm1                                \n\t"\
1985
                "movq %%mm0, %%mm3                                \n\t"\
1986
                "movq %%mm1, %%mm4                                \n\t"\
1987
                "movq %%mm2, %%mm5                                \n\t"\
1988
                "psubusb %%mm1, %%mm3                                \n\t"\
1989
                "psubusb %%mm2, %%mm4                                \n\t"\
1990
                "psubusb %%mm0, %%mm5                                \n\t"\
1991
                "pcmpeqb %%mm7, %%mm3                                \n\t"\
1992
                "pcmpeqb %%mm7, %%mm4                                \n\t"\
1993
                "pcmpeqb %%mm7, %%mm5                                \n\t"\
1994
                "movq %%mm3, %%mm6                                \n\t"\
1995
                "pxor %%mm4, %%mm3                                \n\t"\
1996
                "pxor %%mm5, %%mm4                                \n\t"\
1997
                "pxor %%mm6, %%mm5                                \n\t"\
1998
                "por %%mm3, %%mm1                                \n\t"\
1999
                "por %%mm4, %%mm2                                \n\t"\
2000
                "por %%mm5, %%mm0                                \n\t"\
2001
                "pand %%mm2, %%mm0                                \n\t"\
2002
                "pand %%mm1, %%mm0                                \n\t"\
2003
                "movq %%mm0, " #b "                                \n\t"
2004

    
2005
MEDIAN((%0), (%%eax), (%%eax, %1))
2006
MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
2007
MEDIAN((%0, %1, 4), (%%edx), (%%edx, %1))
2008
MEDIAN((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8))
2009

    
2010
                : : "r" (src), "r" (stride)
2011
                : "%eax", "%edx"
2012
        );
2013
#endif // MMX
2014
#else
2015
        int x, y;
2016
        src+= 4*stride;
2017
        // FIXME - there should be a way to do a few columns in parallel like w/mmx
2018
        for(x=0; x<8; x++)
2019
        {
2020
                uint8_t *colsrc = src;
2021
                for (y=0; y<4; y++)
2022
                {
2023
                        int a, b, c, d, e, f;
2024
                        a = colsrc[0       ];
2025
                        b = colsrc[stride  ];
2026
                        c = colsrc[stride*2];
2027
                        d = (a-b)>>31;
2028
                        e = (b-c)>>31;
2029
                        f = (c-a)>>31;
2030
                        colsrc[stride  ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
2031
                        colsrc += stride*2;
2032
                }
2033
                src++;
2034
        }
2035
#endif
2036
}
2037

    
2038
#ifdef HAVE_MMX
2039
/**
2040
 * transposes and shift the given 8x8 Block into dst1 and dst2
2041
 */
2042
static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
2043
{
2044
        asm(
2045
                "leal (%0, %1), %%eax                                \n\t"
2046
//        0        1        2        3        4        5        6        7        8        9
2047
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1
2048
                "movq (%0), %%mm0                \n\t" // 12345678
2049
                "movq (%%eax), %%mm1                \n\t" // abcdefgh
2050
                "movq %%mm0, %%mm2                \n\t" // 12345678
2051
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
2052
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
2053

    
2054
                "movq (%%eax, %1), %%mm1        \n\t"
2055
                "movq (%%eax, %1, 2), %%mm3        \n\t"
2056
                "movq %%mm1, %%mm4                \n\t"
2057
                "punpcklbw %%mm3, %%mm1                \n\t"
2058
                "punpckhbw %%mm3, %%mm4                \n\t"
2059

    
2060
                "movq %%mm0, %%mm3                \n\t"
2061
                "punpcklwd %%mm1, %%mm0                \n\t"
2062
                "punpckhwd %%mm1, %%mm3                \n\t"
2063
                "movq %%mm2, %%mm1                \n\t"
2064
                "punpcklwd %%mm4, %%mm2                \n\t"
2065
                "punpckhwd %%mm4, %%mm1                \n\t"
2066

    
2067
                "movd %%mm0, 128(%2)                \n\t"
2068
                "psrlq $32, %%mm0                \n\t"
2069
                "movd %%mm0, 144(%2)                \n\t"
2070
                "movd %%mm3, 160(%2)                \n\t"
2071
                "psrlq $32, %%mm3                \n\t"
2072
                "movd %%mm3, 176(%2)                \n\t"
2073
                "movd %%mm3, 48(%3)                \n\t"
2074
                "movd %%mm2, 192(%2)                \n\t"
2075
                "movd %%mm2, 64(%3)                \n\t"
2076
                "psrlq $32, %%mm2                \n\t"
2077
                "movd %%mm2, 80(%3)                \n\t"
2078
                "movd %%mm1, 96(%3)                \n\t"
2079
                "psrlq $32, %%mm1                \n\t"
2080
                "movd %%mm1, 112(%3)                \n\t"
2081

    
2082
                "leal (%%eax, %1, 4), %%eax        \n\t"
2083
                
2084
                "movq (%0, %1, 4), %%mm0        \n\t" // 12345678
2085
                "movq (%%eax), %%mm1                \n\t" // abcdefgh
2086
                "movq %%mm0, %%mm2                \n\t" // 12345678
2087
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
2088
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
2089

    
2090
                "movq (%%eax, %1), %%mm1        \n\t"
2091
                "movq (%%eax, %1, 2), %%mm3        \n\t"
2092
                "movq %%mm1, %%mm4                \n\t"
2093
                "punpcklbw %%mm3, %%mm1                \n\t"
2094
                "punpckhbw %%mm3, %%mm4                \n\t"
2095

    
2096
                "movq %%mm0, %%mm3                \n\t"
2097
                "punpcklwd %%mm1, %%mm0                \n\t"
2098
                "punpckhwd %%mm1, %%mm3                \n\t"
2099
                "movq %%mm2, %%mm1                \n\t"
2100
                "punpcklwd %%mm4, %%mm2                \n\t"
2101
                "punpckhwd %%mm4, %%mm1                \n\t"
2102

    
2103
                "movd %%mm0, 132(%2)                \n\t"
2104
                "psrlq $32, %%mm0                \n\t"
2105
                "movd %%mm0, 148(%2)                \n\t"
2106
                "movd %%mm3, 164(%2)                \n\t"
2107
                "psrlq $32, %%mm3                \n\t"
2108
                "movd %%mm3, 180(%2)                \n\t"
2109
                "movd %%mm3, 52(%3)                \n\t"
2110
                "movd %%mm2, 196(%2)                \n\t"
2111
                "movd %%mm2, 68(%3)                \n\t"
2112
                "psrlq $32, %%mm2                \n\t"
2113
                "movd %%mm2, 84(%3)                \n\t"
2114
                "movd %%mm1, 100(%3)                \n\t"
2115
                "psrlq $32, %%mm1                \n\t"
2116
                "movd %%mm1, 116(%3)                \n\t"
2117

    
2118

    
2119
        :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
2120
        : "%eax"
2121
        );
2122
}
2123

    
2124
/**
2125
 * transposes the given 8x8 block
2126
 */
2127
static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
2128
{
2129
        asm(
2130
                "leal (%0, %1), %%eax                                \n\t"
2131
                "leal (%%eax, %1, 4), %%edx                        \n\t"
2132
//        0        1        2        3        4        5        6        7        8        9
2133
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1
2134
                "movq (%2), %%mm0                \n\t" // 12345678
2135
                "movq 16(%2), %%mm1                \n\t" // abcdefgh
2136
                "movq %%mm0, %%mm2                \n\t" // 12345678
2137
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
2138
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
2139

    
2140
                "movq 32(%2), %%mm1                \n\t"
2141
                "movq 48(%2), %%mm3                \n\t"
2142
                "movq %%mm1, %%mm4                \n\t"
2143
                "punpcklbw %%mm3, %%mm1                \n\t"
2144
                "punpckhbw %%mm3, %%mm4                \n\t"
2145

    
2146
                "movq %%mm0, %%mm3                \n\t"
2147
                "punpcklwd %%mm1, %%mm0                \n\t"
2148
                "punpckhwd %%mm1, %%mm3                \n\t"
2149
                "movq %%mm2, %%mm1                \n\t"
2150
                "punpcklwd %%mm4, %%mm2                \n\t"
2151
                "punpckhwd %%mm4, %%mm1                \n\t"
2152

    
2153
                "movd %%mm0, (%0)                \n\t"
2154
                "psrlq $32, %%mm0                \n\t"
2155
                "movd %%mm0, (%%eax)                \n\t"
2156
                "movd %%mm3, (%%eax, %1)        \n\t"
2157
                "psrlq $32, %%mm3                \n\t"
2158
                "movd %%mm3, (%%eax, %1, 2)        \n\t"
2159
                "movd %%mm2, (%0, %1, 4)        \n\t"
2160
                "psrlq $32, %%mm2                \n\t"
2161
                "movd %%mm2, (%%edx)                \n\t"
2162
                "movd %%mm1, (%%edx, %1)        \n\t"
2163
                "psrlq $32, %%mm1                \n\t"
2164
                "movd %%mm1, (%%edx, %1, 2)        \n\t"
2165

    
2166

    
2167
                "movq 64(%2), %%mm0                \n\t" // 12345678
2168
                "movq 80(%2), %%mm1                \n\t" // abcdefgh
2169
                "movq %%mm0, %%mm2                \n\t" // 12345678
2170
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
2171
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
2172

    
2173
                "movq 96(%2), %%mm1                \n\t"
2174
                "movq 112(%2), %%mm3                \n\t"
2175
                "movq %%mm1, %%mm4                \n\t"
2176
                "punpcklbw %%mm3, %%mm1                \n\t"
2177
                "punpckhbw %%mm3, %%mm4                \n\t"
2178

    
2179
                "movq %%mm0, %%mm3                \n\t"
2180
                "punpcklwd %%mm1, %%mm0                \n\t"
2181
                "punpckhwd %%mm1, %%mm3                \n\t"
2182
                "movq %%mm2, %%mm1                \n\t"
2183
                "punpcklwd %%mm4, %%mm2                \n\t"
2184
                "punpckhwd %%mm4, %%mm1                \n\t"
2185

    
2186
                "movd %%mm0, 4(%0)                \n\t"
2187
                "psrlq $32, %%mm0                \n\t"
2188
                "movd %%mm0, 4(%%eax)                \n\t"
2189
                "movd %%mm3, 4(%%eax, %1)        \n\t"
2190
                "psrlq $32, %%mm3                \n\t"
2191
                "movd %%mm3, 4(%%eax, %1, 2)        \n\t"
2192
                "movd %%mm2, 4(%0, %1, 4)        \n\t"
2193
                "psrlq $32, %%mm2                \n\t"
2194
                "movd %%mm2, 4(%%edx)                \n\t"
2195
                "movd %%mm1, 4(%%edx, %1)        \n\t"
2196
                "psrlq $32, %%mm1                \n\t"
2197
                "movd %%mm1, 4(%%edx, %1, 2)        \n\t"
2198

    
2199
        :: "r" (dst), "r" (dstStride), "r" (src)
2200
        : "%eax", "%edx"
2201
        );
2202
}
2203
#endif
2204
//static int test=0;
2205

    
2206
#ifndef HAVE_ALTIVEC
2207
/**
 * Temporal noise reducer for one 8x8 block.
 *
 * A difference measure between the current block (src) and the history block
 * (tempBlured) is computed, combined with the values stored for the four
 * neighbouring entries of tempBluredPast as (4*d + up + left + right + down
 * + 4) >> 3, and compared against the maxNoise thresholds to choose one of
 * four updates (history weight : current weight):
 *   - at or above maxNoise[2]             0:1  history is replaced by src, src untouched
 *   - between maxNoise[1] and maxNoise[2] 1:1  rounded average
 *   - between maxNoise[0] and maxNoise[1] 3:1
 *   - below maxNoise[0]                   7:1
 * In the three blending cases the result is written to both src and
 * tempBlured.
 *
 * @param src            current 8x8 block, rows `stride` bytes apart
 * @param stride         row distance in bytes, used for both src and tempBlured
 * @param tempBlured     history 8x8 block, always updated
 * @param tempBluredPast entry of the per-block difference store belonging to
 *                       this block; entries -256, -1, +1 and +256 relative to
 *                       it are read, entry 0 is written, and entries 127..129
 *                       are overwritten with maxNoise[0..2] as scratch space
 * @param maxNoise       the three thresholds described above
 *
 * NOTE(review): the two implementations below are not bit-identical.
 *   - The asm path stores the neighbour-smoothed value into tempBluredPast[0],
 *     the C path stores the raw sum of squared differences.
 *   - The asm path branches with "jb" (strictly below), so a value equal to
 *     maxNoise[1] is treated as the noisier case; the C path tests
 *     d > maxNoise[1] and treats it as the quieter case.
 *   - With FAST_L2_DIFF the asm computes an approximation of the squared
 *     difference, not the exact sum.
 */
static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
                                    uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
{
        // to save a register (FIXME do this outside of the loops)
        // the asm below reads these back at byte offsets 508, 512 and 516
        tempBluredPast[127]= maxNoise[0];
        tempBluredPast[128]= maxNoise[1];
        tempBluredPast[129]= maxNoise[2];

#define FAST_L2_DIFF
//#define L1_DIFF //u should change the thresholds too if u try that one
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        asm volatile(
                "leal (%2, %2, 2), %%eax                        \n\t" // 3*stride
                "leal (%2, %2, 4), %%edx                        \n\t" // 5*stride
                "leal (%%edx, %2, 2), %%ecx                        \n\t" // 7*stride
//        0        1        2        3        4        5        6        7        8        9
//        %x        %x+%2        %x+2%2        %x+eax        %x+4%2        %x+edx        %x+2eax        %x+ecx        %x+8%2
//FIXME reorder?
#ifdef L1_DIFF //needs mmx2
                "movq (%0), %%mm0                                \n\t" // L0
                "psadbw (%1), %%mm0                                \n\t" // |L0-R0|
                "movq (%0, %2), %%mm1                                \n\t" // L1
                "psadbw (%1, %2), %%mm1                                \n\t" // |L1-R1|
                "movq (%0, %2, 2), %%mm2                        \n\t" // L2
                "psadbw (%1, %2, 2), %%mm2                        \n\t" // |L2-R2|
                "movq (%0, %%eax), %%mm3                        \n\t" // L3
                "psadbw (%1, %%eax), %%mm3                        \n\t" // |L3-R3|

                "movq (%0, %2, 4), %%mm4                        \n\t" // L4
                "paddw %%mm1, %%mm0                                \n\t"
                "psadbw (%1, %2, 4), %%mm4                        \n\t" // |L4-R4|
                "movq (%0, %%edx), %%mm5                        \n\t" // L5
                "paddw %%mm2, %%mm0                                \n\t"
                "psadbw (%1, %%edx), %%mm5                        \n\t" // |L5-R5|
                "movq (%0, %%eax, 2), %%mm6                        \n\t" // L6
                "paddw %%mm3, %%mm0                                \n\t"
                "psadbw (%1, %%eax, 2), %%mm6                        \n\t" // |L6-R6|
                "movq (%0, %%ecx), %%mm7                        \n\t" // L7
                "paddw %%mm4, %%mm0                                \n\t"
                "psadbw (%1, %%ecx), %%mm7                        \n\t" // |L7-R7|
                "paddw %%mm5, %%mm6                                \n\t"
                "paddw %%mm7, %%mm6                                \n\t"
                "paddw %%mm6, %%mm0                                \n\t"
#elif defined (FAST_L2_DIFF)
                "pcmpeqb %%mm7, %%mm7                                \n\t" // all ones, used to complement R
                "movq "MANGLE(b80)", %%mm6                        \n\t"
                "pxor %%mm0, %%mm0                                \n\t" // accumulator
#define L2_DIFF_CORE(a, b)\
                "movq " #a ", %%mm5                                \n\t"\
                "movq " #b ", %%mm2                                \n\t"\
                "pxor %%mm7, %%mm2                                \n\t"\
                PAVGB(%%mm2, %%mm5)\
                "paddb %%mm6, %%mm5                                \n\t"\
                "movq %%mm5, %%mm2                                \n\t"\
                "psllw $8, %%mm5                                \n\t"\
                "pmaddwd %%mm5, %%mm5                                \n\t"\
                "pmaddwd %%mm2, %%mm2                                \n\t"\
                "paddd %%mm2, %%mm5                                \n\t"\
                "psrld $14, %%mm5                                \n\t"\
                "paddd %%mm5, %%mm0                                \n\t"

L2_DIFF_CORE((%0), (%1))
L2_DIFF_CORE((%0, %2), (%1, %2))
L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
L2_DIFF_CORE((%0, %%edx), (%1, %%edx))
L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))

#else
                "pxor %%mm7, %%mm7                                \n\t" // zero, for byte -> word unpacking
                "pxor %%mm0, %%mm0                                \n\t" // accumulator
#define L2_DIFF_CORE(a, b)\
                "movq " #a ", %%mm5                                \n\t"\
                "movq " #b ", %%mm2                                \n\t"\
                "movq %%mm5, %%mm1                                \n\t"\
                "movq %%mm2, %%mm3                                \n\t"\
                "punpcklbw %%mm7, %%mm5                                \n\t"\
                "punpckhbw %%mm7, %%mm1                                \n\t"\
                "punpcklbw %%mm7, %%mm2                                \n\t"\
                "punpckhbw %%mm7, %%mm3                                \n\t"\
                "psubw %%mm2, %%mm5                                \n\t"\
                "psubw %%mm3, %%mm1                                \n\t"\
                "pmaddwd %%mm5, %%mm5                                \n\t"\
                "pmaddwd %%mm1, %%mm1                                \n\t"\
                "paddd %%mm1, %%mm5                                \n\t"\
                "paddd %%mm5, %%mm0                                \n\t"

L2_DIFF_CORE((%0), (%1))
L2_DIFF_CORE((%0, %2), (%1, %2))
L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
L2_DIFF_CORE((%0, %%edx), (%1, %%edx))
L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))

#endif

                // fold the two dword partial sums and smooth with the neighbours
                "movq %%mm0, %%mm4                                \n\t"
                "psrlq $32, %%mm0                                \n\t"
                "paddd %%mm0, %%mm4                                \n\t"
                "movd %%mm4, %%ecx                                \n\t"
                "shll $2, %%ecx                                        \n\t" // 4*d
                "movl %3, %%edx                                        \n\t" // edx = tempBluredPast
                "addl -4(%%edx), %%ecx                                \n\t" // tempBluredPast[-1]
                "addl 4(%%edx), %%ecx                                \n\t" // tempBluredPast[+1]
                "addl -1024(%%edx), %%ecx                        \n\t" // tempBluredPast[-256]
                "addl $4, %%ecx                                        \n\t" // rounding
                "addl 1024(%%edx), %%ecx                        \n\t" // tempBluredPast[+256]
                "shrl $3, %%ecx                                        \n\t"
                "movl %%ecx, (%%edx)                                \n\t" // store the smoothed value

                "cmpl 512(%%edx), %%ecx                                \n\t" // maxNoise[1]
                " jb 2f                                                \n\t"
                "cmpl 516(%%edx), %%ecx                                \n\t" // maxNoise[2]
                " jb 1f                                                \n\t"

                // fall through: history := current block
                // edx and ecx were reused above, so 5*stride and 7*stride are rebuilt
                "leal (%%eax, %2, 2), %%edx                        \n\t" // 5*stride
                "leal (%%edx, %2, 2), %%ecx                        \n\t" // 7*stride
                "movq (%0), %%mm0                                \n\t" // L0
                "movq (%0, %2), %%mm1                                \n\t" // L1
                "movq (%0, %2, 2), %%mm2                        \n\t" // L2
                "movq (%0, %%eax), %%mm3                        \n\t" // L3
                "movq (%0, %2, 4), %%mm4                        \n\t" // L4
                "movq (%0, %%edx), %%mm5                        \n\t" // L5
                "movq (%0, %%eax, 2), %%mm6                        \n\t" // L6
                "movq (%0, %%ecx), %%mm7                        \n\t" // L7
                "movq %%mm0, (%1)                                \n\t" // L0
                "movq %%mm1, (%1, %2)                                \n\t" // L1
                "movq %%mm2, (%1, %2, 2)                        \n\t" // L2
                "movq %%mm3, (%1, %%eax)                        \n\t" // L3
                "movq %%mm4, (%1, %2, 4)                        \n\t" // L4
                "movq %%mm5, (%1, %%edx)                        \n\t" // L5
                "movq %%mm6, (%1, %%eax, 2)                        \n\t" // L6
                "movq %%mm7, (%1, %%ecx)                        \n\t" // L7
                "jmp 4f                                                \n\t"

                // 1: one averaging step, result written to both blocks
                "1:                                                \n\t"
                "leal (%%eax, %2, 2), %%edx                        \n\t" // 5*stride
                "leal (%%edx, %2, 2), %%ecx                        \n\t" // 7*stride
                "movq (%0), %%mm0                                \n\t" // L0
                PAVGB((%1), %%mm0)                                      // L0
                "movq (%0, %2), %%mm1                                \n\t" // L1
                PAVGB((%1, %2), %%mm1)                                      // L1
                "movq (%0, %2, 2), %%mm2                        \n\t" // L2
                PAVGB((%1, %2, 2), %%mm2)                              // L2
                "movq (%0, %%eax), %%mm3                        \n\t" // L3
                PAVGB((%1, %%eax), %%mm3)                              // L3
                "movq (%0, %2, 4), %%mm4                        \n\t" // L4
                PAVGB((%1, %2, 4), %%mm4)                              // L4
                "movq (%0, %%edx), %%mm5                        \n\t" // L5
                PAVGB((%1, %%edx), %%mm5)                              // L5
                "movq (%0, %%eax, 2), %%mm6                        \n\t" // L6
                PAVGB((%1, %%eax, 2), %%mm6)                              // L6
                "movq (%0, %%ecx), %%mm7                        \n\t" // L7
                PAVGB((%1, %%ecx), %%mm7)                              // L7
                "movq %%mm0, (%1)                                \n\t" // R0
                "movq %%mm1, (%1, %2)                                \n\t" // R1
                "movq %%mm2, (%1, %2, 2)                        \n\t" // R2
                "movq %%mm3, (%1, %%eax)                        \n\t" // R3
                "movq %%mm4, (%1, %2, 4)                        \n\t" // R4
                "movq %%mm5, (%1, %%edx)                        \n\t" // R5
                "movq %%mm6, (%1, %%eax, 2)                        \n\t" // R6
                "movq %%mm7, (%1, %%ecx)                        \n\t" // R7
                "movq %%mm0, (%0)                                \n\t" // L0
                "movq %%mm1, (%0, %2)                                \n\t" // L1
                "movq %%mm2, (%0, %2, 2)                        \n\t" // L2
                "movq %%mm3, (%0, %%eax)                        \n\t" // L3
                "movq %%mm4, (%0, %2, 4)                        \n\t" // L4
                "movq %%mm5, (%0, %%edx)                        \n\t" // L5
                "movq %%mm6, (%0, %%eax, 2)                        \n\t" // L6
                "movq %%mm7, (%0, %%ecx)                        \n\t" // L7
                "jmp 4f                                                \n\t"

                "2:                                                \n\t"
                "cmpl 508(%%edx), %%ecx                                \n\t" // maxNoise[0]
                " jb 3f                                                \n\t"

                // fall through: two averaging steps against the history block
                "leal (%%eax, %2, 2), %%edx                        \n\t" // 5*stride
                "leal (%%edx, %2, 2), %%ecx                        \n\t" // 7*stride
                "movq (%0), %%mm0                                \n\t" // L0
                "movq (%0, %2), %%mm1                                \n\t" // L1
                "movq (%0, %2, 2), %%mm2                        \n\t" // L2
                "movq (%0, %%eax), %%mm3                        \n\t" // L3
                "movq (%1), %%mm4                                \n\t" // R0
                "movq (%1, %2), %%mm5                                \n\t" // R1
                "movq (%1, %2, 2), %%mm6                        \n\t" // R2
                "movq (%1, %%eax), %%mm7                        \n\t" // R3
                PAVGB(%%mm4, %%mm0)
                PAVGB(%%mm5, %%mm1)
                PAVGB(%%mm6, %%mm2)
                PAVGB(%%mm7, %%mm3)
                PAVGB(%%mm4, %%mm0)
                PAVGB(%%mm5, %%mm1)
                PAVGB(%%mm6, %%mm2)
                PAVGB(%%mm7, %%mm3)
                "movq %%mm0, (%1)                                \n\t" // R0
                "movq %%mm1, (%1, %2)                                \n\t" // R1
                "movq %%mm2, (%1, %2, 2)                        \n\t" // R2
                "movq %%mm3, (%1, %%eax)                        \n\t" // R3
                "movq %%mm0, (%0)                                \n\t" // L0
                "movq %%mm1, (%0, %2)                                \n\t" // L1
                "movq %%mm2, (%0, %2, 2)                        \n\t" // L2
                "movq %%mm3, (%0, %%eax)                        \n\t" // L3

                "movq (%0, %2, 4), %%mm0                        \n\t" // L4
                "movq (%0, %%edx), %%mm1                        \n\t" // L5
                "movq (%0, %%eax, 2), %%mm2                        \n\t" // L6
                "movq (%0, %%ecx), %%mm3                        \n\t" // L7
                "movq (%1, %2, 4), %%mm4                        \n\t" // R4
                "movq (%1, %%edx), %%mm5                        \n\t" // R5
                "movq (%1, %%eax, 2), %%mm6                        \n\t" // R6
                "movq (%1, %%ecx), %%mm7                        \n\t" // R7
                PAVGB(%%mm4, %%mm0)
                PAVGB(%%mm5, %%mm1)
                PAVGB(%%mm6, %%mm2)
                PAVGB(%%mm7, %%mm3)
                PAVGB(%%mm4, %%mm0)
                PAVGB(%%mm5, %%mm1)
                PAVGB(%%mm6, %%mm2)
                PAVGB(%%mm7, %%mm3)
                "movq %%mm0, (%1, %2, 4)                        \n\t" // R4
                "movq %%mm1, (%1, %%edx)                        \n\t" // R5
                "movq %%mm2, (%1, %%eax, 2)                        \n\t" // R6
                "movq %%mm3, (%1, %%ecx)                        \n\t" // R7
                "movq %%mm0, (%0, %2, 4)                        \n\t" // L4
                "movq %%mm1, (%0, %%edx)                        \n\t" // L5
                "movq %%mm2, (%0, %%eax, 2)                        \n\t" // L6
                "movq %%mm3, (%0, %%ecx)                        \n\t" // L7
                "jmp 4f                                                \n\t"

                // 3: three averaging steps against the history block
                "3:                                                \n\t"
                "leal (%%eax, %2, 2), %%edx                        \n\t" // 5*stride
                "leal (%%edx, %2, 2), %%ecx                        \n\t" // 7*stride
                "movq (%0), %%mm0                                \n\t" // L0
                "movq (%0, %2), %%mm1                                \n\t" // L1
                "movq (%0, %2, 2), %%mm2                        \n\t" // L2
                "movq (%0, %%eax), %%mm3                        \n\t" // L3
                "movq (%1), %%mm4                                \n\t" // R0
                "movq (%1, %2), %%mm5                                \n\t" // R1
                "movq (%1, %2, 2), %%mm6                        \n\t" // R2
                "movq (%1, %%eax), %%mm7                        \n\t" // R3
                PAVGB(%%mm4, %%mm0)
                PAVGB(%%mm5, %%mm1)
                PAVGB(%%mm6, %%mm2)
                PAVGB(%%mm7, %%mm3)
                PAVGB(%%mm4, %%mm0)
                PAVGB(%%mm5, %%mm1)
                PAVGB(%%mm6, %%mm2)
                PAVGB(%%mm7, %%mm3)
                PAVGB(%%mm4, %%mm0)
                PAVGB(%%mm5, %%mm1)
                PAVGB(%%mm6, %%mm2)
                PAVGB(%%mm7, %%mm3)
                "movq %%mm0, (%1)                                \n\t" // R0
                "movq %%mm1, (%1, %2)                                \n\t" // R1
                "movq %%mm2, (%1, %2, 2)                        \n\t" // R2
                "movq %%mm3, (%1, %%eax)                        \n\t" // R3
                "movq %%mm0, (%0)                                \n\t" // L0
                "movq %%mm1, (%0, %2)                                \n\t" // L1
                "movq %%mm2, (%0, %2, 2)                        \n\t" // L2
                "movq %%mm3, (%0, %%eax)                        \n\t" // L3

                "movq (%0, %2, 4), %%mm0                        \n\t" // L4
                "movq (%0, %%edx), %%mm1                        \n\t" // L5
                "movq (%0, %%eax, 2), %%mm2                        \n\t" // L6
                "movq (%0, %%ecx), %%mm3                        \n\t" // L7
                "movq (%1, %2, 4), %%mm4                        \n\t" // R4
                "movq (%1, %%edx), %%mm5                        \n\t" // R5
                "movq (%1, %%eax, 2), %%mm6                        \n\t" // R6
                "movq (%1, %%ecx), %%mm7                        \n\t" // R7
                PAVGB(%%mm4, %%mm0)
                PAVGB(%%mm5, %%mm1)
                PAVGB(%%mm6, %%mm2)
                PAVGB(%%mm7, %%mm3)
                PAVGB(%%mm4, %%mm0)
                PAVGB(%%mm5, %%mm1)
                PAVGB(%%mm6, %%mm2)
                PAVGB(%%mm7, %%mm3)
                PAVGB(%%mm4, %%mm0)
                PAVGB(%%mm5, %%mm1)
                PAVGB(%%mm6, %%mm2)
                PAVGB(%%mm7, %%mm3)
                "movq %%mm0, (%1, %2, 4)                        \n\t" // R4
                "movq %%mm1, (%1, %%edx)                        \n\t" // R5
                "movq %%mm2, (%1, %%eax, 2)                        \n\t" // R6
                "movq %%mm3, (%1, %%ecx)                        \n\t" // R7
                "movq %%mm0, (%0, %2, 4)                        \n\t" // L4
                "movq %%mm1, (%0, %%edx)                        \n\t" // L5
                "movq %%mm2, (%0, %%eax, 2)                        \n\t" // L6
                "movq %%mm3, (%0, %%ecx)                        \n\t" // L7

                "4:                                                \n\t"

                :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast)
                : "%eax", "%edx", "%ecx", "memory"
                );
#else
{
        int y;
        int d=0;
        int rawDiff;

        // sum of squared differences between the history and the current block
        for(y=0; y<8; y++)
        {
                int x;
                for(x=0; x<8; x++)
                {
                        int ref= tempBlured[ x + y*stride ];
                        int cur= src[ x + y*stride ];
                        int d1=ref - cur;
                        d+= d1*d1;
                }
        }
        rawDiff=d;
        // smooth the decision value with the entries stored for the neighbours
        d=         (
                4*d
                +(*(tempBluredPast-256))
                +(*(tempBluredPast-1))+ (*(tempBluredPast+1))
                +(*(tempBluredPast+256))
                +4)>>3;
        // the unsmoothed sum is what gets stored for later calls
        *tempBluredPast=rawDiff;

/*
Switch between
 1  0  0  0  0  0  0  (0)
64 32 16  8  4  2  1  (1)
64 48 36 27 20 15 11 (33) (approx)
64 56 49 43 37 33 29 (200) (approx)
*/
        if(d > maxNoise[1])
        {
                if(d < maxNoise[2])
                {
                        // rounded average of history and current block
                        for(y=0; y<8; y++)
                        {
                                int x;
                                for(x=0; x<8; x++)
                                {
                                        int ref= tempBlured[ x + y*stride ];
                                        int cur= src[ x + y*stride ];
                                        tempBlured[ x + y*stride ]=
                                        src[ x + y*stride ]=
                                                (ref + cur + 1)>>1;
                                }
                        }
                }
                else
                {
                        // too different: restart the history from the current block
                        for(y=0; y<8; y++)
                        {
                                int x;
                                for(x=0; x<8; x++)
                                {
                                        tempBlured[ x + y*stride ]= src[ x + y*stride ];
                                }
                        }
                }
        }
        else
        {
                if(d < maxNoise[0])
                {
                        // 7 parts history, 1 part current block
                        for(y=0; y<8; y++)
                        {
                                int x;
                                for(x=0; x<8; x++)
                                {
                                        int ref= tempBlured[ x + y*stride ];
                                        int cur= src[ x + y*stride ];
                                        tempBlured[ x + y*stride ]=
                                        src[ x + y*stride ]=
                                                (ref*7 + cur + 4)>>3;
                                }
                        }
                }
                else
                {
                        // 3 parts history, 1 part current block
                        for(y=0; y<8; y++)
                        {
                                int x;
                                for(x=0; x<8; x++)
                                {
                                        int ref= tempBlured[ x + y*stride ];
                                        int cur= src[ x + y*stride ];
                                        tempBlured[ x + y*stride ]=
                                        src[ x + y*stride ]=
                                                (ref*3 + cur + 2)>>2;
                                }
                        }
                }
        }
}
#endif
}
2615
#endif //HAVE_ALTIVEC
2616

    
2617
#ifdef HAVE_MMX
2618
/**
2619
 * accurate deblock filter
2620
 */
2621
static always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){
2622
        int64_t dc_mask, eq_mask;
2623
        int64_t sums[10*8*2];
2624
        src+= step*3; // src points to begin of the 8x8 Block
2625
//START_TIMER
2626
asm volatile(
2627
                "movq %0, %%mm7                                        \n\t" 
2628
                "movq %1, %%mm6                                        \n\t" 
2629
                : : "m" (c->mmxDcOffset[c->nonBQP]),  "m" (c->mmxDcThreshold[c->nonBQP])
2630
                );
2631
                
2632
asm volatile(
2633
                "leal (%2, %3), %%eax                                \n\t"
2634
//        0        1        2        3        4        5        6        7        8        9
2635
//        %1        eax        eax+%2        eax+2%2        %1+4%2        ecx        ecx+%2        ecx+2%2        %1+8%2        ecx+4%2
2636

    
2637
                "movq (%2), %%mm0                                \n\t"
2638
                "movq (%%eax), %%mm1                                \n\t"
2639
                "movq %%mm1, %%mm3                                \n\t"
2640
                "movq %%mm1, %%mm4                                \n\t"
2641
                "psubb %%mm1, %%mm0                                \n\t" // mm0 = differnece
2642
                "paddb %%mm7, %%mm0                                \n\t"
2643
                "pcmpgtb %%mm6, %%mm0                                \n\t"
2644

    
2645
                "movq (%%eax,%3), %%mm2                                \n\t"
2646
                PMAXUB(%%mm2, %%mm4)
2647
                PMINUB(%%mm2, %%mm3, %%mm5)
2648
                "psubb %%mm2, %%mm1                                \n\t"
2649
                "paddb %%mm7, %%mm1                                \n\t"
2650
                "pcmpgtb %%mm6, %%mm1                                \n\t"
2651
                "paddb %%mm1, %%mm0                                \n\t"
2652

    
2653
                "movq (%%eax, %3, 2), %%mm1                        \n\t"
2654
                PMAXUB(%%mm1, %%mm4)
2655
                PMINUB(%%mm1, %%mm3, %%mm5)
2656
                "psubb %%mm1, %%mm2                                \n\t"
2657
                "paddb %%mm7, %%mm2                                \n\t"
2658
                "pcmpgtb %%mm6, %%mm2                                \n\t"
2659
                "paddb %%mm2, %%mm0                                \n\t"
2660
                
2661
                "leal (%%eax, %3, 4), %%eax                        \n\t"
2662

    
2663
                "movq (%2, %3, 4), %%mm2                        \n\t"
2664
                PMAXUB(%%mm2, %%mm4)
2665
                PMINUB(%%mm2, %%mm3, %%mm5)
2666
                "psubb %%mm2, %%mm1                                \n\t"
2667
                "paddb %%mm7, %%mm1                                \n\t"
2668
                "pcmpgtb %%mm6, %%mm1                                \n\t"
2669
                "paddb %%mm1, %%mm0                                \n\t"
2670

    
2671
                "movq (%%eax), %%mm1                                \n\t"
2672
                PMAXUB(%%mm1, %%mm4)
2673
                PMINUB(%%mm1, %%mm3, %%mm5)
2674
                "psubb %%mm1, %%mm2                                \n\t"
2675
                "paddb %%mm7, %%mm2                                \n\t"
2676
                "pcmpgtb %%mm6, %%mm2                                \n\t"
2677
                "paddb %%mm2, %%mm0                                \n\t"
2678

    
2679
                "movq (%%eax, %3), %%mm2                        \n\t"
2680
                PMAXUB(%%mm2, %%mm4)
2681
                PMINUB(%%mm2, %%mm3, %%mm5)
2682
                "psubb %%mm2, %%mm1                                \n\t"
2683
                "paddb %%mm7, %%mm1                                \n\t"
2684
                "pcmpgtb %%mm6, %%mm1                                \n\t"
2685
                "paddb %%mm1, %%mm0                                \n\t"
2686

    
2687
                "movq (%%eax, %3, 2), %%mm1                        \n\t"
2688
                PMAXUB(%%mm1, %%mm4)
2689
                PMINUB(%%mm1, %%mm3, %%mm5)
2690
                "psubb %%mm1, %%mm2                                \n\t"
2691
                "paddb %%mm7, %%mm2                                \n\t"
2692
                "pcmpgtb %%mm6, %%mm2                                \n\t"
2693
                "paddb %%mm2, %%mm0                                \n\t"
2694

    
2695
                "movq (%2, %3, 8), %%mm2                        \n\t"
2696
                PMAXUB(%%mm2, %%mm4)
2697
                PMINUB(%%mm2, %%mm3, %%mm5)
2698
                "psubb %%mm2, %%mm1                                \n\t"
2699
                "paddb %%mm7, %%mm1                                \n\t"
2700
                "pcmpgtb %%mm6, %%mm1                                \n\t"
2701
                "paddb %%mm1, %%mm0                                \n\t"
2702

    
2703
                "movq (%%eax, %3, 4), %%mm1                        \n\t"
2704
                "psubb %%mm1, %%mm2                                \n\t"
2705
                "paddb %%mm7, %%mm2                                \n\t"
2706
                "pcmpgtb %%mm6, %%mm2                                \n\t"
2707
                "paddb %%mm2, %%mm0                                \n\t"
2708
                "psubusb %%mm3, %%mm4                                \n\t"
2709

    
2710
                "movq %4, %%mm7                                        \n\t" // QP,..., QP
2711
                "paddusb %%mm7, %%mm7                                \n\t" // 2QP ... 2QP
2712
                "pcmpgtb %%mm4, %%mm7                                \n\t" // Diff < 2QP -> FF
2713
                "movq %%mm7, %1                                        \n\t"
2714

    
2715
                "pxor %%mm6, %%mm6                                \n\t"
2716
                "movq %5, %%mm7                                        \n\t"
2717
                "punpcklbw %%mm7, %%mm7                                \n\t"
2718
                "punpcklbw %%mm7, %%mm7                                \n\t"
2719
                "punpcklbw %%mm7, %%mm7                                \n\t"
2720
                "psubb %%mm0, %%mm6                                \n\t"
2721
                "pcmpgtb %%mm7, %%mm6                                \n\t"
2722
                "movq %%mm6, %0                                        \n\t"
2723

    
2724
                : "=m" (eq_mask), "=m" (dc_mask)
2725
                : "r" (src), "r" (step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
2726
                : "%eax"
2727
                );
2728

    
2729
        if(dc_mask & eq_mask){
2730
                int offset= -8*step;
2731
                int64_t *temp_sums= sums;
2732

    
2733
                asm volatile(
2734
                "movq %2, %%mm0                                        \n\t"  // QP,..., QP
2735
                "pxor %%mm4, %%mm4                                \n\t"
2736

    
2737
                "movq (%0), %%mm6                                \n\t"
2738
                "movq (%0, %1), %%mm5                                \n\t"
2739
                "movq %%mm5, %%mm1                                \n\t"
2740
                "movq %%mm6, %%mm2                                \n\t"
2741
                "psubusb %%mm6, %%mm5                                \n\t"
2742
                "psubusb %%mm1, %%mm2                                \n\t"
2743
                "por %%mm5, %%mm2                                \n\t" // ABS Diff of lines
2744
                "psubusb %%mm2, %%mm0                                \n\t" // diff >= QP -> 0
2745
                "pcmpeqb %%mm4, %%mm0                                \n\t" // diff >= QP -> FF
2746

    
2747
                "pxor %%mm6, %%mm1                                \n\t"
2748
                "pand %%mm0, %%mm1                                \n\t"
2749
                "pxor %%mm1, %%mm6                                \n\t"
2750
                // 0:QP  6:First
2751

    
2752
                "movq (%0, %1, 8), %%mm5                        \n\t"
2753
                "addl %1, %0                                        \n\t" // %0 points to line 1 not 0
2754
                "movq (%0, %1, 8), %%mm7                        \n\t"
2755
                "movq %%mm5, %%mm1                                \n\t"
2756
                "movq %%mm7, %%mm2                                \n\t"
2757
                "psubusb %%mm7, %%mm5                                \n\t"
2758
                "psubusb %%mm1, %%mm2                                \n\t"
2759
                "por %%mm5, %%mm2                                \n\t" // ABS Diff of lines
2760
                "movq %2, %%mm0                                        \n\t"  // QP,..., QP
2761
                "psubusb %%mm2, %%mm0                                \n\t" // diff >= QP -> 0
2762
                "pcmpeqb %%mm4, %%mm0                                \n\t" // diff >= QP -> FF
2763

    
2764
                "pxor %%mm7, %%mm1                                \n\t"
2765
                "pand %%mm0, %%mm1                                \n\t"
2766
                "pxor %%mm1, %%mm7                                \n\t"
2767
                
2768
                "movq %%mm6, %%mm5                                \n\t"
2769
                "punpckhbw %%mm4, %%mm6                                \n\t"
2770
                "punpcklbw %%mm4, %%mm5                                \n\t"
2771
                // 4:0 5/6:First 7:Last
2772

    
2773
                "movq %%mm5, %%mm0                                \n\t"
2774
                "movq %%mm6, %%mm1                                \n\t"
2775
                "psllw $2, %%mm0                                \n\t"
2776
                "psllw $2, %%mm1                                \n\t"
2777
                "paddw "MANGLE(w04)", %%mm0                        \n\t"
2778
                "paddw "MANGLE(w04)", %%mm1                        \n\t"
2779

    
2780
#define NEXT\
2781
                "movq (%0), %%mm2                                \n\t"\
2782
                "movq (%0), %%mm3                                \n\t"\
2783
                "addl %1, %0                                        \n\t"\
2784
                "punpcklbw %%mm4, %%mm2                                \n\t"\
2785
                "punpckhbw %%mm4, %%mm3                                \n\t"\
2786
                "paddw %%mm2, %%mm0                                \n\t"\
2787
                "paddw %%mm3, %%mm1                                \n\t"
2788

    
2789
#define PREV\
2790
                "movq (%0), %%mm2                                \n\t"\
2791
                "movq (%0), %%mm3                                \n\t"\
2792
                "addl %1, %0                                        \n\t"\
2793
                "punpcklbw %%mm4, %%mm2                                \n\t"\
2794
                "punpckhbw %%mm4, %%mm3                                \n\t"\
2795
                "psubw %%mm2, %%mm0                                \n\t"\
2796
                "psubw %%mm3, %%mm1                                \n\t"
2797

    
2798
                                
2799
                NEXT //0
2800
                NEXT //1
2801
                NEXT //2
2802
                "movq %%mm0, (%3)                                \n\t"
2803
                "movq %%mm1, 8(%3)                                \n\t"
2804

    
2805
                NEXT //3
2806
                "psubw %%mm5, %%mm0                                \n\t"
2807
                "psubw %%mm6, %%mm1                                \n\t"
2808
                "movq %%mm0, 16(%3)                                \n\t"
2809
                "movq %%mm1, 24(%3)                                \n\t"
2810

    
2811
                NEXT //4
2812
                "psubw %%mm5, %%mm0                                \n\t"
2813
                "psubw %%mm6, %%mm1                                \n\t"
2814
                "movq %%mm0, 32(%3)                                \n\t"
2815
                "movq %%mm1, 40(%3)                                \n\t"
2816

    
2817
                NEXT //5
2818
                "psubw %%mm5, %%mm0                                \n\t"
2819
                "psubw %%mm6, %%mm1                                \n\t"
2820
                "movq %%mm0, 48(%3)                                \n\t"
2821
                "movq %%mm1, 56(%3)                                \n\t"
2822

    
2823
                NEXT //6
2824
                "psubw %%mm5, %%mm0                                \n\t"
2825
                "psubw %%mm6, %%mm1                                \n\t"
2826
                "movq %%mm0, 64(%3)                                \n\t"
2827
                "movq %%mm1, 72(%3)                                \n\t"
2828

    
2829
                "movq %%mm7, %%mm6                                \n\t"
2830
                "punpckhbw %%mm4, %%mm7                                \n\t"
2831
                "punpcklbw %%mm4, %%mm6                                \n\t"
2832
                
2833
                NEXT //7
2834
                "movl %4, %0                                        \n\t"
2835
                "addl %1, %0                                        \n\t"
2836
                PREV //0
2837
                "movq %%mm0, 80(%3)                                \n\t"
2838
                "movq %%mm1, 88(%3)                                \n\t"
2839

    
2840
                PREV //1
2841
                "paddw %%mm6, %%mm0                                \n\t"
2842
                "paddw %%mm7, %%mm1                                \n\t"
2843
                "movq %%mm0, 96(%3)                                \n\t"
2844
                "movq %%mm1, 104(%3)                                \n\t"
2845
                
2846
                PREV //2
2847
                "paddw %%mm6, %%mm0                                \n\t"
2848
                "paddw %%mm7, %%mm1                                \n\t"
2849
                "movq %%mm0, 112(%3)                                \n\t"
2850
                "movq %%mm1, 120(%3)                                \n\t"
2851

    
2852
                PREV //3
2853
                "paddw %%mm6, %%mm0                                \n\t"
2854
                "paddw %%mm7, %%mm1                                \n\t"
2855
                "movq %%mm0, 128(%3)                                \n\t"
2856
                "movq %%mm1, 136(%3)                                \n\t"
2857

    
2858
                PREV //4
2859
                "paddw %%mm6, %%mm0                                \n\t"
2860
                "paddw %%mm7, %%mm1                                \n\t"
2861
                "movq %%mm0, 144(%3)                                \n\t"
2862
                "movq %%mm1, 152(%3)                                \n\t"
2863

    
2864
                "movl %4, %0                                        \n\t" //FIXME
2865

    
2866
                : "+&r"(src)
2867
                : "r" (step), "m" (c->pQPb), "r"(sums), "g"(src)
2868
                );
2869

    
2870
                src+= step; // src points to begin of the 8x8 Block
2871

    
2872
                asm volatile(
2873
                "movq %4, %%mm6                                        \n\t"
2874
                "pcmpeqb %%mm5, %%mm5                                \n\t"
2875
                "pxor %%mm6, %%mm5                                \n\t"
2876
                "pxor %%mm7, %%mm7                                \n\t"
2877

    
2878
                "1:                                                \n\t"
2879
                "movq (%1), %%mm0                                \n\t"
2880
                "movq 8(%1), %%mm1                                \n\t"
2881
                "paddw 32(%1), %%mm0                                \n\t"
2882
                "paddw 40(%1), %%mm1                                \n\t"
2883
                "movq (%0, %3), %%mm2                                \n\t"
2884
                "movq %%mm2, %%mm3                                \n\t"
2885
                "movq %%mm2, %%mm4                                \n\t"
2886
                "punpcklbw %%mm7, %%mm2                                \n\t"
2887
                "punpckhbw %%mm7, %%mm3                                \n\t"
2888
                "paddw %%mm2, %%mm0                                \n\t"
2889
                "paddw %%mm3, %%mm1                                \n\t"
2890
                "paddw %%mm2, %%mm0                                \n\t"
2891
                "paddw %%mm3, %%mm1                                \n\t"
2892
                "psrlw $4, %%mm0                                \n\t"
2893
                "psrlw $4, %%mm1                                \n\t"
2894
                "packuswb %%mm1, %%mm0                                \n\t"
2895
                "pand %%mm6, %%mm0                                \n\t"
2896
                "pand %%mm5, %%mm4                                \n\t"
2897
                "por %%mm4, %%mm0                                \n\t"
2898
                "movq %%mm0, (%0, %3)                                \n\t"
2899
                "addl $16, %1                                        \n\t"
2900
                "addl %2, %0                                        \n\t"
2901
                " js 1b                                                \n\t"
2902

    
2903
                : "+r"(offset), "+r"(temp_sums)
2904
                : "r" (step), "r"(src - offset), "m"(dc_mask & eq_mask)
2905
                );
2906
        }else
2907
                src+= step; // src points to begin of the 8x8 Block
2908

    
2909
        if(eq_mask != -1LL){
2910
                uint8_t *temp_src= src;
2911
                asm volatile(
2912
                "pxor %%mm7, %%mm7                                \n\t"
2913
                "leal -40(%%esp), %%ecx                                \n\t" // make space for 4 8-byte vars
2914
                "andl $0xFFFFFFF8, %%ecx                        \n\t" // align
2915
//        0        1        2        3        4        5        6        7        8        9
2916
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ecx        ecx+%1        ecx+2%1        %1+8%1        ecx+4%1
2917

    
2918
                "movq (%0), %%mm0                                \n\t"
2919
                "movq %%mm0, %%mm1                                \n\t"
2920
                "punpcklbw %%mm7, %%mm0                                \n\t" // low part of line 0
2921
                "punpckhbw %%mm7, %%mm1                                \n\t" // high part of line 0
2922

    
2923
                "movq (%0, %1), %%mm2                                \n\t"
2924
                "leal (%0, %1, 2), %%eax                        \n\t"
2925
                "movq %%mm2, %%mm3                                \n\t"
2926
                "punpcklbw %%mm7, %%mm2                                \n\t" // low part of line 1
2927
                "punpckhbw %%mm7, %%mm3                                \n\t" // high part of line 1
2928

    
2929
                "movq (%%eax), %%mm4                                \n\t"
2930
                "movq %%mm4, %%mm5                                \n\t"
2931
                "punpcklbw %%mm7, %%mm4                                \n\t" // low part of line 2
2932
                "punpckhbw %%mm7, %%mm5                                \n\t" // high part of line 2
2933

    
2934
                "paddw %%mm0, %%mm0                                \n\t" // 2L0
2935
                "paddw %%mm1, %%mm1                                \n\t" // 2H0
2936
                "psubw %%mm4, %%mm2                                \n\t" // L1 - L2
2937
                "psubw %%mm5, %%mm3                                \n\t" // H1 - H2
2938
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - L1 + L2
2939
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - H1 + H2
2940

    
2941
                "psllw $2, %%mm2                                \n\t" // 4L1 - 4L2
2942
                "psllw $2, %%mm3                                \n\t" // 4H1 - 4H2
2943
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - 5L1 + 5L2
2944
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - 5H1 + 5H2
2945

    
2946
                "movq (%%eax, %1), %%mm2                        \n\t"
2947
                "movq %%mm2, %%mm3                                \n\t"
2948
                "punpcklbw %%mm7, %%mm2                                \n\t" // L3
2949
                "punpckhbw %%mm7, %%mm3                                \n\t" // H3
2950

    
2951
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - 5L1 + 5L2 - L3
2952
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - 5H1 + 5H2 - H3
2953
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2954
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2955
                "movq %%mm0, (%%ecx)                                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2956
                "movq %%mm1, 8(%%ecx)                                \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2957

    
2958
                "movq (%%eax, %1, 2), %%mm0                        \n\t"
2959
                "movq %%mm0, %%mm1                                \n\t"
2960
                "punpcklbw %%mm7, %%mm0                                \n\t" // L4
2961
                "punpckhbw %%mm7, %%mm1                                \n\t" // H4
2962

    
2963
                "psubw %%mm0, %%mm2                                \n\t" // L3 - L4
2964
                "psubw %%mm1, %%mm3                                \n\t" // H3 - H4
2965
                "movq %%mm2, 16(%%ecx)                                \n\t" // L3 - L4
2966
                "movq %%mm3, 24(%%ecx)                                \n\t" // H3 - H4
2967
                "paddw %%mm4, %%mm4                                \n\t" // 2L2
2968
                "paddw %%mm5, %%mm5                                \n\t" // 2H2
2969
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - L3 + L4
2970
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - H3 + H4
2971

    
2972
                "leal (%%eax, %1), %0                                \n\t"
2973
                "psllw $2, %%mm2                                \n\t" // 4L3 - 4L4
2974
                "psllw $2, %%mm3                                \n\t" // 4H3 - 4H4
2975
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - 5L3 + 5L4
2976
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - 5H3 + 5H4
2977
//50 opcodes so far
2978
                "movq (%0, %1, 2), %%mm2                        \n\t"
2979
                "movq %%mm2, %%mm3                                \n\t"
2980
                "punpcklbw %%mm7, %%mm2                                \n\t" // L5
2981
                "punpckhbw %%mm7, %%mm3                                \n\t" // H5
2982
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - 5L3 + 5L4 - L5
2983
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - 5H3 + 5H4 - H5
2984
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - 5L3 + 5L4 - 2L5
2985
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - 5H3 + 5H4 - 2H5
2986

    
2987
                "movq (%%eax, %1, 4), %%mm6                        \n\t"
2988
                "punpcklbw %%mm7, %%mm6                                \n\t" // L6
2989
                "psubw %%mm6, %%mm2                                \n\t" // L5 - L6
2990
                "movq (%%eax, %1, 4), %%mm6                        \n\t"
2991
                "punpckhbw %%mm7, %%mm6                                \n\t" // H6
2992
                "psubw %%mm6, %%mm3                                \n\t" // H5 - H6
2993

    
2994
                "paddw %%mm0, %%mm0                                \n\t" // 2L4
2995
                "paddw %%mm1, %%mm1                                \n\t" // 2H4
2996
                "psubw %%mm2, %%mm0                                \n\t" // 2L4 - L5 + L6
2997
                "psubw %%mm3, %%mm1                                \n\t" // 2H4 - H5 + H6
2998

    
2999
                "psllw $2, %%mm2                                \n\t" // 4L5 - 4L6
3000
                "psllw $2, %%mm3                                \n\t" // 4H5 - 4H6
3001
                "psubw %%mm2, %%mm0                                \n\t" // 2L4 - 5L5 + 5L6
3002
                "psubw %%mm3, %%mm1                                \n\t" // 2H4 - 5H5 + 5H6
3003

    
3004
                "movq (%0, %1, 4), %%mm2                        \n\t"
3005
                "movq %%mm2, %%mm3                                \n\t"
3006
                "punpcklbw %%mm7, %%mm2                                \n\t" // L7
3007
                "punpckhbw %%mm7, %%mm3                                \n\t" // H7
3008

    
3009
                "paddw %%mm2, %%mm2                                \n\t" // 2L7
3010
                "paddw %%mm3, %%mm3                                \n\t" // 2H7
3011
                "psubw %%mm2, %%mm0                                \n\t" // 2L4 - 5L5 + 5L6 - 2L7
3012
                "psubw %%mm3, %%mm1                                \n\t" // 2H4 - 5H5 + 5H6 - 2H7
3013

    
3014
                "movq (%%ecx), %%mm2                                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
3015
                "movq 8(%%ecx), %%mm3                                \n\t" // 2H0 - 5H1 + 5H2 - 2H3
3016

    
3017
#ifdef HAVE_MMX2
3018
                "movq %%mm7, %%mm6                                \n\t" // 0
3019
                "psubw %%mm0, %%mm6                                \n\t"
3020
                "pmaxsw %%mm6, %%mm0                                \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
3021
                "movq %%mm7, %%mm6                                \n\t" // 0
3022
                "psubw %%mm1, %%mm6                                \n\t"
3023
                "pmaxsw %%mm6, %%mm1                                \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
3024
                "movq %%mm7, %%mm6                                \n\t" // 0
3025
                "psubw %%mm2, %%mm6                                \n\t"
3026
                "pmaxsw %%mm6, %%mm2                                \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
3027
                "movq %%mm7, %%mm6                                \n\t" // 0
3028
                "psubw %%mm3, %%mm6                                \n\t"
3029
                "pmaxsw %%mm6, %%mm3                                \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
3030
#else
3031
                "movq %%mm7, %%mm6                                \n\t" // 0
3032
                "pcmpgtw %%mm0, %%mm6                                \n\t"
3033
                "pxor %%mm6, %%mm0                                \n\t"
3034
                "psubw %%mm6, %%mm0                                \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
3035
                "movq %%mm7, %%mm6                                \n\t" // 0
3036
                "pcmpgtw %%mm1, %%mm6                                \n\t"
3037
                "pxor %%mm6, %%mm1                                \n\t"
3038
                "psubw %%mm6, %%mm1                                \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
3039
                "movq %%mm7, %%mm6                                \n\t" // 0
3040
                "pcmpgtw %%mm2, %%mm6                                \n\t"
3041
                "pxor %%mm6, %%mm2                                \n\t"
3042
                "psubw %%mm6, %%mm2                                \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
3043
                "movq %%mm7, %%mm6                                \n\t" // 0
3044
                "pcmpgtw %%mm3, %%mm6                                \n\t"
3045
                "pxor %%mm6, %%mm3                                \n\t"
3046
                "psubw %%mm6, %%mm3                                \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
3047
#endif
3048

    
3049
#ifdef HAVE_MMX2
3050
                "pminsw %%mm2, %%mm0                                \n\t"
3051
                "pminsw %%mm3, %%mm1                                \n\t"
3052
#else
3053
                "movq %%mm0, %%mm6                                \n\t"
3054
                "psubusw %%mm2, %%mm6                                \n\t"
3055
                "psubw %%mm6, %%mm0                                \n\t"
3056
                "movq %%mm1, %%mm6                                \n\t"
3057
                "psubusw %%mm3, %%mm6                                \n\t"
3058
                "psubw %%mm6, %%mm1                                \n\t"
3059
#endif
3060

    
3061
                "movd %2, %%mm2                                        \n\t" // QP
3062
                "punpcklbw %%mm7, %%mm2                                \n\t"
3063

    
3064
                "movq %%mm7, %%mm6                                \n\t" // 0
3065
                "pcmpgtw %%mm4, %%mm6                                \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
3066
                "pxor %%mm6, %%mm4                                \n\t"
3067
                "psubw %%mm6, %%mm4                                \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
3068
                "pcmpgtw %%mm5, %%mm7                                \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
3069
                "pxor %%mm7, %%mm5                                \n\t"
3070
                "psubw %%mm7, %%mm5                                \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
3071
// 100 opcodes
3072
                "psllw $3, %%mm2                                \n\t" // 8QP
3073
                "movq %%mm2, %%mm3                                \n\t" // 8QP
3074
                "pcmpgtw %%mm4, %%mm2                                \n\t"
3075
                "pcmpgtw %%mm5, %%mm3                                \n\t"
3076
                "pand %%mm2, %%mm4                                \n\t"
3077
                "pand %%mm3, %%mm5                                \n\t"
3078

    
3079

    
3080
                "psubusw %%mm0, %%mm4                                \n\t" // hd
3081
                "psubusw %%mm1, %%mm5                                \n\t" // ld
3082

    
3083

    
3084
                "movq "MANGLE(w05)", %%mm2                        \n\t" // 5
3085
                "pmullw %%mm2, %%mm4                                \n\t"
3086
                "pmullw %%mm2, %%mm5                                \n\t"
3087
                "movq "MANGLE(w20)", %%mm2                        \n\t" // 32
3088
                "paddw %%mm2, %%mm4                                \n\t"
3089
                "paddw %%mm2, %%mm5                                \n\t"
3090
                "psrlw $6, %%mm4                                \n\t"
3091
                "psrlw $6, %%mm5                                \n\t"
3092

    
3093
                "movq 16(%%ecx), %%mm0                                \n\t" // L3 - L4
3094
                "movq 24(%%ecx), %%mm1                                \n\t" // H3 - H4
3095

    
3096
                "pxor %%mm2, %%mm2                                \n\t"
3097
                "pxor %%mm3, %%mm3                                \n\t"
3098

    
3099
                "pcmpgtw %%mm0, %%mm2                                \n\t" // sign (L3-L4)
3100
                "pcmpgtw %%mm1, %%mm3                                \n\t" // sign (H3-H4)
3101
                "pxor %%mm2, %%mm0                                \n\t"
3102
                "pxor %%mm3, %%mm1                                \n\t"
3103
                "psubw %%mm2, %%mm0                                \n\t" // |L3-L4|
3104
                "psubw %%mm3, %%mm1                                \n\t" // |H3-H4|
3105
                "psrlw $1, %%mm0                                \n\t" // |L3 - L4|/2
3106
                "psrlw $1, %%mm1                                \n\t" // |H3 - H4|/2
3107

    
3108
                "pxor %%mm6, %%mm2                                \n\t"
3109
                "pxor %%mm7, %%mm3                                \n\t"
3110
                "pand %%mm2, %%mm4                                \n\t"
3111
                "pand %%mm3, %%mm5                                \n\t"
3112

    
3113
#ifdef HAVE_MMX2
3114
                "pminsw %%mm0, %%mm4                                \n\t"
3115
                "pminsw %%mm1, %%mm5                                \n\t"
3116
#else
3117
                "movq %%mm4, %%mm2                                \n\t"
3118
                "psubusw %%mm0, %%mm2                                \n\t"
3119
                "psubw %%mm2, %%mm4                                \n\t"
3120
                "movq %%mm5, %%mm2                                \n\t"
3121
                "psubusw %%mm1, %%mm2                                \n\t"
3122
                "psubw %%mm2, %%mm5                                \n\t"
3123
#endif
3124
                "pxor %%mm6, %%mm4                                \n\t"
3125
                "pxor %%mm7, %%mm5                                \n\t"
3126
                "psubw %%mm6, %%mm4                                \n\t"
3127
                "psubw %%mm7, %%mm5                                \n\t"
3128
                "packsswb %%mm5, %%mm4                                \n\t"
3129
                "movq %3, %%mm1                                        \n\t"
3130
                "pandn %%mm4, %%mm1                                \n\t"
3131
                "movq (%0), %%mm0                                \n\t"
3132
                "paddb   %%mm1, %%mm0                                \n\t"
3133
                "movq %%mm0, (%0)                                \n\t"
3134
                "movq (%0, %1), %%mm0                                \n\t"
3135
                "psubb %%mm1, %%mm0                                \n\t"
3136
                "movq %%mm0, (%0, %1)                                \n\t"
3137

    
3138
                : "+r" (temp_src)
3139
                : "r" (step), "m" (c->pQPb), "m"(eq_mask)
3140
                : "%eax", "%ecx"
3141
                );
3142
        }
3143
/*if(step==16){
3144
    STOP_TIMER("step16")
3145
}else{
3146
    STOP_TIMER("stepX")
3147
}*/
3148
}
3149
#endif //HAVE_MMX
3150

    
3151
/**
 * Forward declaration (prototype only) of the postProcess routine; the
 * function name is passed through the RENAME() macro.
 * The parameters are a source buffer and a destination buffer with their
 * respective strides, a width and a height, a QP_STORE_T array with its own
 * stride, an isColor flag and a PPContext pointer.
 * NOTE(review): presumably declared here so code above the definition can
 * call it -- confirm against the definition, which is outside this section.
 */
static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3152
        QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);
3153

    
3154
/**
3155
 * Copies a block from src to dst and fixes the blacklevel
3156
 * levelFix == 0 -> don't touch the brightness & contrast
3157
 */
3158
#undef SCALED_CPY
3159

    
3160
static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
3161
        int levelFix, int64_t *packedOffsetAndScale)
3162
{
3163
#ifndef HAVE_MMX
3164
        int i;
3165
#endif
3166
        if(levelFix)
3167
        {
3168
#ifdef HAVE_MMX
3169
                                        asm volatile(
3170
                                                "movq (%%eax), %%mm2        \n\t" // packedYOffset
3171
                                                "movq 8(%%eax), %%mm3        \n\t" // packedYScale
3172
                                                "leal (%2,%4), %%eax        \n\t"
3173
                                                "leal (%3,%5), %%edx        \n\t"
3174
                                                "pxor %%mm4, %%mm4        \n\t"
3175
#ifdef HAVE_MMX2
3176
#define SCALED_CPY(src1, src2, dst1, dst2)                                        \
3177
                                                "movq " #src1 ", %%mm0        \n\t"\
3178
                                                "movq " #src1 ", %%mm5        \n\t"\
3179
                                                "movq " #src2 ", %%mm1        \n\t"\
3180
                                                "movq " #src2 ", %%mm6        \n\t"\
3181
                                                "punpcklbw %%mm0, %%mm0 \n\t"\
3182
                                                "punpckhbw %%mm5, %%mm5 \n\t"\
3183
                                                "punpcklbw %%mm1, %%mm1 \n\t"\
3184
                                                "punpckhbw %%mm6, %%mm6 \n\t"\
3185
                                                "pmulhuw %%mm3, %%mm0        \n\t"\
3186
                                                "pmulhuw %%mm3, %%mm5        \n\t"\
3187
                                                "pmulhuw %%mm3, %%mm1        \n\t"\
3188
                                                "pmulhuw %%mm3, %%mm6        \n\t"\
3189
                                                "psubw %%mm2, %%mm0        \n\t"\
3190
                                                "psubw %%mm2, %%mm5        \n\t"\
3191
                                                "psubw %%mm2, %%mm1        \n\t"\
3192
                                                "psubw %%mm2, %%mm6        \n\t"\
3193
                                                "packuswb %%mm5, %%mm0        \n\t"\
3194
                                                "packuswb %%mm6, %%mm1        \n\t"\
3195
                                                "movq %%mm0, " #dst1 "        \n\t"\
3196
                                                "movq %%mm1, " #dst2 "        \n\t"\
3197

    
3198
#else //HAVE_MMX2
3199
#define SCALED_CPY(src1, src2, dst1, dst2)                                        \
3200
                                                "movq " #src1 ", %%mm0        \n\t"\
3201
                                                "movq " #src1 ", %%mm5        \n\t"\
3202
                                                "punpcklbw %%mm4, %%mm0 \n\t"\
3203
                                                "punpckhbw %%mm4, %%mm5 \n\t"\
3204
                                                "psubw %%mm2, %%mm0        \n\t"\
3205
                                                "psubw %%mm2, %%mm5        \n\t"\
3206
                                                "movq " #src2 ", %%mm1        \n\t"\
3207
                                                "psllw $6, %%mm0        \n\t"\
3208
                                                "psllw $6, %%mm5        \n\t"\
3209
                                                "pmulhw %%mm3, %%mm0        \n\t"\
3210
                                                "movq " #src2 ", %%mm6        \n\t"\
3211
                                                "pmulhw %%mm3, %%mm5        \n\t"\
3212
                                                "punpcklbw %%mm4, %%mm1 \n\t"\
3213
                                                "punpckhbw %%mm4, %%mm6 \n\t"\
3214
                                                "psubw %%mm2, %%mm1        \n\t"\
3215
                                                "psubw %%mm2, %%mm6        \n\t"\
3216
                                                "psllw $6, %%mm1        \n\t"\
3217
                                                "psllw $6, %%mm6        \n\t"\
3218
                                                "pmulhw %%mm3, %%mm1        \n\t"\
3219
                                                "pmulhw %%mm3, %%mm6        \n\t"\
3220
                                                "packuswb %%mm5, %%mm0        \n\t"\
3221
                                                "packuswb %%mm6, %%mm1        \n\t"\
3222
                                                "movq %%mm0, " #dst1 "        \n\t"\
3223
                                                "movq %%mm1, " #dst2 "        \n\t"\
3224

    
3225
#endif //!HAVE_MMX2
3226

    
3227
SCALED_CPY((%2)       , (%2, %4)      , (%3)       , (%3, %5))
3228
SCALED_CPY((%2, %4, 2), (%%eax, %4, 2), (%3, %5, 2), (%%edx, %5, 2))
3229
SCALED_CPY((%2, %4, 4), (%%eax, %4, 4), (%3, %5, 4), (%%edx, %5, 4))
3230
                                                "leal (%%eax,%4,4), %%eax        \n\t"
3231
                                                "leal (%%edx,%5,4), %%edx        \n\t"
3232
SCALED_CPY((%%eax, %4), (%%eax, %4, 2), (%%edx, %5), (%%edx, %5, 2))
3233

    
3234

    
3235
                                                : "=&a" (packedOffsetAndScale)
3236
                                                : "0" (packedOffsetAndScale),
3237
                                                "r"(src),
3238
                                                "r"(dst),
3239
                                                "r" (srcStride),
3240
                                                "r" (dstStride)
3241
                                                : "%edx"
3242
                                        );
3243
#else
3244
                                for(i=0; i<8; i++)
3245
                                        memcpy(        &(dst[dstStride*i]),
3246
                                                &(src[srcStride*i]), BLOCK_SIZE);
3247
#endif
3248
        }
3249
        else
3250
        {
3251
#ifdef HAVE_MMX
3252
                                        asm volatile(
3253
                                                "leal (%0,%2), %%eax