Statistics
| Branch: | Revision:

ffmpeg / libavcodec / libpostproc / postprocess_template.c @ 58c2182d

History | View | Annotate | Download (95.8 KB)

1 3057fa66 Arpi
/*
2 bdd677ac Michael Niedermayer
    Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
3 3057fa66 Arpi

4
    This program is free software; you can redistribute it and/or modify
5
    it under the terms of the GNU General Public License as published by
6
    the Free Software Foundation; either version 2 of the License, or
7
    (at your option) any later version.
8

9
    This program is distributed in the hope that it will be useful,
10
    but WITHOUT ANY WARRANTY; without even the implied warranty of
11
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
    GNU General Public License for more details.
13

14
    You should have received a copy of the GNU General Public License
15
    along with this program; if not, write to the Free Software
16
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17
*/
18
19 b304569a Michael Niedermayer
/**
20
 * @file postprocess_template.c
21
 * mmx/mmx2/3dnow postprocess code.
22
 */
23
24
25 cc9b0679 Michael Niedermayer
/*
 * Helper macros that expand to inline-asm instruction strings.
 * This template may be included several times with different HAVE_* settings,
 * presumably once per CPU flavour (TODO confirm against the including file),
 * so any previous definitions are dropped first.
 */
#undef PAVGB
26
#undef PMINUB
27
#undef PMAXUB
28 e939e1c3 Arpi
29
/*
 * PAVGB(a,b): packed unsigned byte average of a and b, result in b.
 * Emits "pavgb" on MMX2 and "pavgusb" on 3DNow. It is left undefined when
 * neither is available, so users must guard on HAVE_MMX2 || HAVE_3DNOW.
 */
#ifdef HAVE_MMX2
30
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
31
#elif defined (HAVE_3DNOW)
32
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
33
#endif
34 3057fa66 Arpi
35 2e212618 Michael Niedermayer
/*
 * PMINUB(x,y,t): per-byte unsigned minimum of x and y, result in y.
 * MMX2 uses "pminub" and ignores t. The plain MMX fallback names its
 * parameters (b,a,t), i.e. swapped relative to the MMX2 variant, and computes
 *   t = y; t = saturate(t - x); y = y - t
 * which also leaves min(x,y) in the second argument, but clobbers t.
 */
#ifdef HAVE_MMX2
36
#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
37
#elif defined (HAVE_MMX)
38
#define PMINUB(b,a,t) \
39
        "movq " #a ", " #t " \n\t"\
40
        "psubusb " #b ", " #t " \n\t"\
41
        "psubb " #t ", " #a " \n\t"
42
#endif
43
44
/*
 * PMAXUB(a,b): per-byte unsigned maximum of a and b, result in b.
 * MMX2 uses "pmaxub". The plain MMX fallback computes
 *   b = saturate(b - a); b = b + a
 */
#ifdef HAVE_MMX2
45
#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
46
#elif defined (HAVE_MMX)
47
#define PMAXUB(a,b) \
48
        "psubusb " #a ", " #b " \n\t"\
49
        "paddb " #a ", " #b " \n\t"
50
#endif
51
52 3057fa66 Arpi
//FIXME? |255-0| = 1 (shouldn't be a problem ...)
53 9c9e467d Michael Niedermayer
#ifdef HAVE_MMX
54 3057fa66 Arpi
/**
55 acced553 Michael Niedermayer
 * Check if the middle 8x8 Block in the given 8x16 block is flat
 *
 * src is advanced by 4*stride, then the 8 rows starting there are examined,
 * 8 bytes (columns) per row.
 *
 * For each of the 7 pairs of vertically adjacent rows the byte difference
 * (upper - lower, wrapping) has c->mmxDcOffset[c->nonBQP] added and is compared
 * (signed, greater-than) with c->mmxDcThreshold[c->nonBQP]; the hits are
 * counted over all columns into numEq. In parallel the per-column maximum and
 * minimum over the 8 rows are tracked, and dcOk becomes non-zero when
 * (max - min) exceeds twice the value stored in c->pQPb (saturating byte
 * arithmetic) in at least one column.
 *
 * @param src    top of the 8x16 block
 * @param stride distance in bytes between two rows
 * @param c      context; reads mmxDcOffset, mmxDcThreshold, nonBQP, pQPb and
 *               ppMode.flatnessThreshold
 * @return 2 if numEq does not exceed c->ppMode.flatnessThreshold;
 *         otherwise 0 if dcOk is non-zero and 1 if it is zero
 *
 * NOTE(review): the addressing uses 32 bit instructions (leal / %eax) on the
 * pointer operand, so this presumably targets 32 bit x86 only -- confirm.
56 3057fa66 Arpi
 */
57 cb482d25 Michael Niedermayer
static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){
58
        int numEq= 0, dcOk;
59 acced553 Michael Niedermayer
        src+= stride*4; // src points to begin of the 8x8 Block
60 37da00fc Michael Niedermayer
// Preload the constants; the second asm statement below relies on mm7/mm6
// still holding these values when it starts.
asm volatile(
61 1e79606d Michael Niedermayer
                "movq %0, %%mm7                                        \n\t" // mm7 = offset added to each difference
62
                "movq %1, %%mm6                                        \n\t" // mm6 = threshold the sum is compared with
63
                : : "m" (c->mmxDcOffset[c->nonBQP]),  "m" (c->mmxDcThreshold[c->nonBQP])
64
                );
65
                
66
asm volatile(
67 cb482d25 Michael Niedermayer
                "leal (%2, %3), %%eax                                \n\t"
68 37da00fc Michael Niedermayer
//        0        1        2        3        4        5        6        7        8        9
69 9c9e467d Michael Niedermayer
//        %1        eax        eax+%2        eax+2%2        %1+4%2        ecx        ecx+%2        ecx+2%2        %1+8%2        ecx+4%2
// NOTE: the table above uses older operand numbers. In this statement
// %2 = src and %3 = stride, ecx is not used, and eax is advanced by 4*stride
// halfway through, so rows 5..7 are addressed as eax, eax+%3, eax+2*%3.
// Register roles: mm0 = running count (each hit adds 0xFF, i.e. -1, per byte),
// mm3 = per-column minimum, mm4 = per-column maximum, mm5 = scratch for PMINUB.
70 ec487e5d Michael Niedermayer
71 cb482d25 Michael Niedermayer
                "movq (%2), %%mm0                                \n\t"
72 37da00fc Michael Niedermayer
                "movq (%%eax), %%mm1                                \n\t"
73 cb482d25 Michael Niedermayer
                "movq %%mm0, %%mm3                                \n\t"
74
                "movq %%mm0, %%mm4                                \n\t"
75
                PMAXUB(%%mm1, %%mm4)
76
                PMINUB(%%mm1, %%mm3, %%mm5)
77 3057fa66 Arpi
                "psubb %%mm1, %%mm0                                \n\t" // mm0 = difference
78
                "paddb %%mm7, %%mm0                                \n\t"
79
                "pcmpgtb %%mm6, %%mm0                                \n\t"
80
81 cb482d25 Michael Niedermayer
                "movq (%%eax,%3), %%mm2                                \n\t"
82
                PMAXUB(%%mm2, %%mm4)
83
                PMINUB(%%mm2, %%mm3, %%mm5)
84 3057fa66 Arpi
                "psubb %%mm2, %%mm1                                \n\t"
85
                "paddb %%mm7, %%mm1                                \n\t"
86
                "pcmpgtb %%mm6, %%mm1                                \n\t"
87
                "paddb %%mm1, %%mm0                                \n\t"
88
89 cb482d25 Michael Niedermayer
                "movq (%%eax, %3, 2), %%mm1                        \n\t"
90
                PMAXUB(%%mm1, %%mm4)
91
                PMINUB(%%mm1, %%mm3, %%mm5)
92 3057fa66 Arpi
                "psubb %%mm1, %%mm2                                \n\t"
93
                "paddb %%mm7, %%mm2                                \n\t"
94
                "pcmpgtb %%mm6, %%mm2                                \n\t"
95
                "paddb %%mm2, %%mm0                                \n\t"
96 9c9e467d Michael Niedermayer
                
97 cb482d25 Michael Niedermayer
                "leal (%%eax, %3, 4), %%eax                        \n\t"
98 3057fa66 Arpi
99 cb482d25 Michael Niedermayer
                "movq (%2, %3, 4), %%mm2                        \n\t"
100
                PMAXUB(%%mm2, %%mm4)
101
                PMINUB(%%mm2, %%mm3, %%mm5)
102 3057fa66 Arpi
                "psubb %%mm2, %%mm1                                \n\t"
103
                "paddb %%mm7, %%mm1                                \n\t"
104
                "pcmpgtb %%mm6, %%mm1                                \n\t"
105
                "paddb %%mm1, %%mm0                                \n\t"
106
107 9c9e467d Michael Niedermayer
                "movq (%%eax), %%mm1                                \n\t"
108 cb482d25 Michael Niedermayer
                PMAXUB(%%mm1, %%mm4)
109
                PMINUB(%%mm1, %%mm3, %%mm5)
110 3057fa66 Arpi
                "psubb %%mm1, %%mm2                                \n\t"
111
                "paddb %%mm7, %%mm2                                \n\t"
112
                "pcmpgtb %%mm6, %%mm2                                \n\t"
113
                "paddb %%mm2, %%mm0                                \n\t"
114
115 cb482d25 Michael Niedermayer
                "movq (%%eax, %3), %%mm2                        \n\t"
116
                PMAXUB(%%mm2, %%mm4)
117
                PMINUB(%%mm2, %%mm3, %%mm5)
118 3057fa66 Arpi
                "psubb %%mm2, %%mm1                                \n\t"
119
                "paddb %%mm7, %%mm1                                \n\t"
120
                "pcmpgtb %%mm6, %%mm1                                \n\t"
121
                "paddb %%mm1, %%mm0                                \n\t"
122
123 cb482d25 Michael Niedermayer
                "movq (%%eax, %3, 2), %%mm1                        \n\t"
124
                PMAXUB(%%mm1, %%mm4)
125
                PMINUB(%%mm1, %%mm3, %%mm5)
126 3057fa66 Arpi
                "psubb %%mm1, %%mm2                                \n\t"
127
                "paddb %%mm7, %%mm2                                \n\t"
128
                "pcmpgtb %%mm6, %%mm2                                \n\t"
129
                "paddb %%mm2, %%mm0                                \n\t"
130 cb482d25 Michael Niedermayer
                "psubusb %%mm3, %%mm4                                \n\t" // mm4 = per-column (max - min)
131 3057fa66 Arpi
132
                "                                                \n\t"
133 cd38e322 Michael Niedermayer
// Add the 8 per-column byte counters of mm0 together (result in the low byte).
#ifdef HAVE_MMX2
134
                "pxor %%mm7, %%mm7                                \n\t"
135
                "psadbw %%mm7, %%mm0                                \n\t"
136
#else
137 3057fa66 Arpi
                "movq %%mm0, %%mm1                                \n\t"
138
                "psrlw $8, %%mm0                                \n\t"
139
                "paddb %%mm1, %%mm0                                \n\t"
140
                "movq %%mm0, %%mm1                                \n\t"
141
                "psrlq $16, %%mm0                                \n\t"
142
                "paddb %%mm1, %%mm0                                \n\t"
143
                "movq %%mm0, %%mm1                                \n\t"
144
                "psrlq $32, %%mm0                                \n\t"
145
                "paddb %%mm1, %%mm0                                \n\t"
146 cd38e322 Michael Niedermayer
#endif
147 1e79606d Michael Niedermayer
                "movq %4, %%mm7                                        \n\t" // QP,..., QP
148 cb482d25 Michael Niedermayer
                "paddusb %%mm7, %%mm7                                \n\t" // 2QP ... 2QP
149
                "psubusb %%mm7, %%mm4                                \n\t" // Diff <= 2QP -> 0
150
                "packssdw %%mm4, %%mm4                                \n\t" // squeeze both halves into the low 32 bits
151 3057fa66 Arpi
                "movd %%mm0, %0                                        \n\t"
152 cb482d25 Michael Niedermayer
                "movd %%mm4, %1                                        \n\t"
153
154
                : "=r" (numEq), "=r" (dcOk)
155 1e79606d Michael Niedermayer
                : "r" (src), "r" (stride), "m" (c->pQPb)
156 9c9e467d Michael Niedermayer
                : "%eax"
157 3057fa66 Arpi
                );
158 cb482d25 Michael Niedermayer
159 cd38e322 Michael Niedermayer
        // every hit added -1 to a byte counter, so negate and keep the low
        // byte to obtain the number of hits
        numEq= (-numEq) &0xFF;
160 cb482d25 Michael Niedermayer
        if(numEq > c->ppMode.flatnessThreshold){
161
            // dcOk is non-zero when some column's range exceeded 2*QP
            if(dcOk) return 0;
162
            else     return 1;
163
        }else{
164
            return 2;
165
        }
166 3057fa66 Arpi
}
167 9c9e467d Michael Niedermayer
#endif
168 3057fa66 Arpi
169
/**
170 acced553 Michael Niedermayer
 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
171 a6be8111 Michael Niedermayer
 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
 *
 * Both code paths first advance src by 3*stride and then overwrite the eight
 * rows at offsets 1*stride .. 8*stride from that point. The rows at offsets 0
 * and 9*stride are only read: they decide which value is used beyond the top
 * ("first") and bottom ("last") edge of the filtered area.
 *
 * MMX2/3DNow path: 8 columns at once, the filter is approximated with cascaded
 * PAVGB averages (the digits in the trailing comments are the accumulated
 * tap weights); QP is read from c->pQPb.
 * C path: BLOCK_SIZE columns, exact integer arithmetic with rounding (+8 >>4);
 * QP is read from c->QP.
 *
 * @param src    top of the 8x16 block
 * @param stride distance in bytes between two rows
 * @param c      context providing pQPb (asm path) or QP (C path)
 *
 * NOTE(review): the asm path selects the outer row when |diff| <= QP, the C
 * path when |diff| < QP -- confirm whether the difference is intended.
 * NOTE(review): the asm statement temporarily modifies input operand %0
 * (addl ... subl) and stores to memory without a "memory" clobber; check this
 * against the GCC extended-asm rules before touching the operand lists.
172 3057fa66 Arpi
 */
173 9c9e467d Michael Niedermayer
static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
174 3057fa66 Arpi
{
175 13e00528 Arpi
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
176 acced553 Michael Niedermayer
        src+= stride*3;
177 3057fa66 Arpi
        asm volatile(        //"movv %0 %1 %2\n\t"
178 9c9e467d Michael Niedermayer
                "movq %2, %%mm0                        \n\t"  // QP,..., QP
179
                "pxor %%mm4, %%mm4                                \n\t"
180 3057fa66 Arpi
181
                "movq (%0), %%mm6                                \n\t"
182
                "movq (%0, %1), %%mm5                                \n\t"
183
                "movq %%mm5, %%mm1                                \n\t"
184
                "movq %%mm6, %%mm2                                \n\t"
185
                "psubusb %%mm6, %%mm5                                \n\t"
186
                "psubusb %%mm1, %%mm2                                \n\t"
187
                "por %%mm5, %%mm2                                \n\t" // ABS Diff of lines
188
                "psubusb %%mm0, %%mm2                                \n\t" // diff <= QP -> 0
189 9c9e467d Michael Niedermayer
                "pcmpeqb %%mm4, %%mm2                        \n\t" // diff <= QP -> FF
190 3057fa66 Arpi
191
                // per byte: mm6 = mask ? row 0 : row 1
                "pand %%mm2, %%mm6                                \n\t"
192
                "pandn %%mm1, %%mm2                                \n\t"
193
                "por %%mm2, %%mm6                                \n\t"// First Line to Filter
194
195
                "movq (%0, %1, 8), %%mm5                        \n\t"
196
                "leal (%0, %1, 4), %%eax                        \n\t"
197 9c9e467d Michael Niedermayer
                "leal (%0, %1, 8), %%ecx                        \n\t"
198
                "subl %1, %%ecx                                        \n\t"
199 3057fa66 Arpi
                "addl %1, %0                                        \n\t" // %0 points to line 1 not 0
200
                "movq (%0, %1, 8), %%mm7                        \n\t"
201
                "movq %%mm5, %%mm1                                \n\t"
202
                "movq %%mm7, %%mm2                                \n\t"
203
                "psubusb %%mm7, %%mm5                                \n\t"
204
                "psubusb %%mm1, %%mm2                                \n\t"
205
                "por %%mm5, %%mm2                                \n\t" // ABS Diff of lines
206
                "psubusb %%mm0, %%mm2                                \n\t" // diff <= QP -> 0
207 9c9e467d Michael Niedermayer
                "pcmpeqb %%mm4, %%mm2                        \n\t" // diff <= QP -> FF
208 3057fa66 Arpi
209
                // per byte: mm7 = mask ? row 9 : row 8
                "pand %%mm2, %%mm7                                \n\t"
210
                "pandn %%mm1, %%mm2                                \n\t"
211
                "por %%mm2, %%mm7                                \n\t" // Last Line to Filter (edge value, "9" in the notes below)
212
213
214
                //         1        2        3        4        5        6        7        8
215 9c9e467d Michael Niedermayer
                //        %0        %0+%1        %0+2%1        eax        %0+4%1        eax+2%1        ecx        eax+4%1
216 3057fa66 Arpi
                // 6 4 2 2 1 1
217
                // 6 4 4 2
218
                // 6 8 2
219 acced553 Michael Niedermayer
220 3057fa66 Arpi
                "movq (%0, %1), %%mm0                                \n\t" //  1
221
                "movq %%mm0, %%mm1                                \n\t" //  1
222 13e00528 Arpi
                PAVGB(%%mm6, %%mm0)                                      //1 1        /2
223
                PAVGB(%%mm6, %%mm0)                                      //3 1        /4
224 3057fa66 Arpi
225
                "movq (%0, %1, 4), %%mm2                        \n\t" //     1
226
                "movq %%mm2, %%mm5                                \n\t" //     1
227 13e00528 Arpi
                PAVGB((%%eax), %%mm2)                                      //    11        /2
228
                PAVGB((%0, %1, 2), %%mm2)                              //   211        /4
229 3057fa66 Arpi
                "movq %%mm2, %%mm3                                \n\t" //   211        /4
230
                "movq (%0), %%mm4                                \n\t" // 1
231 13e00528 Arpi
                PAVGB(%%mm4, %%mm3)                                      // 4 211        /8
232
                PAVGB(%%mm0, %%mm3)                                      //642211        /16
233 3057fa66 Arpi
                "movq %%mm3, (%0)                                \n\t" // X
234
                // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
235
                "movq %%mm1, %%mm0                                \n\t" //  1
236 13e00528 Arpi
                PAVGB(%%mm6, %%mm0)                                      //1 1        /2
237 3057fa66 Arpi
                "movq %%mm4, %%mm3                                \n\t" // 1
238 13e00528 Arpi
                PAVGB((%0,%1,2), %%mm3)                                      // 1 1        /2
239
                PAVGB((%%eax,%1,2), %%mm5)                              //     11        /2
240
                PAVGB((%%eax), %%mm5)                                      //    211 /4
241
                PAVGB(%%mm5, %%mm3)                                      // 2 2211 /8
242
                PAVGB(%%mm0, %%mm3)                                      //4242211 /16
243 3057fa66 Arpi
                "movq %%mm3, (%0,%1)                                \n\t" //  X
244
                // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
245 13e00528 Arpi
                PAVGB(%%mm4, %%mm6)                                      //11        /2
246 9c9e467d Michael Niedermayer
                "movq (%%ecx), %%mm0                                \n\t" //       1
247 13e00528 Arpi
                PAVGB((%%eax, %1, 2), %%mm0)                              //      11/2
248 3057fa66 Arpi
                "movq %%mm0, %%mm3                                \n\t" //      11/2
249 13e00528 Arpi
                PAVGB(%%mm1, %%mm0)                                      //  2   11/4
250
                PAVGB(%%mm6, %%mm0)                                      //222   11/8
251
                PAVGB(%%mm2, %%mm0)                                      //22242211/16
252 3057fa66 Arpi
                "movq (%0, %1, 2), %%mm2                        \n\t" //   1
253
                "movq %%mm0, (%0, %1, 2)                        \n\t" //   X
254
                // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
255
                "movq (%%eax, %1, 4), %%mm0                        \n\t" //        1
256 9c9e467d Michael Niedermayer
                PAVGB((%%ecx), %%mm0)                                      //       11        /2
257 13e00528 Arpi
                PAVGB(%%mm0, %%mm6)                                      //11     11        /4
258
                PAVGB(%%mm1, %%mm4)                                      // 11                /2
259
                PAVGB(%%mm2, %%mm1)                                      //  11                /2
260
                PAVGB(%%mm1, %%mm6)                                      //1122   11        /8
261
                PAVGB(%%mm5, %%mm6)                                      //112242211        /16
262 3057fa66 Arpi
                "movq (%%eax), %%mm5                                \n\t" //    1
263
                "movq %%mm6, (%%eax)                                \n\t" //    X
264
                // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
265
                "movq (%%eax, %1, 4), %%mm6                        \n\t" //        1
266 13e00528 Arpi
                PAVGB(%%mm7, %%mm6)                                      //        11        /2
267
                PAVGB(%%mm4, %%mm6)                                      // 11     11        /4
268
                PAVGB(%%mm3, %%mm6)                                      // 11   2211        /8
269
                PAVGB(%%mm5, %%mm2)                                      //   11                /2
270 3057fa66 Arpi
                "movq (%0, %1, 4), %%mm4                        \n\t" //     1
271 13e00528 Arpi
                PAVGB(%%mm4, %%mm2)                                      //   112                /4
272
                PAVGB(%%mm2, %%mm6)                                      // 112242211        /16
273 3057fa66 Arpi
                "movq %%mm6, (%0, %1, 4)                        \n\t" //     X
274
                // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
275 13e00528 Arpi
                PAVGB(%%mm7, %%mm1)                                      //  11     2        /4
276
                PAVGB(%%mm4, %%mm5)                                      //    11                /2
277
                PAVGB(%%mm5, %%mm0)                                      //    11 11        /4
278 3057fa66 Arpi
                "movq (%%eax, %1, 2), %%mm6                        \n\t" //      1
279 13e00528 Arpi
                PAVGB(%%mm6, %%mm1)                                      //  11  4  2        /8
280
                PAVGB(%%mm0, %%mm1)                                      //  11224222        /16
281 3057fa66 Arpi
                "movq %%mm1, (%%eax, %1, 2)                        \n\t" //      X
282
                // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
283 9c9e467d Michael Niedermayer
                PAVGB((%%ecx), %%mm2)                                      //   112 4        /8
284 3057fa66 Arpi
                "movq (%%eax, %1, 4), %%mm0                        \n\t" //        1
285 13e00528 Arpi
                PAVGB(%%mm0, %%mm6)                                      //      1 1        /2
286
                PAVGB(%%mm7, %%mm6)                                      //      1 12        /4
287
                PAVGB(%%mm2, %%mm6)                                      //   1122424        /4
288 9c9e467d Michael Niedermayer
                "movq %%mm6, (%%ecx)                                \n\t" //       X
289 3057fa66 Arpi
                // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
290 13e00528 Arpi
                PAVGB(%%mm7, %%mm5)                                      //    11   2        /4
291
                PAVGB(%%mm7, %%mm5)                                      //    11   6        /8
292 3057fa66 Arpi
293 13e00528 Arpi
                PAVGB(%%mm3, %%mm0)                                      //      112        /4
294
                PAVGB(%%mm0, %%mm5)                                      //    112246        /16
295 3057fa66 Arpi
                "movq %%mm5, (%%eax, %1, 4)                        \n\t" //        X
296 8405b3fd Michael Niedermayer
                "subl %1, %0                                        \n\t" // undo the earlier addl so %0 holds src again
297 3057fa66 Arpi
298
                :
299 9c9e467d Michael Niedermayer
                : "r" (src), "r" (stride), "m" (c->pQPb)
300
                : "%eax", "%ecx"
301 3057fa66 Arpi
        );
302
#else
303
        // byte offsets of rows 1..9 relative to src
        const int l1= stride;
304
        const int l2= stride + l1;
305
        const int l3= stride + l2;
306
        const int l4= stride + l3;
307
        const int l5= stride + l4;
308
        const int l6= stride + l5;
309
        const int l7= stride + l6;
310
        const int l8= stride + l7;
311
        const int l9= stride + l8;
312 d5a1a995 Michael Niedermayer
        int x;
313 acced553 Michael Niedermayer
        src+= stride*3;
314 d5a1a995 Michael Niedermayer
        for(x=0; x<BLOCK_SIZE; x++)
315 3057fa66 Arpi
        {
316 9c9e467d Michael Niedermayer
                // edge values: take the outer row only if it is close to its inner neighbour
                const int first= ABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
317
                const int last= ABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
318 3057fa66 Arpi
319
                // sums of vertically adjacent samples, shared between the output rows
                int sums[9];
320
                sums[0] = first + src[l1];
321
                sums[1] = src[l1] + src[l2];
322
                sums[2] = src[l2] + src[l3];
323
                sums[3] = src[l3] + src[l4];
324
                sums[4] = src[l4] + src[l5];
325
                sums[5] = src[l5] + src[l6];
326
                sums[6] = src[l6] + src[l7];
327
                sums[7] = src[l7] + src[l8];
328
                sums[8] = src[l8] + last;
329
330
                // each output row: weighted sum, +8 for rounding, divided by 16
                src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
331 e5c30e06 Michael Niedermayer
                src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
332
                src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
333
                src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
334
                src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
335
                src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
336
                src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
337
                src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
338 3057fa66 Arpi
339
                src++; // next column
340
        }
341
#endif
342
}
343
344 9c9e467d Michael Niedermayer
#if 0
345 13e00528 Arpi
/**
346
 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
347
 * values are correctly clipped (MMX2)
348
 * values are wraparound (C)
349
 * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient
350
        0 8 16 24
351
        x = 8
352
        x/2 = 4
353
        x/8 = 1
354
        1 12 12 23
 *
 * This function sits inside an "#if 0" region, i.e. it is currently not compiled.
 *
 * After advancing src by 3*stride, rows 4 and 5 (offsets 4*stride and
 * 5*stride) are compared: where |row5 - row4| is below roughly 1.25*QP,
 * half of the difference is moved between rows 4 and 5 and an eighth of it
 * between rows 3 and 6.
 *
 * @param src    top of the 8x16 block
 * @param stride distance in bytes between two rows
 * @param QP     quantizer; only the C path uses this parameter, the asm path
 *               reads the symbol pQPb through MANGLE() instead
355
 */
356 cc9b0679 Michael Niedermayer
static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
357 13e00528 Arpi
{
358 d5a1a995 Michael Niedermayer
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
359 acced553 Michael Niedermayer
        src+= stride*3;
360 13e00528 Arpi
// FIXME rounding
361
        asm volatile(
362
                "pxor %%mm7, %%mm7                                \n\t" // 0
363 9b464428 Felix Bünemann
                "movq "MANGLE(b80)", %%mm6                        \n\t" // MIN_SIGNED_BYTE
364 13e00528 Arpi
                "leal (%0, %1), %%eax                                \n\t"
365 9c9e467d Michael Niedermayer
                "leal (%%eax, %1, 4), %%ecx                        \n\t"
366 13e00528 Arpi
//        0        1        2        3        4        5        6        7        8        9
367 9c9e467d Michael Niedermayer
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ecx        ecx+%1        ecx+2%1        %0+8%1        ecx+4%1
368 9b464428 Felix Bünemann
                "movq "MANGLE(pQPb)", %%mm0                        \n\t" // QP,..., QP
369 13e00528 Arpi
                "movq %%mm0, %%mm1                                \n\t" // QP,..., QP
370 9b464428 Felix Bünemann
                "paddusb "MANGLE(b02)", %%mm0                        \n\t"
371 13e00528 Arpi
                "psrlw $2, %%mm0                                \n\t"
372 9b464428 Felix Bünemann
                "pand "MANGLE(b3F)", %%mm0                        \n\t" // QP/4,..., QP/4
373 13e00528 Arpi
                "paddusb %%mm1, %%mm0                                \n\t" // QP*1.25 ...
374
                "movq (%0, %1, 4), %%mm2                        \n\t" // line 4
375 9c9e467d Michael Niedermayer
                "movq (%%ecx), %%mm3                                \n\t" // line 5
376 13e00528 Arpi
                "movq %%mm2, %%mm4                                \n\t" // line 4
377
                "pcmpeqb %%mm5, %%mm5                                \n\t" // -1
378
                "pxor %%mm2, %%mm5                                \n\t" // -line 4 - 1
379 d5a1a995 Michael Niedermayer
                PAVGB(%%mm3, %%mm5)
380 13e00528 Arpi
                "paddb %%mm6, %%mm5                                \n\t" // (l5-l4)/2
381
                "psubusb %%mm3, %%mm4                                \n\t"
382
                "psubusb %%mm2, %%mm3                                \n\t"
383
                "por %%mm3, %%mm4                                \n\t" // |l4 - l5|
384
                "psubusb %%mm0, %%mm4                                \n\t"
385
                "pcmpeqb %%mm7, %%mm4                                \n\t"
386
                "pand %%mm4, %%mm5                                \n\t" // d/2
387

388
//                "paddb %%mm6, %%mm2                                \n\t" // line 4 + 0x80
389
                "paddb %%mm5, %%mm2                                \n\t"
390
//                "psubb %%mm6, %%mm2                                \n\t"
391
                "movq %%mm2, (%0,%1, 4)                                \n\t"
392

393 9c9e467d Michael Niedermayer
                "movq (%%ecx), %%mm2                                \n\t"
394 13e00528 Arpi
//                "paddb %%mm6, %%mm2                                \n\t" // line 5 + 0x80
395
                "psubb %%mm5, %%mm2                                \n\t"
396
//                "psubb %%mm6, %%mm2                                \n\t"
397 9c9e467d Michael Niedermayer
                "movq %%mm2, (%%ecx)                                \n\t"
398 13e00528 Arpi

399
                "paddb %%mm6, %%mm5                                \n\t"
400
                "psrlw $2, %%mm5                                \n\t"
401 9b464428 Felix Bünemann
                "pand "MANGLE(b3F)", %%mm5                        \n\t"
402
                "psubb "MANGLE(b20)", %%mm5                        \n\t" // (l5-l4)/8
403 13e00528 Arpi

404
                "movq (%%eax, %1, 2), %%mm2                        \n\t"
405
                "paddb %%mm6, %%mm2                                \n\t" // line 3 + 0x80
406
                "paddsb %%mm5, %%mm2                                \n\t"
407
                "psubb %%mm6, %%mm2                                \n\t"
408
                "movq %%mm2, (%%eax, %1, 2)                        \n\t"
409

410 9c9e467d Michael Niedermayer
                "movq (%%ecx, %1), %%mm2                        \n\t"
411 13e00528 Arpi
                "paddb %%mm6, %%mm2                                \n\t" // line 6 + 0x80
412
                "psubsb %%mm5, %%mm2                                \n\t"
413
                "psubb %%mm6, %%mm2                                \n\t"
414 9c9e467d Michael Niedermayer
                "movq %%mm2, (%%ecx, %1)                        \n\t"
415 13e00528 Arpi

416
                :
417
                : "r" (src), "r" (stride)
418 9c9e467d Michael Niedermayer
                : "%eax", "%ecx"
419 13e00528 Arpi
        );
420
#else
421
         // byte offsets of rows 1..6 relative to src
         const int l1= stride;
422
        const int l2= stride + l1;
423
        const int l3= stride + l2;
424
        const int l4= stride + l3;
425
        const int l5= stride + l4;
426
        const int l6= stride + l5;
427 e5c30e06 Michael Niedermayer
//        const int l7= stride + l6;
428
//        const int l8= stride + l7;
429
//        const int l9= stride + l8;
430 d5a1a995 Michael Niedermayer
        int x;
431 3407a972 Michael Niedermayer
        // despite the name this is QP*1.25 (QP + QP/4), matching the asm path
        const int QP15= QP + (QP>>2);
432 acced553 Michael Niedermayer
        src+= stride*3;
433 d5a1a995 Michael Niedermayer
        for(x=0; x<BLOCK_SIZE; x++)
434 13e00528 Arpi
        {
435 3407a972 Michael Niedermayer
                // step across the boundary between rows 4 and 5
                const int v = (src[x+l5] - src[x+l4]);
436
                if(ABS(v) < QP15)
437 13e00528 Arpi
                {
438 3407a972 Michael Niedermayer
                        // no clipping here: the uint8_t stores wrap around
                        src[x+l3] +=v>>3;
439
                        src[x+l4] +=v>>1;
440
                        src[x+l5] -=v>>1;
441
                        src[x+l6] -=v>>3;
442 13e00528 Arpi
443
                }
444
        }
445
446
#endif
447
}
448 9c9e467d Michael Niedermayer
#endif
449 13e00528 Arpi
450
/**
451
 * Experimental Filter 1
452 9f45d04d Michael Niedermayer
 * will not damage linear gradients
453
 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
454 d5a1a995 Michael Niedermayer
 * can only smooth blocks at the expected locations (it can't smooth them if they did move)
455
 * MMX2 version does correct clipping, C version doesn't
456 13e00528 Arpi
 */
457 9c9e467d Michael Niedermayer
static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
458 13e00528 Arpi
{
459 d5a1a995 Michael Niedermayer
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
460 acced553 Michael Niedermayer
        src+= stride*3;
461
462 13e00528 Arpi
        asm volatile(
463 d5a1a995 Michael Niedermayer
                "pxor %%mm7, %%mm7                                \n\t" // 0
464
                "leal (%0, %1), %%eax                                \n\t"
465 9c9e467d Michael Niedermayer
                "leal (%%eax, %1, 4), %%ecx                        \n\t"
466 d5a1a995 Michael Niedermayer
//        0        1        2        3        4        5        6        7        8        9
467 9c9e467d Michael Niedermayer
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ecx        ecx+%1        ecx+2%1        %0+8%1        ecx+4%1
468 d5a1a995 Michael Niedermayer
                "movq (%%eax, %1, 2), %%mm0                        \n\t" // line 3
469
                "movq (%0, %1, 4), %%mm1                        \n\t" // line 4
470
                "movq %%mm1, %%mm2                                \n\t" // line 4
471
                "psubusb %%mm0, %%mm1                                \n\t"
472
                "psubusb %%mm2, %%mm0                                \n\t"
473
                "por %%mm1, %%mm0                                \n\t" // |l2 - l3|
474 9c9e467d Michael Niedermayer
                "movq (%%ecx), %%mm3                                \n\t" // line 5
475
                "movq (%%ecx, %1), %%mm4                        \n\t" // line 6
476 d5a1a995 Michael Niedermayer
                "movq %%mm3, %%mm5                                \n\t" // line 5
477
                "psubusb %%mm4, %%mm3                                \n\t"
478
                "psubusb %%mm5, %%mm4                                \n\t"
479
                "por %%mm4, %%mm3                                \n\t" // |l5 - l6|
480
                PAVGB(%%mm3, %%mm0)                                      // (|l2 - l3| + |l5 - l6|)/2
481
                "movq %%mm2, %%mm1                                \n\t" // line 4
482
                "psubusb %%mm5, %%mm2                                \n\t"
483
                "movq %%mm2, %%mm4                                \n\t"
484
                "pcmpeqb %%mm7, %%mm2                                \n\t" // (l4 - l5) <= 0 ? -1 : 0
485
                "psubusb %%mm1, %%mm5                                \n\t"
486
                "por %%mm5, %%mm4                                \n\t" // |l4 - l5|
487
                "psubusb %%mm0, %%mm4                \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
488
                "movq %%mm4, %%mm3                                \n\t" // d
489 9c9e467d Michael Niedermayer
                "movq %2, %%mm0                        \n\t"
490 dc16b332 Michael Niedermayer
                "paddusb %%mm0, %%mm0                                \n\t"
491
                "psubusb %%mm0, %%mm4                                \n\t"
492 d5a1a995 Michael Niedermayer
                "pcmpeqb %%mm7, %%mm4                                \n\t" // d <= QP ? -1 : 0
493 9b464428 Felix Bünemann
                "psubusb "MANGLE(b01)", %%mm3                        \n\t"
494 d5a1a995 Michael Niedermayer
                "pand %%mm4, %%mm3                                \n\t" // d <= QP ? d : 0
495
496
                PAVGB(%%mm7, %%mm3)                                      // d/2
497 9f45d04d Michael Niedermayer
                "movq %%mm3, %%mm1                                \n\t" // d/2
498
                PAVGB(%%mm7, %%mm3)                                      // d/4
499
                PAVGB(%%mm1, %%mm3)                                      // 3*d/8
500 d5a1a995 Michael Niedermayer
501
                "movq (%0, %1, 4), %%mm0                        \n\t" // line 4
502
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
503
                "psubusb %%mm3, %%mm0                                \n\t"
504
                "pxor %%mm2, %%mm0                                \n\t"
505
                "movq %%mm0, (%0, %1, 4)                        \n\t" // line 4
506
507 9c9e467d Michael Niedermayer
                "movq (%%ecx), %%mm0                                \n\t" // line 5
508 d5a1a995 Michael Niedermayer
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
509
                "paddusb %%mm3, %%mm0                                \n\t"
510
                "pxor %%mm2, %%mm0                                \n\t"
511 9c9e467d Michael Niedermayer
                "movq %%mm0, (%%ecx)                                \n\t" // line 5
512 d5a1a995 Michael Niedermayer
513 9f45d04d Michael Niedermayer
                PAVGB(%%mm7, %%mm1)                                      // d/4
514 d5a1a995 Michael Niedermayer
515
                "movq (%%eax, %1, 2), %%mm0                        \n\t" // line 3
516
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
517 9f45d04d Michael Niedermayer
                "psubusb %%mm1, %%mm0                                \n\t"
518 d5a1a995 Michael Niedermayer
                "pxor %%mm2, %%mm0                                \n\t"
519
                "movq %%mm0, (%%eax, %1, 2)                        \n\t" // line 3
520
521 9c9e467d Michael Niedermayer
                "movq (%%ecx, %1), %%mm0                        \n\t" // line 6
522 d5a1a995 Michael Niedermayer
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
523 9f45d04d Michael Niedermayer
                "paddusb %%mm1, %%mm0                                \n\t"
524 d5a1a995 Michael Niedermayer
                "pxor %%mm2, %%mm0                                \n\t"
525 9c9e467d Michael Niedermayer
                "movq %%mm0, (%%ecx, %1)                        \n\t" // line 6
526 d5a1a995 Michael Niedermayer
527 9f45d04d Michael Niedermayer
                PAVGB(%%mm7, %%mm1)                                      // d/8
528 d5a1a995 Michael Niedermayer
529
                "movq (%%eax, %1), %%mm0                        \n\t" // line 2
530
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
531 9f45d04d Michael Niedermayer
                "psubusb %%mm1, %%mm0                                \n\t"
532 d5a1a995 Michael Niedermayer
                "pxor %%mm2, %%mm0                                \n\t"
533
                "movq %%mm0, (%%eax, %1)                        \n\t" // line 2
534
535 9c9e467d Michael Niedermayer
                "movq (%%ecx, %1, 2), %%mm0                        \n\t" // line 7
536 d5a1a995 Michael Niedermayer
                "pxor %%mm2, %%mm0                                \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
537 9f45d04d Michael Niedermayer
                "paddusb %%mm1, %%mm0                                \n\t"
538 d5a1a995 Michael Niedermayer
                "pxor %%mm2, %%mm0                                \n\t"
539 9c9e467d Michael Niedermayer
                "movq %%mm0, (%%ecx, %1, 2)                        \n\t" // line 7
540 13e00528 Arpi
541
                :
542 9c9e467d Michael Niedermayer
                : "r" (src), "r" (stride), "m" (co->pQPb)
543
                : "%eax", "%ecx"
544 13e00528 Arpi
        );
545
#else
546 d5a1a995 Michael Niedermayer
547
         const int l1= stride;
548
        const int l2= stride + l1;
549
        const int l3= stride + l2;
550
        const int l4= stride + l3;
551
        const int l5= stride + l4;
552
        const int l6= stride + l5;
553
        const int l7= stride + l6;
554 e5c30e06 Michael Niedermayer
//        const int l8= stride + l7;
555
//        const int l9= stride + l8;
556 d5a1a995 Michael Niedermayer
        int x;
557 acced553 Michael Niedermayer
558
        src+= stride*3;
559 d5a1a995 Michael Niedermayer
        for(x=0; x<BLOCK_SIZE; x++)
560
        {
561
                int a= src[l3] - src[l4];
562
                int b= src[l4] - src[l5];
563 9f45d04d Michael Niedermayer
                int c= src[l5] - src[l6];
564 d5a1a995 Michael Niedermayer
565 3407a972 Michael Niedermayer
                int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
566
                d= MAX(d, 0);
567 d5a1a995 Michael Niedermayer
568 9c9e467d Michael Niedermayer
                if(d < co->QP*2)
569 d5a1a995 Michael Niedermayer
                {
570
                        int v = d * SIGN(-b);
571
572 3407a972 Michael Niedermayer
                        src[l2] +=v>>3;
573
                        src[l3] +=v>>2;
574
                        src[l4] +=(3*v)>>3;
575
                        src[l5] -=(3*v)>>3;
576
                        src[l6] -=v>>2;
577
                        src[l7] -=v>>3;
578 d5a1a995 Michael Niedermayer
579
                }
580
                src++;
581
        }
582 13e00528 Arpi
#endif
583
}
584
585 9c9e467d Michael Niedermayer
static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
586 3057fa66 Arpi
{
587 7f16f6e6 Michael Niedermayer
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
588
/*
589
        uint8_t tmp[16];
590
        const int l1= stride;
591
        const int l2= stride + l1;
592
        const int l3= stride + l2;
593
        const int l4= (int)tmp - (int)src - stride*3;
594
        const int l5= (int)tmp - (int)src - stride*3 + 8;
595
        const int l6= stride*3 + l3;
596
        const int l7= stride + l6;
597
        const int l8= stride + l7;
598

599
        memcpy(tmp, src+stride*7, 8);
600
        memcpy(tmp+8, src+stride*8, 8);
601
*/
602
        src+= stride*4;
603
        asm volatile(
604
605
#if 0 //sligtly more accurate and slightly slower
606
                "pxor %%mm7, %%mm7                                \n\t" // 0
607
                "leal (%0, %1), %%eax                                \n\t"
608 9c9e467d Michael Niedermayer
                "leal (%%eax, %1, 4), %%ecx                        \n\t"
609 7f16f6e6 Michael Niedermayer
//        0        1        2        3        4        5        6        7
610 9c9e467d Michael Niedermayer
//        %0        %0+%1        %0+2%1        eax+2%1        %0+4%1        eax+4%1        ecx+%1        ecx+2%1
611
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ecx        ecx+%1        ecx+2%1
612 7f16f6e6 Michael Niedermayer

613

614
                "movq (%0, %1, 2), %%mm0                        \n\t" // l2
615
                "movq (%0), %%mm1                                \n\t" // l0
616
                "movq %%mm0, %%mm2                                \n\t" // l2
617
                PAVGB(%%mm7, %%mm0)                                      // ~l2/2
618
                PAVGB(%%mm1, %%mm0)                                      // ~(l2 + 2l0)/4
619
                PAVGB(%%mm2, %%mm0)                                      // ~(5l2 + 2l0)/8
620

621
                "movq (%%eax), %%mm1                                \n\t" // l1
622
                "movq (%%eax, %1, 2), %%mm3                        \n\t" // l3
623
                "movq %%mm1, %%mm4                                \n\t" // l1
624
                PAVGB(%%mm7, %%mm1)                                      // ~l1/2
625
                PAVGB(%%mm3, %%mm1)                                      // ~(l1 + 2l3)/4
626
                PAVGB(%%mm4, %%mm1)                                      // ~(5l1 + 2l3)/8
627

628
                "movq %%mm0, %%mm4                                \n\t" // ~(5l2 + 2l0)/8
629
                "psubusb %%mm1, %%mm0                                \n\t"
630
                "psubusb %%mm4, %%mm1                                \n\t"
631
                "por %%mm0, %%mm1                                \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
632
// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
633

634
                "movq (%0, %1, 4), %%mm0                        \n\t" // l4
635
                "movq %%mm0, %%mm4                                \n\t" // l4
636
                PAVGB(%%mm7, %%mm0)                                      // ~l4/2
637
                PAVGB(%%mm2, %%mm0)                                      // ~(l4 + 2l2)/4
638
                PAVGB(%%mm4, %%mm0)                                      // ~(5l4 + 2l2)/8
639

640 9c9e467d Michael Niedermayer
                "movq (%%ecx), %%mm2                                \n\t" // l5
641 7f16f6e6 Michael Niedermayer
                "movq %%mm3, %%mm5                                \n\t" // l3
642
                PAVGB(%%mm7, %%mm3)                                      // ~l3/2
643
                PAVGB(%%mm2, %%mm3)                                      // ~(l3 + 2l5)/4
644
                PAVGB(%%mm5, %%mm3)                                      // ~(5l3 + 2l5)/8
645

646
                "movq %%mm0, %%mm6                                \n\t" // ~(5l4 + 2l2)/8
647
                "psubusb %%mm3, %%mm0                                \n\t"
648
                "psubusb %%mm6, %%mm3                                \n\t"
649
                "por %%mm0, %%mm3                                \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
650
                "pcmpeqb %%mm7, %%mm0                                \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
651
// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
652

653 9c9e467d Michael Niedermayer
                "movq (%%ecx, %1), %%mm6                        \n\t" // l6
654 7f16f6e6 Michael Niedermayer
                "movq %%mm6, %%mm5                                \n\t" // l6
655
                PAVGB(%%mm7, %%mm6)                                      // ~l6/2
656
                PAVGB(%%mm4, %%mm6)                                      // ~(l6 + 2l4)/4
657
                PAVGB(%%mm5, %%mm6)                                      // ~(5l6 + 2l4)/8
658

659 9c9e467d Michael Niedermayer
                "movq (%%ecx, %1, 2), %%mm5                        \n\t" // l7
660 7f16f6e6 Michael Niedermayer
                "movq %%mm2, %%mm4                                \n\t" // l5
661
                PAVGB(%%mm7, %%mm2)                                      // ~l5/2
662
                PAVGB(%%mm5, %%mm2)                                      // ~(l5 + 2l7)/4
663
                PAVGB(%%mm4, %%mm2)                                      // ~(5l5 + 2l7)/8
664

665
                "movq %%mm6, %%mm4                                \n\t" // ~(5l6 + 2l4)/8
666
                "psubusb %%mm2, %%mm6                                \n\t"
667
                "psubusb %%mm4, %%mm2                                \n\t"
668
                "por %%mm6, %%mm2                                \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
669
// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
670

671

672
                PMINUB(%%mm2, %%mm1, %%mm4)                              // MIN(|lenergy|,|renergy|)/8
673 9c9e467d Michael Niedermayer
                "movq %2, %%mm4                                        \n\t" // QP //FIXME QP+1 ?
674 9b464428 Felix Bünemann
                "paddusb "MANGLE(b01)", %%mm4                        \n\t"
675 7f16f6e6 Michael Niedermayer
                "pcmpgtb %%mm3, %%mm4                                \n\t" // |menergy|/8 < QP
676
                "psubusb %%mm1, %%mm3                                \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
677
                "pand %%mm4, %%mm3                                \n\t"
678

679
                "movq %%mm3, %%mm1                                \n\t"
680 9b464428 Felix Bünemann
//                "psubusb "MANGLE(b01)", %%mm3                        \n\t"
681 7f16f6e6 Michael Niedermayer
                PAVGB(%%mm7, %%mm3)
682
                PAVGB(%%mm7, %%mm3)
683
                "paddusb %%mm1, %%mm3                                \n\t"
684 9b464428 Felix Bünemann
//                "paddusb "MANGLE(b01)", %%mm3                        \n\t"
685 7f16f6e6 Michael Niedermayer

686
                "movq (%%eax, %1, 2), %%mm6                        \n\t" //l3
687
                "movq (%0, %1, 4), %%mm5                        \n\t" //l4
688
                "movq (%0, %1, 4), %%mm4                        \n\t" //l4
689
                "psubusb %%mm6, %%mm5                                \n\t"
690
                "psubusb %%mm4, %%mm6                                \n\t"
691
                "por %%mm6, %%mm5                                \n\t" // |l3-l4|
692
                "pcmpeqb %%mm7, %%mm6                                \n\t" // SIGN(l3-l4)
693
                "pxor %%mm6, %%mm0                                \n\t"
694
                "pand %%mm0, %%mm3                                \n\t"
695
                PMINUB(%%mm5, %%mm3, %%mm0)
696

697 9b464428 Felix Bünemann
                "psubusb "MANGLE(b01)", %%mm3                        \n\t"
698 7f16f6e6 Michael Niedermayer
                PAVGB(%%mm7, %%mm3)
699

700
                "movq (%%eax, %1, 2), %%mm0                        \n\t"
701
                "movq (%0, %1, 4), %%mm2                        \n\t"
702
                "pxor %%mm6, %%mm0                                \n\t"
703
                "pxor %%mm6, %%mm2                                \n\t"
704
                "psubb %%mm3, %%mm0                                \n\t"
705
                "paddb %%mm3, %%mm2                                \n\t"
706
                "pxor %%mm6, %%mm0                                \n\t"
707
                "pxor %%mm6, %%mm2                                \n\t"
708
                "movq %%mm0, (%%eax, %1, 2)                        \n\t"
709
                "movq %%mm2, (%0, %1, 4)                        \n\t"
710
#endif
711
712
                "leal (%0, %1), %%eax                                \n\t"
713
                "pcmpeqb %%mm6, %%mm6                                \n\t" // -1
714
//        0        1        2        3        4        5        6        7
715 9c9e467d Michael Niedermayer
//        %0        %0+%1        %0+2%1        eax+2%1        %0+4%1        eax+4%1        ecx+%1        ecx+2%1
716
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ecx        ecx+%1        ecx+2%1
717 7f16f6e6 Michael Niedermayer
718
719
                "movq (%%eax, %1, 2), %%mm1                        \n\t" // l3
720
                "movq (%0, %1, 4), %%mm0                        \n\t" // l4
721
                "pxor %%mm6, %%mm1                                \n\t" // -l3-1
722
                PAVGB(%%mm1, %%mm0)                                      // -q+128 = (l4-l3+256)/2
723
// mm1=-l3-1, mm0=128-q
724
725
                "movq (%%eax, %1, 4), %%mm2                        \n\t" // l5
726
                "movq (%%eax, %1), %%mm3                        \n\t" // l2
727
                "pxor %%mm6, %%mm2                                \n\t" // -l5-1
728
                "movq %%mm2, %%mm5                                \n\t" // -l5-1
729 9b464428 Felix Bünemann
                "movq "MANGLE(b80)", %%mm4                        \n\t" // 128
730 9c9e467d Michael Niedermayer
                "leal (%%eax, %1, 4), %%ecx                        \n\t"
731 7f16f6e6 Michael Niedermayer
                PAVGB(%%mm3, %%mm2)                                      // (l2-l5+256)/2
732
                PAVGB(%%mm0, %%mm4)                                      // ~(l4-l3)/4 + 128
733
                PAVGB(%%mm2, %%mm4)                                      // ~(l2-l5)/4 +(l4-l3)/8 + 128
734
                PAVGB(%%mm0, %%mm4)                                      // ~(l2-l5)/8 +5(l4-l3)/16 + 128
735
// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
736
737
                "movq (%%eax), %%mm2                                \n\t" // l1
738
                "pxor %%mm6, %%mm2                                \n\t" // -l1-1
739
                PAVGB(%%mm3, %%mm2)                                      // (l2-l1+256)/2
740
                PAVGB((%0), %%mm1)                                      // (l0-l3+256)/2
741 9b464428 Felix Bünemann
                "movq "MANGLE(b80)", %%mm3                        \n\t" // 128
742 7f16f6e6 Michael Niedermayer
                PAVGB(%%mm2, %%mm3)                                      // ~(l2-l1)/4 + 128
743
                PAVGB(%%mm1, %%mm3)                                      // ~(l0-l3)/4 +(l2-l1)/8 + 128
744
                PAVGB(%%mm2, %%mm3)                                      // ~(l0-l3)/8 +5(l2-l1)/16 + 128
745
// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
746
747 9c9e467d Michael Niedermayer
                PAVGB((%%ecx, %1), %%mm5)                              // (l6-l5+256)/2
748
                "movq (%%ecx, %1, 2), %%mm1                        \n\t" // l7
749 7f16f6e6 Michael Niedermayer
                "pxor %%mm6, %%mm1                                \n\t" // -l7-1
750
                PAVGB((%0, %1, 4), %%mm1)                              // (l4-l7+256)/2
751 9b464428 Felix Bünemann
                "movq "MANGLE(b80)", %%mm2                        \n\t" // 128
752 7f16f6e6 Michael Niedermayer
                PAVGB(%%mm5, %%mm2)                                      // ~(l6-l5)/4 + 128
753
                PAVGB(%%mm1, %%mm2)                                      // ~(l4-l7)/4 +(l6-l5)/8 + 128
754
                PAVGB(%%mm5, %%mm2)                                      // ~(l4-l7)/8 +5(l6-l5)/16 + 128
755
// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
756
757 9b464428 Felix Bünemann
                "movq "MANGLE(b00)", %%mm1                        \n\t" // 0
758
                "movq "MANGLE(b00)", %%mm5                        \n\t" // 0
759 7f16f6e6 Michael Niedermayer
                "psubb %%mm2, %%mm1                                \n\t" // 128 - renergy/16
760
                "psubb %%mm3, %%mm5                                \n\t" // 128 - lenergy/16
761
                PMAXUB(%%mm1, %%mm2)                                      // 128 + |renergy/16|
762
                 PMAXUB(%%mm5, %%mm3)                                      // 128 + |lenergy/16|
763
                PMINUB(%%mm2, %%mm3, %%mm1)                              // 128 + MIN(|lenergy|,|renergy|)/16
764
765
// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
766
767 9b464428 Felix Bünemann
                "movq "MANGLE(b00)", %%mm7                        \n\t" // 0
768 9c9e467d Michael Niedermayer
                "movq %2, %%mm2                                        \n\t" // QP
769 7f16f6e6 Michael Niedermayer
                PAVGB(%%mm6, %%mm2)                                      // 128 + QP/2
770
                "psubb %%mm6, %%mm2                                \n\t"
771
772
                "movq %%mm4, %%mm1                                \n\t"
773
                "pcmpgtb %%mm7, %%mm1                                \n\t" // SIGN(menergy)
774
                "pxor %%mm1, %%mm4                                \n\t"
775
                "psubb %%mm1, %%mm4                                \n\t" // 128 + |menergy|/16
776
                "pcmpgtb %%mm4, %%mm2                                \n\t" // |menergy|/16 < QP/2
777
                "psubusb %%mm3, %%mm4                                \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
778
// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
779
780
                "movq %%mm4, %%mm3                                \n\t" // d
781 9b464428 Felix Bünemann
                "psubusb "MANGLE(b01)", %%mm4                        \n\t"
782 7f16f6e6 Michael Niedermayer
                PAVGB(%%mm7, %%mm4)                                      // d/32
783
                PAVGB(%%mm7, %%mm4)                                      // (d + 32)/64
784
                "paddb %%mm3, %%mm4                                \n\t" // 5d/64
785
                "pand %%mm2, %%mm4                                \n\t"
786
787 9b464428 Felix Bünemann
                "movq "MANGLE(b80)", %%mm5                        \n\t" // 128
788 7f16f6e6 Michael Niedermayer
                "psubb %%mm0, %%mm5                                \n\t" // q
789
                "paddsb %%mm6, %%mm5                                \n\t" // fix bad rounding
790
                "pcmpgtb %%mm5, %%mm7                                \n\t" // SIGN(q)
791
                "pxor %%mm7, %%mm5                                \n\t"
792
793
                PMINUB(%%mm5, %%mm4, %%mm3)                              // MIN(|q|, 5d/64)
794
                "pxor %%mm1, %%mm7                                \n\t" // SIGN(d*q)
795
796
                "pand %%mm7, %%mm4                                \n\t"
797
                "movq (%%eax, %1, 2), %%mm0                        \n\t"
798
                "movq (%0, %1, 4), %%mm2                        \n\t"
799
                "pxor %%mm1, %%mm0                                \n\t"
800
                "pxor %%mm1, %%mm2                                \n\t"
801
                "paddb %%mm4, %%mm0                                \n\t"
802
                "psubb %%mm4, %%mm2                                \n\t"
803
                "pxor %%mm1, %%mm0                                \n\t"
804
                "pxor %%mm1, %%mm2                                \n\t"
805
                "movq %%mm0, (%%eax, %1, 2)                        \n\t"
806
                "movq %%mm2, (%0, %1, 4)                        \n\t"
807
808
                :
809 9c9e467d Michael Niedermayer
                : "r" (src), "r" (stride), "m" (c->pQPb)
810
                : "%eax", "%ecx"
811 7f16f6e6 Michael Niedermayer
        );
812
813
/*
814
        {
815
        int x;
816
        src-= stride;
817
        for(x=0; x<BLOCK_SIZE; x++)
818
        {
819
                const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
820
                if(ABS(middleEnergy)< 8*QP)
821
                {
822
                        const int q=(src[l4] - src[l5])/2;
823
                        const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
824
                        const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
825

826
                        int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
827
                        d= MAX(d, 0);
828

829
                        d= (5*d + 32) >> 6;
830
                        d*= SIGN(-middleEnergy);
831

832
                        if(q>0)
833
                        {
834
                                d= d<0 ? 0 : d;
835
                                d= d>q ? q : d;
836
                        }
837
                        else
838
                        {
839
                                d= d>0 ? 0 : d;
840
                                d= d<q ? q : d;
841
                        }
842

843
                        src[l4]-= d;
844
                        src[l5]+= d;
845
                }
846
                src++;
847
        }
848
src-=8;
849
        for(x=0; x<8; x++)
850
        {
851
                int y;
852
                for(y=4; y<6; y++)
853
                {
854
                        int d= src[x+y*stride] - tmp[x+(y-4)*8];
855
                        int ad= ABS(d);
856
                        static int max=0;
857
                        static int sum=0;
858
                        static int num=0;
859
                        static int bias=0;
860

861
                        if(max<ad) max=ad;
862
                        sum+= ad>3 ? 1 : 0;
863
                        if(ad>3)
864
                        {
865
                                src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
866
                        }
867
                        if(y==4) bias+=d;
868
                        num++;
869
                        if(num%1000000 == 0)
870
                        {
871
                                printf(" %d %d %d %d\n", num, sum, max, bias);
872
                        }
873
                }
874
        }
875
}
876
*/
877
#elif defined (HAVE_MMX)
878 acced553 Michael Niedermayer
        src+= stride*4;
879 3057fa66 Arpi
        asm volatile(
880
                "pxor %%mm7, %%mm7                                \n\t"
881 9c9e467d Michael Niedermayer
                "leal -40(%%esp), %%ecx                                \n\t" // make space for 4 8-byte vars
882
                "andl $0xFFFFFFF8, %%ecx                        \n\t" // align
883 3057fa66 Arpi
//        0        1        2        3        4        5        6        7
884 9c9e467d Michael Niedermayer
//        %0        %0+%1        %0+2%1        eax+2%1        %0+4%1        eax+4%1        edx+%1        edx+2%1
885
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1
886 3057fa66 Arpi
887
                "movq (%0), %%mm0                                \n\t"
888
                "movq %%mm0, %%mm1                                \n\t"
889
                "punpcklbw %%mm7, %%mm0                                \n\t" // low part of line 0
890
                "punpckhbw %%mm7, %%mm1                                \n\t" // high part of line 0
891
892 01dc3aa4 Michael Niedermayer
                "movq (%0, %1), %%mm2                                \n\t"
893
                "leal (%0, %1, 2), %%eax                        \n\t"
894 3057fa66 Arpi
                "movq %%mm2, %%mm3                                \n\t"
895
                "punpcklbw %%mm7, %%mm2                                \n\t" // low part of line 1
896
                "punpckhbw %%mm7, %%mm3                                \n\t" // high part of line 1
897
898 01dc3aa4 Michael Niedermayer
                "movq (%%eax), %%mm4                                \n\t"
899 3057fa66 Arpi
                "movq %%mm4, %%mm5                                \n\t"
900
                "punpcklbw %%mm7, %%mm4                                \n\t" // low part of line 2
901
                "punpckhbw %%mm7, %%mm5                                \n\t" // high part of line 2
902
903
                "paddw %%mm0, %%mm0                                \n\t" // 2L0
904
                "paddw %%mm1, %%mm1                                \n\t" // 2H0
905
                "psubw %%mm4, %%mm2                                \n\t" // L1 - L2
906
                "psubw %%mm5, %%mm3                                \n\t" // H1 - H2
907
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - L1 + L2
908
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - H1 + H2
909
910
                "psllw $2, %%mm2                                \n\t" // 4L1 - 4L2
911
                "psllw $2, %%mm3                                \n\t" // 4H1 - 4H2
912
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - 5L1 + 5L2
913
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - 5H1 + 5H2
914
915 01dc3aa4 Michael Niedermayer
                "movq (%%eax, %1), %%mm2                        \n\t"
916 3057fa66 Arpi
                "movq %%mm2, %%mm3                                \n\t"
917
                "punpcklbw %%mm7, %%mm2                                \n\t" // L3
918
                "punpckhbw %%mm7, %%mm3                                \n\t" // H3
919
920
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - 5L1 + 5L2 - L3
921
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - 5H1 + 5H2 - H3
922
                "psubw %%mm2, %%mm0                                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
923
                "psubw %%mm3, %%mm1                                \n\t" // 2H0 - 5H1 + 5H2 - 2H3
924 9c9e467d Michael Niedermayer
                "movq %%mm0, (%%ecx)                                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
925
                "movq %%mm1, 8(%%ecx)                                \n\t" // 2H0 - 5H1 + 5H2 - 2H3
926 3057fa66 Arpi
927 01dc3aa4 Michael Niedermayer
                "movq (%%eax, %1, 2), %%mm0                        \n\t"
928 3057fa66 Arpi
                "movq %%mm0, %%mm1                                \n\t"
929
                "punpcklbw %%mm7, %%mm0                                \n\t" // L4
930
                "punpckhbw %%mm7, %%mm1                                \n\t" // H4
931
932
                "psubw %%mm0, %%mm2                                \n\t" // L3 - L4
933
                "psubw %%mm1, %%mm3                                \n\t" // H3 - H4
934 9c9e467d Michael Niedermayer
                "movq %%mm2, 16(%%ecx)                                \n\t" // L3 - L4
935
                "movq %%mm3, 24(%%ecx)                                \n\t" // H3 - H4
936 3057fa66 Arpi
                "paddw %%mm4, %%mm4                                \n\t" // 2L2
937
                "paddw %%mm5, %%mm5                                \n\t" // 2H2
938
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - L3 + L4
939
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - H3 + H4
940
941 01dc3aa4 Michael Niedermayer
                "leal (%%eax, %1), %0                                \n\t"
942 3057fa66 Arpi
                "psllw $2, %%mm2                                \n\t" // 4L3 - 4L4
943
                "psllw $2, %%mm3                                \n\t" // 4H3 - 4H4
944
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - 5L3 + 5L4
945
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - 5H3 + 5H4
946
//50 opcodes so far
947 01dc3aa4 Michael Niedermayer
                "movq (%0, %1, 2), %%mm2                        \n\t"
948 3057fa66 Arpi
                "movq %%mm2, %%mm3                                \n\t"
949
                "punpcklbw %%mm7, %%mm2                                \n\t" // L5
950
                "punpckhbw %%mm7, %%mm3                                \n\t" // H5
951
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - 5L3 + 5L4 - L5
952
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - 5H3 + 5H4 - H5
953
                "psubw %%mm2, %%mm4                                \n\t" // 2L2 - 5L3 + 5L4 - 2L5
954
                "psubw %%mm3, %%mm5                                \n\t" // 2H2 - 5H3 + 5H4 - 2H5
955
956 01dc3aa4 Michael Niedermayer
                "movq (%%eax, %1, 4), %%mm6                        \n\t"
957 3057fa66 Arpi
                "punpcklbw %%mm7, %%mm6                                \n\t" // L6
958
                "psubw %%mm6, %%mm2                                \n\t" // L5 - L6
959 01dc3aa4 Michael Niedermayer
                "movq (%%eax, %1, 4), %%mm6                        \n\t"
960 3057fa66 Arpi
                "punpckhbw %%mm7, %%mm6                                \n\t" // H6
961
                "psubw %%mm6, %%mm3                                \n\t" // H5 - H6
962
963
                "paddw %%mm0, %%mm0                                \n\t" // 2L4
964
                "paddw %%mm1, %%mm1                                \n\t" // 2H4
965
                "psubw %%mm2, %%mm0                                \n\t" // 2L4 - L5 + L6
966
                "psubw %%mm3, %%mm1                                \n\t" // 2H4 - H5 + H6
967
968
                "psllw $2, %%mm2                                \n\t" // 4L5 - 4L6
969
                "psllw $2, %%mm3                                \n\t" // 4H5 - 4H6
970
                "psubw %%mm2, %%mm0                                \n\t" // 2L4 - 5L5 + 5L6
971
                "psubw %%mm3, %%mm1                                \n\t" // 2H4 - 5H5 + 5H6
972
973 01dc3aa4 Michael Niedermayer
                "movq (%0, %1, 4), %%mm2                        \n\t"
974 3057fa66 Arpi
                "movq %%mm2, %%mm3                                \n\t"
975
                "punpcklbw %%mm7, %%mm2                                \n\t" // L7
976
                "punpckhbw %%mm7, %%mm3                                \n\t" // H7
977
978
                "paddw %%mm2, %%mm2                                \n\t" // 2L7
979
                "paddw %%mm3, %%mm3                                \n\t" // 2H7
980
                "psubw %%mm2, %%mm0                                \n\t" // 2L4 - 5L5 + 5L6 - 2L7
981
                "psubw %%mm3, %%mm1                                \n\t" // 2H4 - 5H5 + 5H6 - 2H7
982
983 9c9e467d Michael Niedermayer
                "movq (%%ecx), %%mm2                                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
984
                "movq 8(%%ecx), %%mm3                                \n\t" // 2H0 - 5H1 + 5H2 - 2H3
985 8405b3fd Michael Niedermayer
986
#ifdef HAVE_MMX2
987
                "movq %%mm7, %%mm6                                \n\t" // 0
988
                "psubw %%mm0, %%mm6                                \n\t"
989
                "pmaxsw %%mm6, %%mm0                                \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
990
                "movq %%mm7, %%mm6                                \n\t" // 0
991
                "psubw %%mm1, %%mm6                                \n\t"
992
                "pmaxsw %%mm6, %%mm1                                \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
993
                "movq %%mm7, %%mm6                                \n\t" // 0
994
                "psubw %%mm2, %%mm6                                \n\t"
995
                "pmaxsw %%mm6, %%mm2                                \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
996
                "movq %%mm7, %%mm6                                \n\t" // 0
997
                "psubw %%mm3, %%mm6                                \n\t"
998
                "pmaxsw %%mm6, %%mm3                                \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
999
#else
1000 3057fa66 Arpi
                "movq %%mm7, %%mm6                                \n\t" // 0
1001
                "pcmpgtw %%mm0, %%mm6                                \n\t"
1002
                "pxor %%mm6, %%mm0                                \n\t"
1003
                "psubw %%mm6, %%mm0                                \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1004
                "movq %%mm7, %%mm6                                \n\t" // 0
1005
                "pcmpgtw %%mm1, %%mm6                                \n\t"
1006
                "pxor %%mm6, %%mm1                                \n\t"
1007
                "psubw %%mm6, %%mm1                                \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1008
                "movq %%mm7, %%mm6                                \n\t" // 0
1009
                "pcmpgtw %%mm2, %%mm6                                \n\t"
1010
                "pxor %%mm6, %%mm2                                \n\t"
1011
                "psubw %%mm6, %%mm2                                \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1012
                "movq %%mm7, %%mm6                                \n\t" // 0
1013
                "pcmpgtw %%mm3, %%mm6                                \n\t"
1014
                "pxor %%mm6, %%mm3                                \n\t"
1015
                "psubw %%mm6, %%mm3                                \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1016 8405b3fd Michael Niedermayer
#endif
1017 3057fa66 Arpi
1018
#ifdef HAVE_MMX2
1019
                "pminsw %%mm2, %%mm0                                \n\t"
1020
                "pminsw %%mm3, %%mm1                                \n\t"
1021
#else
1022
                "movq %%mm0, %%mm6                                \n\t"
1023
                "psubusw %%mm2, %%mm6                                \n\t"
1024
                "psubw %%mm6, %%mm0                                \n\t"
1025
                "movq %%mm1, %%mm6                                \n\t"
1026
                "psubusw %%mm3, %%mm6                                \n\t"
1027
                "psubw %%mm6, %%mm1                                \n\t"
1028
#endif
1029
1030
                "movq %%mm7, %%mm6                                \n\t" // 0
1031
                "pcmpgtw %%mm4, %%mm6                                \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1032
                "pxor %%mm6, %%mm4                                \n\t"
1033
                "psubw %%mm6, %%mm4                                \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1034
                "pcmpgtw %%mm5, %%mm7                                \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1035
                "pxor %%mm7, %%mm5                                \n\t"
1036
                "psubw %%mm7, %%mm5                                \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1037
// 100 opcodes
1038
                "movd %2, %%mm2                                        \n\t" // QP
1039
                "psllw $3, %%mm2                                \n\t" // 8QP
1040
                "movq %%mm2, %%mm3                                \n\t" // 8QP
1041
                "pcmpgtw %%mm4, %%mm2                                \n\t"
1042
                "pcmpgtw %%mm5, %%mm3                                \n\t"
1043
                "pand %%mm2, %%mm4                                \n\t"
1044
                "pand %%mm3, %%mm5                                \n\t"
1045
1046
1047
                "psubusw %%mm0, %%mm4                                \n\t" // ld
1048
                "psubusw %%mm1, %%mm5                                \n\t" // hd
1049
1050
1051 bf1595c4 Felix Bünemann
                "movq "MANGLE(w05)", %%mm2                        \n\t" // 5
1052 3057fa66 Arpi
                "pmullw %%mm2, %%mm4                                \n\t"
1053
                "pmullw %%mm2, %%mm5                                \n\t"
1054 bf1595c4 Felix Bünemann
                "movq "MANGLE(w20)", %%mm2                        \n\t" // 32
1055 3057fa66 Arpi
                "paddw %%mm2, %%mm4                                \n\t"
1056
                "paddw %%mm2, %%mm5                                \n\t"
1057
                "psrlw $6, %%mm4                                \n\t"
1058
                "psrlw $6, %%mm5                                \n\t"
1059
1060 9c9e467d Michael Niedermayer
                "movq 16(%%ecx), %%mm0                                \n\t" // L3 - L4
1061
                "movq 24(%%ecx), %%mm1                                \n\t" // H3 - H4
1062 3057fa66 Arpi
1063
                "pxor %%mm2, %%mm2                                \n\t"
1064
                "pxor %%mm3, %%mm3                                \n\t"
1065
1066
                "pcmpgtw %%mm0, %%mm2                                \n\t" // sign (L3-L4)
1067
                "pcmpgtw %%mm1, %%mm3                                \n\t" // sign (H3-H4)
1068
                "pxor %%mm2, %%mm0                                \n\t"
1069
                "pxor %%mm3, %%mm1                                \n\t"
1070
                "psubw %%mm2, %%mm0                                \n\t" // |L3-L4|
1071
                "psubw %%mm3, %%mm1                                \n\t" // |H3-H4|
1072 e5c30e06 Michael Niedermayer
                "psrlw $1, %%mm0                                \n\t" // |L3 - L4|/2
1073
                "psrlw $1, %%mm1                                \n\t" // |H3 - H4|/2
1074 3057fa66 Arpi
1075
                "pxor %%mm6, %%mm2                                \n\t"
1076
                "pxor %%mm7, %%mm3                                \n\t"
1077
                "pand %%mm2, %%mm4                                \n\t"
1078
                "pand %%mm3, %%mm5                                \n\t"
1079
1080
#ifdef HAVE_MMX2
1081
                "pminsw %%mm0, %%mm4                                \n\t"
1082
                "pminsw %%mm1, %%mm5                                \n\t"
1083
#else
1084
                "movq %%mm4, %%mm2                                \n\t"
1085
                "psubusw %%mm0, %%mm2                                \n\t"
1086
                "psubw %%mm2, %%mm4                                \n\t"
1087
                "movq %%mm5, %%mm2                                \n\t"
1088
                "psubusw %%mm1, %%mm2                                \n\t"
1089
                "psubw %%mm2, %%mm5                                \n\t"
1090
#endif
1091
                "pxor %%mm6, %%mm4                                \n\t"
1092
                "pxor %%mm7, %%mm5                                \n\t"
1093
                "psubw %%mm6, %%mm4                                \n\t"
1094
                "psubw %%mm7, %%mm5                                \n\t"
1095
                "packsswb %%mm5, %%mm4                                \n\t"
1096 01dc3aa4 Michael Niedermayer
                "movq (%0), %%mm0                                \n\t"
1097 3057fa66 Arpi
                "paddb   %%mm4, %%mm0                                \n\t"
1098 01dc3aa4 Michael Niedermayer
                "movq %%mm0, (%0)                                \n\t"
1099
                "movq (%0, %1), %%mm0                                \n\t"
1100 3057fa66 Arpi
                "psubb %%mm4, %%mm0                                \n\t"
1101 01dc3aa4 Michael Niedermayer
                "movq %%mm0, (%0, %1)                                \n\t"
1102 3057fa66 Arpi
1103 01dc3aa4 Michael Niedermayer
                : "+r" (src)
1104
                : "r" (stride), "m" (c->pQPb)
1105
                : "%eax", "%ecx"
1106 3057fa66 Arpi
        );
1107
#else
1108
        const int l1= stride;
1109
        const int l2= stride + l1;
1110
        const int l3= stride + l2;
1111
        const int l4= stride + l3;
1112
        const int l5= stride + l4;
1113
        const int l6= stride + l5;
1114
        const int l7= stride + l6;
1115
        const int l8= stride + l7;
1116
//        const int l9= stride + l8;
1117 d5a1a995 Michael Niedermayer
        int x;
1118 acced553 Michael Niedermayer
        src+= stride*3;
1119 d5a1a995 Michael Niedermayer
        for(x=0; x<BLOCK_SIZE; x++)
1120 3057fa66 Arpi
        {
1121
                const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1122 9c9e467d Michael Niedermayer
                if(ABS(middleEnergy) < 8*c->QP)
1123 3057fa66 Arpi
                {
1124
                        const int q=(src[l4] - src[l5])/2;
1125
                        const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1126
                        const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1127
1128
                        int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1129
                        d= MAX(d, 0);
1130
1131
                        d= (5*d + 32) >> 6;
1132
                        d*= SIGN(-middleEnergy);
1133
1134
                        if(q>0)
1135
                        {
1136
                                d= d<0 ? 0 : d;
1137
                                d= d>q ? q : d;
1138
                        }
1139
                        else
1140
                        {
1141
                                d= d>0 ? 0 : d;
1142
                                d= d<q ? q : d;
1143
                        }
1144
1145
                        src[l4]-= d;
1146
                        src[l5]+= d;
1147
                }
1148
                src++;
1149
        }
1150
#endif
1151
}
1152
1153 9c9e467d Michael Niedermayer
static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
1154 3057fa66 Arpi
{
1155 e0f8ffae Michael Niedermayer
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1156 3057fa66 Arpi
        asm volatile(
1157 9c9e467d Michael Niedermayer
                "pxor %%mm6, %%mm6                                \n\t"
1158
                "pcmpeqb %%mm7, %%mm7                                \n\t"
1159
                "movq %2, %%mm0                                        \n\t"
1160
                "punpcklbw %%mm6, %%mm0                                \n\t"
1161
                "psrlw $1, %%mm0                                \n\t"
1162
                "psubw %%mm7, %%mm0                                \n\t"
1163
                "packuswb %%mm0, %%mm0                                \n\t"
1164
                "movq %%mm0, %3                                        \n\t"
1165 70c5ae87 Michael Niedermayer
1166 3057fa66 Arpi
                "leal (%0, %1), %%eax                                \n\t"
1167 9c9e467d Michael Niedermayer
                "leal (%%eax, %1, 4), %%edx                        \n\t"
1168
                
1169 3057fa66 Arpi
//        0        1        2        3        4        5        6        7        8        9
1170 9c9e467d Michael Niedermayer
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1
1171 3057fa66 Arpi
1172 cc9b0679 Michael Niedermayer
// FIND_MIN_MAX(addr): load the 8 bytes at memory operand addr into mm0 and
// fold them into the running per-byte minimum kept in mm7 and the running
// per-byte maximum kept in mm6.
// Clobbers mm0; the non-MMX2 variant additionally clobbers mm1.
#undef FIND_MIN_MAX
1173 e0f8ffae Michael Niedermayer
#ifdef HAVE_MMX2
// MMX2 variant: uses the native unsigned byte min/max instructions.
1174 3057fa66 Arpi
#define FIND_MIN_MAX(addr)\
1175 70c5ae87 Michael Niedermayer
                "movq " #addr ", %%mm0                                \n\t"\
1176 cd38e322 Michael Niedermayer
                "pminub %%mm0, %%mm7                                \n\t"\
1177
                "pmaxub %%mm0, %%mm6                                \n\t"
1178 e0f8ffae Michael Niedermayer
#else
// Fallback variant: emulates unsigned byte max/min with a saturating subtract
// followed by a plain add (max) or a plain subtract (min).
1179
#define FIND_MIN_MAX(addr)\
1180
                "movq " #addr ", %%mm0                                \n\t"\
1181 cd38e322 Michael Niedermayer
                "movq %%mm7, %%mm1                                \n\t"\
1182
                "psubusb %%mm0, %%mm6                                \n\t"\
1183
                "paddb %%mm0, %%mm6                                \n\t" /* mm6 = max(mm6, mm0) */\
1184 e0f8ffae Michael Niedermayer
                "psubusb %%mm0, %%mm1                                \n\t"\
1185 cd38e322 Michael Niedermayer
                "psubb %%mm1, %%mm7                                \n\t" /* mm7 = min(mm7, mm0) */
1186 e0f8ffae Michael Niedermayer
#endif
1187 3057fa66 Arpi
1188 70c5ae87 Michael Niedermayer
FIND_MIN_MAX((%%eax))
1189
FIND_MIN_MAX((%%eax, %1))
1190
FIND_MIN_MAX((%%eax, %1, 2))
1191
FIND_MIN_MAX((%0, %1, 4))
1192 9c9e467d Michael Niedermayer
FIND_MIN_MAX((%%edx))
1193
FIND_MIN_MAX((%%edx, %1))
1194
FIND_MIN_MAX((%%edx, %1, 2))
1195 70c5ae87 Michael Niedermayer
FIND_MIN_MAX((%0, %1, 8))
1196 3057fa66 Arpi
1197
                "movq %%mm7, %%mm4                                \n\t"
1198 e5c30e06 Michael Niedermayer
                "psrlq $8, %%mm7                                \n\t"
1199
#ifdef HAVE_MMX2
1200 cd38e322 Michael Niedermayer
                "pminub %%mm4, %%mm7                                \n\t" // min of pixels
1201 e5c30e06 Michael Niedermayer
                "pshufw $0xF9, %%mm7, %%mm4                        \n\t"
1202 cd38e322 Michael Niedermayer
                "pminub %%mm4, %%mm7                                \n\t" // min of pixels
1203 e5c30e06 Michael Niedermayer
                "pshufw $0xFE, %%mm7, %%mm4                        \n\t"
1204 cd38e322 Michael Niedermayer
                "pminub %%mm4, %%mm7                                \n\t"
1205 e5c30e06 Michael Niedermayer
#else
1206 cd38e322 Michael Niedermayer
                "movq %%mm7, %%mm1                                \n\t"
1207
                "psubusb %%mm4, %%mm1                                \n\t"
1208
                "psubb %%mm1, %%mm7                                \n\t"
1209 3057fa66 Arpi
                "movq %%mm7, %%mm4                                \n\t"
1210
                "psrlq $16, %%mm7                                \n\t"
1211 cd38e322 Michael Niedermayer
                "movq %%mm7, %%mm1                                \n\t"
1212
                "psubusb %%mm4, %%mm1                                \n\t"
1213
                "psubb %%mm1, %%mm7                                \n\t"
1214 3057fa66 Arpi
                "movq %%mm7, %%mm4                                \n\t"
1215 e5c30e06 Michael Niedermayer
                "psrlq $32, %%mm7                                \n\t"
1216 cd38e322 Michael Niedermayer
                "movq %%mm7, %%mm1                                \n\t"
1217
                "psubusb %%mm4, %%mm1                                \n\t"
1218
                "psubb %%mm1, %%mm7                                \n\t"
1219 e5c30e06 Michael Niedermayer
#endif
1220 cd38e322 Michael Niedermayer
1221
1222
                "movq %%mm6, %%mm4                                \n\t"
1223
                "psrlq $8, %%mm6                                \n\t"
1224
#ifdef HAVE_MMX2
1225
                "pmaxub %%mm4, %%mm6                                \n\t" // max of pixels
1226
                "pshufw $0xF9, %%mm6, %%mm4                        \n\t"
1227
                "pmaxub %%mm4, %%mm6                                \n\t"
1228
                "pshufw $0xFE, %%mm6, %%mm4                        \n\t"
1229
                "pmaxub %%mm4, %%mm6                                \n\t"
1230
#else
1231
                "psubusb %%mm4, %%mm6                                \n\t"
1232
                "paddb %%mm4, %%mm6                                \n\t"
1233
                "movq %%mm6, %%mm4                                \n\t"
1234
                "psrlq $16, %%mm6                                \n\t"
1235
                "psubusb %%mm4, %%mm6                                \n\t"
1236
                "paddb %%mm4, %%mm6                                \n\t"
1237
                "movq %%mm6, %%mm4                                \n\t"
1238
                "psrlq $32, %%mm6                                \n\t"
1239
                "psubusb %%mm4, %%mm6                                \n\t"
1240
                "paddb %%mm4, %%mm6                                \n\t"
1241
#endif
1242
                "movq %%mm6, %%mm0                                \n\t" // max
1243
                "psubb %%mm7, %%mm6                                \n\t" // max - min
1244
                "movd %%mm6, %%ecx                                \n\t"
1245 9b464428 Felix Bünemann
                "cmpb "MANGLE(deringThreshold)", %%cl                \n\t"
1246 cd38e322 Michael Niedermayer
                " jb 1f                                                \n\t"
1247 9c9e467d Michael Niedermayer
                "leal -24(%%esp), %%ecx                                \n\t"
1248
                "andl $0xFFFFFFF8, %%ecx                        \n\t" 
1249 cd38e322 Michael Niedermayer
                PAVGB(%%mm0, %%mm7)                                      // a=(max + min)/2
1250 e5c30e06 Michael Niedermayer
                "punpcklbw %%mm7, %%mm7                                \n\t"
1251
                "punpcklbw %%mm7, %%mm7                                \n\t"
1252
                "punpcklbw %%mm7, %%mm7                                \n\t"
1253 9c9e467d Michael Niedermayer
                "movq %%mm7, (%%ecx)                                \n\t"
1254 70c5ae87 Michael Niedermayer
1255
                "movq (%0), %%mm0                                \n\t" // L10
1256
                "movq %%mm0, %%mm1                                \n\t" // L10
1257
                "movq %%mm0, %%mm2                                \n\t" // L10
1258
                "psllq $8, %%mm1                                \n\t"
1259
                "psrlq $8, %%mm2                                \n\t"
1260
                "movd -4(%0), %%mm3                                \n\t"
1261
                "movd 8(%0), %%mm4                                \n\t"
1262
                "psrlq $24, %%mm3                                \n\t"
1263
                "psllq $56, %%mm4                                \n\t"
1264
                "por %%mm3, %%mm1                                \n\t" // L00
1265
                "por %%mm4, %%mm2                                \n\t" // L20
1266
                "movq %%mm1, %%mm3                                \n\t" // L00
1267
                PAVGB(%%mm2, %%mm1)                                      // (L20 + L00)/2
1268
                PAVGB(%%mm0, %%mm1)                                      // (L20 + L00 + 2L10)/4
1269
                "psubusb %%mm7, %%mm0                                \n\t"
1270
                "psubusb %%mm7, %%mm2                                \n\t"
1271
                "psubusb %%mm7, %%mm3                                \n\t"
1272 9b464428 Felix Bünemann
                "pcmpeqb "MANGLE(b00)", %%mm0                        \n\t" // L10 > a ? 0 : -1
1273
                "pcmpeqb "MANGLE(b00)", %%mm2                        \n\t" // L20 > a ? 0 : -1
1274
                "pcmpeqb "MANGLE(b00)", %%mm3                        \n\t" // L00 > a ? 0 : -1
1275 70c5ae87 Michael Niedermayer
                "paddb %%mm2, %%mm0                                \n\t"
1276
                "paddb %%mm3, %%mm0                                \n\t"
1277
1278
                "movq (%%eax), %%mm2                                \n\t" // L11
1279
                "movq %%mm2, %%mm3                                \n\t" // L11
1280
                "movq %%mm2, %%mm4                                \n\t" // L11
1281
                "psllq $8, %%mm3                                \n\t"
1282
                "psrlq $8, %%mm4                                \n\t"
1283
                "movd -4(%%eax), %%mm5                                \n\t"
1284
                "movd 8(%%eax), %%mm6                                \n\t"
1285
                "psrlq $24, %%mm5                                \n\t"
1286
                "psllq $56, %%mm6                                \n\t"
1287
                "por %%mm5, %%mm3                                \n\t" // L01
1288
                "por %%mm6, %%mm4                                \n\t" // L21
1289
                "movq %%mm3, %%mm5                                \n\t" // L01
1290
                PAVGB(%%mm4, %%mm3)                                      // (L21 + L01)/2
1291
                PAVGB(%%mm2, %%mm3)                                      // (L21 + L01 + 2L11)/4
1292
                "psubusb %%mm7, %%mm2                                \n\t"
1293
                "psubusb %%mm7, %%mm4                                \n\t"
1294
                "psubusb %%mm7, %%mm5                                \n\t"
1295 9b464428 Felix Bünemann
                "pcmpeqb "MANGLE(b00)", %%mm2                        \n\t" // L11 > a ? 0 : -1
1296
                "pcmpeqb "MANGLE(b00)", %%mm4                        \n\t" // L21 > a ? 0 : -1
1297
                "pcmpeqb "MANGLE(b00)", %%mm5                        \n\t" // L01 > a ? 0 : -1
1298 70c5ae87 Michael Niedermayer
                "paddb %%mm4, %%mm2                                \n\t"
1299
                "paddb %%mm5, %%mm2                                \n\t"
1300
// 0, 2, 3, 1
1301
// DERING_CORE: one step of the dering filter; loads the 8-byte row at src and
// conditionally rewrites the 8-byte row at dst.
//   dst          memory operand of the row that is read, filtered and written back
//   src          memory operand of the row loaded here; -4(src) and 8(src) are
//                also read to obtain the left and right neighbour bytes
//   ppsx,psx,sx  mm registers with per-byte comparison sums; sx is computed here,
//                then sx and psx are added into ppsx, which becomes the select mask
//                (presumably the sums of the rows two back / one back / current)
//   pplx,plx,lx  mm registers with horizontally smoothed rows; lx is computed here
//                and averaged, together with plx, into pplx (the filtered value)
//   t0,t1        scratch mm registers
// Memory used besides dst/src: (%ecx) is read as the comparison threshold
// (written by the surrounding asm before the first expansion), 8(%ecx) is a
// spill slot for lx, and operand %3 is the maximum distance the result may
// move away from the original dst bytes.
#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1302
                "movq " #src ", " #sx "                                \n\t" /* src[0] */\
1303
                "movq " #sx ", " #lx "                                \n\t" /* src[0] */\
1304
                "movq " #sx ", " #t0 "                                \n\t" /* src[0] */\
1305
                "psllq $8, " #lx "                                \n\t"\
1306
                "psrlq $8, " #t0 "                                \n\t"\
1307
                "movd -4" #src ", " #t1 "                        \n\t"\
1308
                "psrlq $24, " #t1 "                                \n\t"\
1309
                "por " #t1 ", " #lx "                                \n\t" /* src[-1] */\
1310
                "movd 8" #src ", " #t1 "                        \n\t"\
1311
                "psllq $56, " #t1 "                                \n\t"\
1312
                "por " #t1 ", " #t0 "                                \n\t" /* src[+1] */\
1313
                "movq " #lx ", " #t1 "                                \n\t" /* src[-1] */\
1314
                PAVGB(t0, lx)                                              /* (src[-1] + src[+1])/2 */\
1315
                PAVGB(sx, lx)                                      /* (src[-1] + 2src[0] + src[+1])/4 */\
1316 9927c7ee Michael Niedermayer
                PAVGB(lx, pplx)                                             /* pplx = avg(pplx, lx) */\
1317 9c9e467d Michael Niedermayer
                "movq " #lx ", 8(%%ecx)                                \n\t" /* spill lx, the register is reused below */\
1318
                "movq (%%ecx), " #lx "                                \n\t" /* lx = threshold stored at (%ecx) */\
1319 8405b3fd Michael Niedermayer
                "psubusb " #lx ", " #t1 "                        \n\t"\
1320
                "psubusb " #lx ", " #t0 "                        \n\t"\
1321
                "psubusb " #lx ", " #sx "                        \n\t"\
1322 9b464428 Felix Bünemann
                "movq "MANGLE(b00)", " #lx "                        \n\t" /* presumably all-zero bytes, see the compares below */\
1323 8405b3fd Michael Niedermayer
                "pcmpeqb " #lx ", " #t1 "                        \n\t" /* src[-1] > a ? 0 : -1*/\
1324
                "pcmpeqb " #lx ", " #t0 "                        \n\t" /* src[+1] > a ? 0 : -1*/\
1325
                "pcmpeqb " #lx ", " #sx "                        \n\t" /* src[0]  > a ? 0 : -1*/\
1326 70c5ae87 Michael Niedermayer
                "paddb " #t1 ", " #t0 "                                \n\t"\
1327
                "paddb " #t0 ", " #sx "                                \n\t" /* sx = sum of the three compare results */\
1328
\
1329
                PAVGB(plx, pplx)                                      /* filtered */\
1330
                "movq " #dst ", " #t0 "                                \n\t" /* dst */\
1331 2e212618 Michael Niedermayer
                "movq " #t0 ", " #t1 "                                \n\t" /* dst */\
1332 9c9e467d Michael Niedermayer
                "psubusb %3, " #t0 "                                \n\t" /* t0 = dst - %3, saturating */\
1333
                "paddusb %3, " #t1 "                                \n\t" /* t1 = dst + %3, saturating */\
1334 2e212618 Michael Niedermayer
                PMAXUB(t0, pplx) /* pplx = max(pplx, t0) */\
1335
                PMINUB(t1, pplx, t0) /* pplx = min(pplx, t1); t0 is scratch here */\
1336 70c5ae87 Michael Niedermayer
                "paddb " #sx ", " #ppsx "                        \n\t"\
1337
                "paddb " #psx ", " #ppsx "                        \n\t"\
1338 9b464428 Felix Bünemann
                "#paddb "MANGLE(b02)", " #ppsx "                \n\t" /* leading '#': presumably an assembler comment, i.e. disabled */\
1339
                "pand "MANGLE(b08)", " #ppsx "                        \n\t"\
1340 8405b3fd Michael Niedermayer
                "pcmpeqb " #lx ", " #ppsx "                        \n\t" /* lx still holds b00: ppsx = -1 where the masked sum equals it */\
1341 2e212618 Michael Niedermayer
                "pand " #ppsx ", " #pplx "                        \n\t" /* keep filtered bytes where ppsx is -1 */\
1342 70c5ae87 Michael Niedermayer
                "pandn " #dst ", " #ppsx "                        \n\t" /* keep original dst bytes elsewhere */\
1343 8405b3fd Michael Niedermayer
                "por " #pplx ", " #ppsx "                        \n\t"\
1344 9927c7ee Michael Niedermayer
                "movq " #ppsx ", " #dst "                        \n\t"\
1345 9c9e467d Michael Niedermayer
                "movq 8(%%ecx), " #lx "                                \n\t" /* restore the spilled lx */
1346 2e212618 Michael Niedermayer
1347 70c5ae87 Michael Niedermayer
/*
1348
0000000
1349
1111111
1350 e5c30e06 Michael Niedermayer

1351 70c5ae87 Michael Niedermayer
1111110
1352
1111101
1353
1111100
1354
1111011
1355
1111010
1356
1111001
1357 e5c30e06 Michael Niedermayer

1358 70c5ae87 Michael Niedermayer
1111000
1359
1110111
1360 e5c30e06 Michael Niedermayer

1361 70c5ae87 Michael Niedermayer
*/
1362
//DERING_CORE(dst,src                  ,ppsx ,psx  ,sx   ,pplx ,plx  ,lx   ,t0   ,t1)
1363
DERING_CORE((%%eax),(%%eax, %1)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1364
DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1365
DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1366 9c9e467d Michael Niedermayer
DERING_CORE((%0, %1, 4),(%%edx)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1367
DERING_CORE((%%edx),(%%edx, %1)        ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1368
DERING_CORE((%%edx, %1), (%%edx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1369
DERING_CORE((%%edx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1370
DERING_CORE((%0, %1, 8),(%%edx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1371 3057fa66 Arpi
1372 cd38e322 Michael Niedermayer
                "1:                        \n\t"
1373 9c9e467d Michael Niedermayer
                : : "r" (src), "r" (stride), "m" (c->pQPb), "m"(c->pQPb2)
1374
                : "%eax", "%edx", "%ecx"
1375 3057fa66 Arpi
        );
1376
#else
1377 2e212618 Michael Niedermayer
        int y;
1378
        int min=255;
1379
        int max=0;
1380
        int avg;
1381
        uint8_t *p;
1382
        int s[10];
1383 9c9e467d Michael Niedermayer
        const int QP2= c->QP/2 + 1;
1384 2e212618 Michael Niedermayer
1385
        for(y=1; y<9; y++)
1386
        {
1387
                int x;
1388
                p= src + stride*y;
1389
                for(x=1; x<9; x++)
1390
                {
1391
                        p++;
1392
                        if(*p > max) max= *p;
1393
                        if(*p < min) min= *p;
1394
                }
1395
        }
1396 9c9e467d Michael Niedermayer
        avg= (min + max + 1)>>1;
1397 2e212618 Michael Niedermayer
1398 cd38e322 Michael Niedermayer
        if(max - min <deringThreshold) return;
1399
1400 2e212618 Michael Niedermayer
        for(y=0; y<10; y++)
1401
        {
1402
                int t = 0;
1403 9c9e467d Michael Niedermayer
1404
                if(src[stride*y + 0] > avg) t+= 1;
1405
                if(src[stride*y + 1] > avg) t+= 2;
1406
                if(src[stride*y + 2] > avg) t+= 4;
1407
                if(src[stride*y + 3] > avg) t+= 8;
1408
                if(src[stride*y + 4] > avg) t+= 16;
1409
                if(src[stride*y + 5] > avg) t+= 32;
1410
                if(src[stride*y + 6] > avg) t+= 64;
1411
                if(src[stride*y + 7] > avg) t+= 128;
1412
                if(src[stride*y + 8] > avg) t+= 256;
1413
                if(src[stride*y + 9] > avg) t+= 512;
1414
                
1415 2e212618 Michael Niedermayer
                t |= (~t)<<16;
1416
                t &= (t<<1) & (t>>1);
1417
                s[y] = t;
1418
        }
1419 9c9e467d Michael Niedermayer
        
1420 2e212618 Michael Niedermayer
        for(y=1; y<9; y++)
1421
        {
1422
                int t = s[y-1] & s[y] & s[y+1];
1423
                t|= t>>16;
1424 9c9e467d Michael Niedermayer
                s[y-1]= t;
1425
        }
1426
1427
        for(y=1; y<9; y++)
1428
        {
1429
                int x;
1430
                int t = s[y-1];
1431 2e212618 Michael Niedermayer
1432
                p= src + stride*y;
1433
                for(x=1; x<9; x++)
1434
                {
1435
                        p++;
1436
                        if(t & (1<<x))
1437
                        {
1438
                                int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1439
                                      +2*(*(p     -1)) + 4*(*p         ) + 2*(*(p     +1))
1440
                                      +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
1441
                                f= (f + 8)>>4;
1442
1443 cd38e322 Michael Niedermayer
#ifdef DEBUG_DERING_THRESHOLD
1444
                                asm volatile("emms\n\t":);
1445
                                {
1446
                                static long long numPixels=0;
1447
                                if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1448
//                                if((max-min)<20 || (max-min)*QP<200)
1449
//                                if((max-min)*QP < 500)
1450
//                                if(max-min<QP/2)
1451
                                if(max-min < 20)
1452
                                {
1453
                                        static int numSkiped=0;
1454
                                        static int errorSum=0;
1455
                                        static int worstQP=0;
1456
                                        static int worstRange=0;
1457
                                        static int worstDiff=0;
1458
                                        int diff= (f - *p);
1459
                                        int absDiff= ABS(diff);
1460
                                        int error= diff*diff;
1461
1462
                                        if(x==1 || x==8 || y==1 || y==8) continue;
1463
1464
                                        numSkiped++;
1465
                                        if(absDiff > worstDiff)
1466
                                        {
1467
                                                worstDiff= absDiff;
1468
                                                worstQP= QP;
1469
                                                worstRange= max-min;
1470
                                        }
1471
                                        errorSum+= error;
1472
1473
                                        if(1024LL*1024LL*1024LL % numSkiped == 0)
1474
                                        {
1475
                                                printf( "sum:%1.3f, skip:%d, wQP:%d, "
1476
                                                        "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1477
                                                        (float)errorSum/numSkiped, numSkiped, worstQP, worstRange,
1478
                                                        worstDiff, (float)numSkiped/numPixels);
1479
                                        }
1480
                                }
1481
                                }
1482
#endif
1483 9c9e467d Michael Niedermayer
                                if     (*p + QP2 < f) *p= *p + QP2;
1484
                                else if(*p - QP2 > f) *p= *p - QP2;
1485 2e212618 Michael Niedermayer
                                else *p=f;
1486
                        }
1487
                }
1488
        }
1489 cd38e322 Michael Niedermayer
#ifdef DEBUG_DERING_THRESHOLD
1490
        if(max-min < 20)
1491
        {
1492
                for(y=1; y<9; y++)
1493
                {
1494
                        int x;
1495
                        int t = 0;
1496
                        p= src + stride*y;
1497
                        for(x=1; x<9; x++)
1498
                        {
1499
                                p++;
1500
                                *p = MIN(*p + 20, 255);
1501
                        }
1502
                }
1503
//                src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1504
        }
1505
#endif
1506 3057fa66 Arpi
#endif
1507
}
1508
1509 3b58b885 Michael Niedermayer
/**
1510 b304569a Michael Niedermayer
 * Deinterlaces the given block by linearly interpolating every second line.
1511 7fb36f6c Michael Niedermayer
 * will be called for every 8x8 block and can read & write from line 4-15
1512
 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1513
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1514 3b58b885 Michael Niedermayer
 */
1515 cc9b0679 Michael Niedermayer
static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
1516 3b58b885 Michael Niedermayer
{
1517
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1518 7fb36f6c Michael Niedermayer
        src+= 4*stride;
1519 3b58b885 Michael Niedermayer
        asm volatile(
1520
                "leal (%0, %1), %%eax                                \n\t"
1521 9c9e467d Michael Niedermayer
                "leal (%%eax, %1, 4), %%ecx                        \n\t"
1522 3b58b885 Michael Niedermayer
//        0        1        2        3        4        5        6        7        8        9
1523 9c9e467d Michael Niedermayer
//        %0        eax        eax+%1        eax+2%1        %0+4%1        ecx        ecx+%1        ecx+2%1        %0+8%1        ecx+4%1
1524 3b58b885 Michael Niedermayer
1525
                "movq (%0), %%mm0                                \n\t"
1526
                "movq (%%eax, %1), %%mm1                        \n\t"
1527 acced553 Michael Niedermayer
                PAVGB(%%mm1, %%mm0)
1528 3b58b885 Michael Niedermayer
                "movq %%mm0, (%%eax)                                \n\t"
1529
                "movq (%0, %1, 4), %%mm0                        \n\t"
1530 acced553 Michael Niedermayer
                PAVGB(%%mm0, %%mm1)
1531 3b58b885 Michael Niedermayer
                "movq %%mm1, (%%eax, %1, 2)                        \n\t"
1532 9c9e467d Michael Niedermayer
                "movq (%%ecx, %1), %%mm1                        \n\t"
1533 acced553 Michael Niedermayer
                PAVGB(%%mm1, %%mm0)
1534 9c9e467d Michael Niedermayer
                "movq %%mm0, (%%ecx)                                \n\t"
1535 3b58b885 Michael Niedermayer
                "movq (%0, %1, 8), %%mm0                        \n\t"
1536 acced553 Michael Niedermayer
                PAVGB(%%mm0, %%mm1)
1537 9c9e467d Michael Niedermayer
                "movq %%mm1, (%%ecx, %1, 2)                        \n\t"
1538 3b58b885 Michael Niedermayer
1539
                : : "r" (src), "r" (stride)
1540 9c9e467d Michael Niedermayer
                : "%eax", "%ecx"
1541 3b58b885 Michael Niedermayer
        );
1542
#else
1543 99d33fa3 Michael Niedermayer
        int a, b, x;
1544 7fb36f6c Michael Niedermayer
        src+= 4*stride;
1545 99d33fa3 Michael Niedermayer
1546
        for(x=0; x<2; x++){
1547
                a= *(uint32_t*)&src[stride*0];
1548
                b= *(uint32_t*)&src[stride*2];
1549
                *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1550
                a= *(uint32_t*)&src[stride*4];
1551
                *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1552
                b= *(uint32_t*)&src[stride*6];
1553
                *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1554
                a= *(uint32_t*)&src[stride*8];
1555
                *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1556
                src += 4;
1557 3b58b885 Michael Niedermayer
        }
1558
#endif
1559
}
1560
1561
/**
1562 b304569a Michael Niedermayer
 * Deinterlaces the given block by cubic interpolating every second line.
1563 7fb36f6c Michael Niedermayer
 * will be called for every 8x8 block and can read & write from line 4-15
1564
 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1565
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1566
 * this filter will read lines 3-15 and write 7-13
1567 3b58b885 Michael Niedermayer
 */
1568 cc9b0679 Michael Niedermayer
static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
1569 3b58b885 Michael Niedermayer
{
1570
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1571 7fb36f6c Michael Niedermayer
        src+= stride*3;
1572 3b58b885 Michael Niedermayer
        asm volatile(
1573
                "leal (%0, %1), %%eax                                \n\t"
1574 9c9e467d Michael Niedermayer
                "leal (%%eax, %1, 4), %%edx                        \n\t"
1575
                "leal (%%edx, %1, 4), %%ecx                        \n\t"
1576 acced553 Michael Niedermayer
                "addl %1, %%ecx                                        \n\t"
1577
                "pxor %%mm7, %%mm7                                \n\t"
1578
//        0        1        2        3        4        5        6        7        8        9        10
1579 9c9e467d Michael Niedermayer
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1 ecx
1580 3b58b885 Michael Niedermayer
1581 acced553 Michael Niedermayer
#define DEINT_CUBIC(a,b,c,d,e)\
1582
                "movq " #a ", %%mm0                                \n\t"\
1583
                "movq " #b ", %%mm1                                \n\t"\
1584
                "movq " #d ", %%mm2                                \n\t"\
1585
                "movq " #e ", %%mm3                                \n\t"\
1586
                PAVGB(%%mm2, %%mm1)                                        /* (b+d) /2 */\
1587
                PAVGB(%%mm3, %%mm0)                                        /* a(a+e) /2 */\
1588
                "movq %%mm0, %%mm2                                \n\t"\
1589
                "punpcklbw %%mm7, %%mm0                                \n\t"\
1590
                "punpckhbw %%mm7, %%mm2                                \n\t"\
1591
                "movq %%mm1, %%mm3                                \n\t"\
1592
                "punpcklbw %%mm7, %%mm1                                \n\t"\
1593
                "punpckhbw %%mm7, %%mm3                                \n\t"\
1594
                "psubw %%mm1, %%mm0                                \n\t"        /* L(a+e - (b+d))/2 */\
1595
                "psubw %%mm3, %%mm2                                \n\t"        /* H(a+e - (b+d))/2 */\
1596
                "psraw $3, %%mm0                                \n\t"        /* L(a+e - (b+d))/16 */\
1597
                "psraw $3, %%mm2                                \n\t"        /* H(a+e - (b+d))/16 */\
1598
                "psubw %%mm0, %%mm1                                \n\t"        /* L(9b + 9d - a - e)/16 */\
1599
                "psubw %%mm2, %%mm3                                \n\t"        /* H(9b + 9d - a - e)/16 */\
1600
                "packuswb %%mm3, %%mm1                                \n\t"\
1601
                "movq %%mm1, " #c "                                \n\t"
1602
1603 9c9e467d Michael Niedermayer
DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx, %1))
1604
DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%edx), (%%edx, %1), (%0, %1, 8))
1605
DEINT_CUBIC((%0, %1, 4), (%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%ecx))
1606
DEINT_CUBIC((%%edx, %1), (%0, %1, 8), (%%edx, %1, 4), (%%ecx), (%%ecx, %1, 2))
1607 3b58b885 Michael Niedermayer
1608
                : : "r" (src), "r" (stride)
1609 9c9e467d Michael Niedermayer
                : "%eax", "%edx", "ecx"
1610 3b58b885 Michael Niedermayer
        );
1611
#else
1612
        int x;
1613 7fb36f6c Michael Niedermayer
        src+= stride*3;
1614 3b58b885 Michael Niedermayer
        for(x=0; x<8; x++)
1615
        {
1616 134eb1e5 Michael Niedermayer
                src[stride*3] = CLIP((-src[0]        + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4);
1617
                src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4);
1618
                src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4);
1619
                src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
1620 3b58b885 Michael Niedermayer
                src++;
1621
        }
1622
#endif
1623
}
1624
1625
/**
1626 b304569a Michael Niedermayer
 * Deinterlaces the given block by filtering every second line with a (-1 4 2 4 -1) filter.
1627 7fb36f6c Michael Niedermayer
 * will be called for every 8x8 block and can read & write from line 4-15
1628
 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1629
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1630 9c9e467d Michael Niedermayer
 * this filter will read lines 4-13 and write 5-11
1631
 */
1632
static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
1633
{
1634
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1635
        src+= stride*4;
1636
        asm volatile(
1637
                "leal (%0, %1), %%eax                                \n\t"
1638
                "leal (%%eax, %1, 4), %%edx                        \n\t"
1639
                "pxor %%mm7, %%mm7                                \n\t"
1640
                "movq (%2), %%mm0                                \n\t"
1641
//        0        1        2        3        4        5        6        7        8        9        10
1642
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1 ecx
1643
1644
#define DEINT_FF(a,b,c,d)\
1645
                "movq " #a ", %%mm1                                \n\t"\
1646
                "movq " #b ", %%mm2                                \n\t"\
1647
                "movq " #c ", %%mm3                                \n\t"\
1648
                "movq " #d ", %%mm4                                \n\t"\
1649
                PAVGB(%%mm3, %%mm1)                                        \
1650
                PAVGB(%%mm4, %%mm0)                                        \
1651
                "movq %%mm0, %%mm3                                \n\t"\
1652
                "punpcklbw %%mm7, %%mm0                                \n\t"\
1653
                "punpckhbw %%mm7, %%mm3                                \n\t"\
1654
                "movq %%mm1, %%mm4                                \n\t"\
1655
                "punpcklbw %%mm7, %%mm1                                \n\t"\
1656
                "punpckhbw %%mm7, %%mm4                                \n\t"\
1657
                "psllw $2, %%mm1                                \n\t"\
1658
                "psllw $2, %%mm4                                \n\t"\
1659
                "psubw %%mm0, %%mm1                                \n\t"\
1660
                "psubw %%mm3, %%mm4                                \n\t"\
1661
                "movq %%mm2, %%mm5                                \n\t"\
1662
                "movq %%mm2, %%mm0                                \n\t"\
1663
                "punpcklbw %%mm7, %%mm2                                \n\t"\
1664
                "punpckhbw %%mm7, %%mm5                                \n\t"\
1665
                "paddw %%mm2, %%mm1                                \n\t"\
1666
                "paddw %%mm5, %%mm4                                \n\t"\
1667
                "psraw $2, %%mm1                                \n\t"\
1668
                "psraw $2, %%mm4                                \n\t"\
1669
                "packuswb %%mm4, %%mm1                                \n\t"\
1670
                "movq %%mm1, " #b "                                \n\t"\
1671
1672
DEINT_FF((%0)       , (%%eax)       , (%%eax, %1), (%%eax, %1, 2))
1673
DEINT_FF((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx)       )
1674
DEINT_FF((%0, %1, 4), (%%edx)       , (%%edx, %1), (%%edx, %1, 2))
1675
DEINT_FF((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%edx, %1, 4))
1676
1677
                "movq %%mm0, (%2)                                \n\t"
1678
                : : "r" (src), "r" (stride), "r"(tmp)
1679
                : "%eax", "%edx"
1680
        );
1681
#else
1682
        int x;
1683
        src+= stride*4;
1684
        for(x=0; x<8; x++)
1685
        {
1686
                int t1= tmp[x];
1687
                int t2= src[stride*1];
1688
1689 134eb1e5 Michael Niedermayer
                src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3);
1690 9c9e467d Michael Niedermayer
                t1= src[stride*4];
1691 134eb1e5 Michael Niedermayer
                src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3);
1692 9c9e467d Michael Niedermayer
                t2= src[stride*6];
1693 134eb1e5 Michael Niedermayer
                src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3);
1694 9c9e467d Michael Niedermayer
                t1= src[stride*8];
1695 134eb1e5 Michael Niedermayer
                src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3);
1696 9c9e467d Michael Niedermayer
                tmp[x]= t1;
1697
1698
                src++;
1699
        }
1700
#endif
1701
}
1702
1703
/**
1704 134eb1e5 Michael Niedermayer
 * Deinterlaces the given block by filtering every line with a (-1 2 6 2 -1) filter.
1705
 * will be called for every 8x8 block and can read & write from line 4-15
1706
 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1707
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1708
 * this filter will read lines 4-13 and write 4-11
1709
 */
1710
static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
1711
{
1712
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1713
        src+= stride*4;
1714
        asm volatile(
1715
                "leal (%0, %1), %%eax                                \n\t"
1716
                "leal (%%eax, %1, 4), %%edx                        \n\t"
1717
                "pxor %%mm7, %%mm7                                \n\t"
1718
                "movq (%2), %%mm0                                \n\t"
1719
                "movq (%3), %%mm1                                \n\t"
1720
//        0        1        2        3        4        5        6        7        8        9        10
1721
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1 ecx
1722
1723
#define DEINT_L5(t1,t2,a,b,c)\
1724
                "movq " #a ", %%mm2                                \n\t"\
1725
                "movq " #b ", %%mm3                                \n\t"\
1726
                "movq " #c ", %%mm4                                \n\t"\
1727
                PAVGB(t2, %%mm3)                                        \
1728
                PAVGB(t1, %%mm4)                                        \
1729
                "movq %%mm2, %%mm5                                \n\t"\
1730
                "movq %%mm2, " #t1 "                                \n\t"\
1731
                "punpcklbw %%mm7, %%mm2                                \n\t"\
1732
                "punpckhbw %%mm7, %%mm5                                \n\t"\
1733
                "movq %%mm2, %%mm6                                \n\t"\
1734
                "paddw %%mm2, %%mm2                                \n\t"\
1735
                "paddw %%mm6, %%mm2                                \n\t"\
1736
                "movq %%mm5, %%mm6                                \n\t"\
1737
                "paddw %%mm5, %%mm5                                \n\t"\
1738
                "paddw %%mm6, %%mm5                                \n\t"\
1739
                "movq %%mm3, %%mm6                                \n\t"\
1740
                "punpcklbw %%mm7, %%mm3                                \n\t"\
1741
                "punpckhbw %%mm7, %%mm6                                \n\t"\
1742
                "paddw %%mm3, %%mm3                                \n\t"\
1743
                "paddw %%mm6, %%mm6                                \n\t"\
1744
                "paddw %%mm3, %%mm2                                \n\t"\
1745
                "paddw %%mm6, %%mm5                                \n\t"\
1746
                "movq %%mm4, %%mm6                                \n\t"\
1747
                "punpcklbw %%mm7, %%mm4                                \n\t"\
1748
                "punpckhbw %%mm7, %%mm6                                \n\t"\
1749
                "psubw %%mm4, %%mm2                                \n\t"\
1750
                "psubw %%mm6, %%mm5                                \n\t"\
1751
                "psraw $2, %%mm2                                \n\t"\
1752
                "psraw $2, %%mm5                                \n\t"\
1753
                "packuswb %%mm5, %%mm2                                \n\t"\
1754
                "movq %%mm2, " #a "                                \n\t"\
1755
1756
DEINT_L5(%%mm0, %%mm1, (%0)          , (%%eax)       , (%%eax, %1)   )
1757
DEINT_L5(%%mm1, %%mm0, (%%eax)       , (%%eax, %1)   , (%%eax, %1, 2))
1758
DEINT_L5(%%mm0, %%mm1, (%%eax, %1)   , (%%eax, %1, 2), (%0, %1, 4)   )
1759
DEINT_L5(%%mm1, %%mm0, (%%eax, %1, 2), (%0, %1, 4)   , (%%edx)       )
1760
DEINT_L5(%%mm0, %%mm1, (%0, %1, 4)   , (%%edx)       , (%%edx, %1)   )  
1761
DEINT_L5(%%mm1, %%mm0, (%%edx)       , (%%edx, %1)   , (%%edx, %1, 2))
1762
DEINT_L5(%%mm0, %%mm1, (%%edx, %1)   , (%%edx, %1, 2), (%0, %1, 8)   )
1763
DEINT_L5(%%mm1, %%mm0, (%%edx, %1, 2), (%0, %1, 8)   , (%%edx, %1, 4))
1764
1765
                "movq %%mm0, (%2)                                \n\t"
1766
                "movq %%mm1, (%3)                                \n\t"
1767
                : : "r" (src), "r" (stride), "r"(tmp), "r"(tmp2)
1768
                : "%eax", "%edx"
1769
        );
1770
#else
1771
        int x;
1772
        src+= stride*4;
1773
        for(x=0; x<8; x++)
1774
        {
1775
                int t1= tmp[x];
1776
                int t2= tmp2[x];
1777
                int t3= src[0];
1778
1779
                src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3);
1780
                t1= src[stride*1];
1781
                src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3);
1782
                t2= src[stride*2];
1783
                src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3);
1784
                t3= src[stride*3];
1785
                src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3);
1786
                t1= src[stride*4];
1787
                src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3);
1788
                t2= src[stride*5];
1789
                src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3);
1790
                t3= src[stride*6];
1791
                src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3);
1792
                t1= src[stride*7];
1793
                src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3);
1794
1795
                tmp[x]= t3;
1796
                tmp2[x]= t1;
1797
1798
                src++;
1799
        }
1800
#endif
1801
}
1802
1803
/**
1804 b304569a Michael Niedermayer
 * Deinterlaces the given block by filtering all lines with a (1 2 1) filter.
1805 9c9e467d Michael Niedermayer
 * will be called for every 8x8 block and can read & write from line 4-15
1806
 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1807
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1808 7fb36f6c Michael Niedermayer
 * this filter will read lines 4-13 and write 4-11
1809 3b58b885 Michael Niedermayer
 */
1810 13ba9ae4 Michael Niedermayer
static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
1811 3b58b885 Michael Niedermayer
{
1812
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1813 7fb36f6c Michael Niedermayer
        src+= 4*stride;
1814 3b58b885 Michael Niedermayer
        asm volatile(
1815
                "leal (%0, %1), %%eax                                \n\t"
1816 9c9e467d Michael Niedermayer
                "leal (%%eax, %1, 4), %%edx                        \n\t"
1817 3b58b885 Michael Niedermayer
//        0        1        2        3        4        5        6        7        8        9
1818 9c9e467d Michael Niedermayer
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1
1819 3b58b885 Michael Niedermayer
1820 13ba9ae4 Michael Niedermayer
                "movq (%2), %%mm0                                \n\t" // L0
1821
                "movq (%%eax), %%mm1                                \n\t" // L2
1822 3b58b885 Michael Niedermayer
                PAVGB(%%mm1, %%mm0)                                      // L0+L2
1823 13ba9ae4 Michael Niedermayer
                "movq (%0), %%mm2                                \n\t" // L1
1824 3b58b885 Michael Niedermayer
                PAVGB(%%mm2, %%mm0)
1825
                "movq %%mm0, (%0)                                \n\t"
1826 13ba9ae4 Michael Niedermayer
                "movq (%%eax, %1), %%mm0                        \n\t" // L3
1827 3b58b885 Michael Niedermayer
                PAVGB(%%mm0, %%mm2)                                      // L1+L3
1828
                PAVGB(%%mm1, %%mm2)                                      // 2L2 + L1 + L3
1829
                "movq %%mm2, (%%eax)                                \n\t"
1830 13ba9ae4 Michael Niedermayer
                "movq (%%eax, %1, 2), %%mm2                        \n\t" // L4
1831 3b58b885 Michael Niedermayer
                PAVGB(%%mm2, %%mm1)                                      // L2+L4
1832
                PAVGB(%%mm0, %%mm1)                                      // 2L3 + L2 + L4
1833
                "movq %%mm1, (%%eax, %1)                        \n\t"
1834 13ba9ae4 Michael Niedermayer
                "movq (%0, %1, 4), %%mm1                        \n\t" // L5
1835 3b58b885 Michael Niedermayer
                PAVGB(%%mm1, %%mm0)                                      // L3+L5
1836
                PAVGB(%%mm2, %%mm0)                                      // 2L4 + L3 + L5
1837
                "movq %%mm0, (%%eax, %1, 2)                        \n\t"
1838 13ba9ae4 Michael Niedermayer
                "movq (%%edx), %%mm0                                \n\t" // L6
1839 3b58b885 Michael Niedermayer
                PAVGB(%%mm0, %%mm2)                                      // L4+L6
1840
                PAVGB(%%mm1, %%mm2)                                      // 2L5 + L4 + L6
1841
                "movq %%mm2, (%0, %1, 4)                        \n\t"
1842 13ba9ae4 Michael Niedermayer
                "movq (%%edx, %1), %%mm2                        \n\t" // L7
1843 3b58b885 Michael Niedermayer
                PAVGB(%%mm2, %%mm1)                                      // L5+L7
1844
                PAVGB(%%mm0, %%mm1)                                      // 2L6 + L5 + L7
1845 9c9e467d Michael Niedermayer
                "movq %%mm1, (%%edx)                                \n\t"
1846 13ba9ae4 Michael Niedermayer
                "movq (%%edx, %1, 2), %%mm1                        \n\t" // L8
1847 3b58b885 Michael Niedermayer
                PAVGB(%%mm1, %%mm0)                                      // L6+L8
1848
                PAVGB(%%mm2, %%mm0)                                      // 2L7 + L6 + L8
1849 9c9e467d Michael Niedermayer
                "movq %%mm0, (%%edx, %1)                        \n\t"
1850 13ba9ae4 Michael Niedermayer
                "movq (%0, %1, 8), %%mm0                        \n\t" // L9
1851 3b58b885 Michael Niedermayer
                PAVGB(%%mm0, %%mm2)                                      // L7+L9
1852
                PAVGB(%%mm1, %%mm2)                                      // 2L8 + L7 + L9
1853 9c9e467d Michael Niedermayer
                "movq %%mm2, (%%edx, %1, 2)                        \n\t"
1854 13ba9ae4 Michael Niedermayer
                "movq %%mm1, (%2)                                \n\t"
1855 3b58b885 Michael Niedermayer
1856 13ba9ae4 Michael Niedermayer
                : : "r" (src), "r" (stride), "r" (tmp)
1857 9c9e467d Michael Niedermayer
                : "%eax", "%edx"
1858 3b58b885 Michael Niedermayer
        );
1859
#else
1860 99d33fa3 Michael Niedermayer
        int a, b, c, x;
1861 7fb36f6c Michael Niedermayer
        src+= 4*stride;
1862 99d33fa3 Michael Niedermayer
1863
        for(x=0; x<2; x++){
1864 13ba9ae4 Michael Niedermayer
                a= *(uint32_t*)&tmp[stride*0];
1865
                b= *(uint32_t*)&src[stride*0];
1866
                c= *(uint32_t*)&src[stride*1];
1867 99d33fa3 Michael Niedermayer
                a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1868
                *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1869
1870 13ba9ae4 Michael Niedermayer
                a= *(uint32_t*)&src[stride*2];
1871 99d33fa3 Michael Niedermayer
                b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1872
                *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1873
1874 13ba9ae4 Michael Niedermayer
                b= *(uint32_t*)&src[stride*3];
1875 99d33fa3 Michael Niedermayer
                c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1876
                *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1877
1878 13ba9ae4 Michael Niedermayer
                c= *(uint32_t*)&src[stride*4];
1879 99d33fa3 Michael Niedermayer
                a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1880
                *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1881
1882 13ba9ae4 Michael Niedermayer
                a= *(uint32_t*)&src[stride*5];
1883 99d33fa3 Michael Niedermayer
                b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1884
                *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1885
1886 13ba9ae4 Michael Niedermayer
                b= *(uint32_t*)&src[stride*6];
1887 99d33fa3 Michael Niedermayer
                c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1888
                *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1889
1890 13ba9ae4 Michael Niedermayer
                c= *(uint32_t*)&src[stride*7];
1891 99d33fa3 Michael Niedermayer
                a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1892
                *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1893
1894 13ba9ae4 Michael Niedermayer
                a= *(uint32_t*)&src[stride*8];
1895 99d33fa3 Michael Niedermayer
                b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1896
                *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1897
1898 13ba9ae4 Michael Niedermayer
                *(uint32_t*)&tmp[stride*0]= c;
1899 99d33fa3 Michael Niedermayer
                src += 4;
1900 13ba9ae4 Michael Niedermayer
                tmp += 4;
1901 3b58b885 Michael Niedermayer
        }
1902
#endif
1903
}
1904
1905
/**
 * Deinterlaces the given block by applying a median filter to every second line.
 * Will be called for every 8x8 block and can read & write from line 4-15;
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too;
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced.
 *
 * After skipping the first 4 lines, every odd line k (k = 1, 3, 5, 7 relative
 * to the new src) is replaced, byte by byte, with the median of lines
 * k-1, k and k+1. Even lines are only read.
 *
 * @param src    pointer to the top-left byte of the block (line 0)
 * @param stride distance in bytes between two consecutive lines
 */
static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
{
#ifdef HAVE_MMX
        src+= 4*stride;
#ifdef HAVE_MMX2
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%edx                     \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1

                // line 1 = median(line 0, line 1, line 2)
                "movq (%0), %%mm0                               \n\t" // L0
                "movq (%%eax, %1), %%mm2                        \n\t" // L2
                "movq (%%eax), %%mm1                            \n\t" // L1
                "movq %%mm0, %%mm3                              \n\t"
                "pmaxub %%mm1, %%mm0                            \n\t" // max(L0, L1)
                "pminub %%mm3, %%mm1                            \n\t" // min(L0, L1)
                "pmaxub %%mm2, %%mm1                            \n\t" // max(L2, min(L0, L1))
                "pminub %%mm1, %%mm0                            \n\t" // median
                "movq %%mm0, (%%eax)                            \n\t"

                // line 3 = median(line 2, line 3, line 4); mm2 still holds L2
                "movq (%0, %1, 4), %%mm0                        \n\t" // L4
                "movq (%%eax, %1, 2), %%mm1                     \n\t" // L3
                "movq %%mm2, %%mm3                              \n\t"
                "pmaxub %%mm1, %%mm2                            \n\t" // max(L2, L3)
                "pminub %%mm3, %%mm1                            \n\t" // min(L2, L3)
                "pmaxub %%mm0, %%mm1                            \n\t" // max(L4, min(L2, L3))
                "pminub %%mm1, %%mm2                            \n\t" // median
                "movq %%mm2, (%%eax, %1, 2)                     \n\t"

                // line 5 = median(line 4, line 5, line 6); mm0 still holds L4
                "movq (%%edx), %%mm2                            \n\t" // L5
                "movq (%%edx, %1), %%mm1                        \n\t" // L6
                "movq %%mm2, %%mm3                              \n\t"
                "pmaxub %%mm0, %%mm2                            \n\t" // max(L4, L5)
                "pminub %%mm3, %%mm0                            \n\t" // min(L4, L5)
                "pmaxub %%mm1, %%mm0                            \n\t" // max(L6, min(L4, L5))
                "pminub %%mm0, %%mm2                            \n\t" // median
                "movq %%mm2, (%%edx)                            \n\t"

                // line 7 = median(line 6, line 7, line 8); mm1 still holds L6
                "movq (%%edx, %1, 2), %%mm2                     \n\t" // L7
                "movq (%0, %1, 8), %%mm0                        \n\t" // L8
                "movq %%mm2, %%mm3                              \n\t"
                "pmaxub %%mm0, %%mm2                            \n\t" // max(L7, L8)
                "pminub %%mm3, %%mm0                            \n\t" // min(L7, L8)
                "pmaxub %%mm1, %%mm0                            \n\t" // max(L6, min(L7, L8))
                "pminub %%mm0, %%mm2                            \n\t" // median
                "movq %%mm2, (%%edx, %1, 2)                     \n\t"

                : : "r" (src), "r" (stride)
                // "memory": the asm stores through src, which is not an output operand
                : "%eax", "%edx", "memory"
        );

#else // MMX without MMX2
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%edx                     \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
                "pxor %%mm7, %%mm7                              \n\t" // mm7 = 0, used as compare operand

// MEDIAN(a,b,c): loads lines a, b, c and stores a result back to line b.
// The three unsigned-saturated differences are compared against zero to build
// per-byte ordering masks; each input is OR-ed with the XOR of two of them and
// the three results are AND-ed together.
#define MEDIAN(a,b,c)\
                "movq " #a ", %%mm0                             \n\t"\
                "movq " #b ", %%mm2                             \n\t"\
                "movq " #c ", %%mm1                             \n\t"\
                "movq %%mm0, %%mm3                              \n\t"\
                "movq %%mm1, %%mm4                              \n\t"\
                "movq %%mm2, %%mm5                              \n\t"\
                "psubusb %%mm1, %%mm3                           \n\t"\
                "psubusb %%mm2, %%mm4                           \n\t"\
                "psubusb %%mm0, %%mm5                           \n\t"\
                "pcmpeqb %%mm7, %%mm3                           \n\t"\
                "pcmpeqb %%mm7, %%mm4                           \n\t"\
                "pcmpeqb %%mm7, %%mm5                           \n\t"\
                "movq %%mm3, %%mm6                              \n\t"\
                "pxor %%mm4, %%mm3                              \n\t"\
                "pxor %%mm5, %%mm4                              \n\t"\
                "pxor %%mm6, %%mm5                              \n\t"\
                "por %%mm3, %%mm1                               \n\t"\
                "por %%mm4, %%mm2                               \n\t"\
                "por %%mm5, %%mm0                               \n\t"\
                "pand %%mm2, %%mm0                              \n\t"\
                "pand %%mm1, %%mm0                              \n\t"\
                "movq %%mm0, " #b "                             \n\t"

MEDIAN((%0), (%%eax), (%%eax, %1))
MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
MEDIAN((%0, %1, 4), (%%edx), (%%edx, %1))
MEDIAN((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8))

                : : "r" (src), "r" (stride)
                // "memory": the asm stores through src, which is not an output operand
                : "%eax", "%edx", "memory"
        );
#endif // MMX
#else
        int x, y;
        src+= 4*stride;
        // FIXME - there should be a way to do a few columns in parallel like w/mmx
        for(x=0; x<8; x++)
        {
                uint8_t *colsrc = src;
                for (y=0; y<4; y++)
                {
                        int a, b, c, d, e, f;
                        a = colsrc[0       ];
                        b = colsrc[stride  ];
                        c = colsrc[stride*2];
                        // all-ones (-1) when the comparison holds, 0 otherwise;
                        // written without shifting a negative value
                        d = -(a < b);
                        e = -(b < c);
                        f = -(c < a);
                        // a term stays unmasked only when its two masks agree,
                        // which is exactly when that value lies between the other
                        // two; the other terms become all-ones and drop out of the AND
                        colsrc[stride  ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
                        colsrc += stride*2;
                }
                src++;
        }
#endif
}
2028
2029 e5c30e06 Michael Niedermayer
#ifdef HAVE_MMX
2030 4e4dcbc5 Michael Niedermayer
/**
2031
 * transposes and shift the given 8x8 Block into dst1 and dst2
2032
 */
2033 cc9b0679 Michael Niedermayer
static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
2034 4e4dcbc5 Michael Niedermayer
{
2035
        asm(
2036
                "leal (%0, %1), %%eax                                \n\t"
2037
//        0        1        2        3        4        5        6        7        8        9
2038 9c9e467d Michael Niedermayer
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1
2039 4e4dcbc5 Michael Niedermayer
                "movq (%0), %%mm0                \n\t" // 12345678
2040
                "movq (%%eax), %%mm1                \n\t" // abcdefgh
2041
                "movq %%mm0, %%mm2                \n\t" // 12345678
2042
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
2043
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
2044
2045
                "movq (%%eax, %1), %%mm1        \n\t"
2046
                "movq (%%eax, %1, 2), %%mm3        \n\t"
2047
                "movq %%mm1, %%mm4                \n\t"
2048
                "punpcklbw %%mm3, %%mm1                \n\t"
2049
                "punpckhbw %%mm3, %%mm4                \n\t"
2050
2051
                "movq %%mm0, %%mm3                \n\t"
2052
                "punpcklwd %%mm1, %%mm0                \n\t"
2053
                "punpckhwd %%mm1, %%mm3                \n\t"
2054
                "movq %%mm2, %%mm1                \n\t"
2055
                "punpcklwd %%mm4, %%mm2                \n\t"
2056
                "punpckhwd %%mm4, %%mm1                \n\t"
2057
2058
                "movd %%mm0, 128(%2)                \n\t"
2059
                "psrlq $32, %%mm0                \n\t"
2060
                "movd %%mm0, 144(%2)                \n\t"
2061
                "movd %%mm3, 160(%2)                \n\t"
2062
                "psrlq $32, %%mm3                \n\t"
2063
                "movd %%mm3, 176(%2)                \n\t"
2064
                "movd %%mm3, 48(%3)                \n\t"
2065
                "movd %%mm2, 192(%2)                \n\t"
2066
                "movd %%mm2, 64(%3)                \n\t"
2067
                "psrlq $32, %%mm2                \n\t"
2068
                "movd %%mm2, 80(%3)                \n\t"
2069
                "movd %%mm1, 96(%3)                \n\t"
2070
                "psrlq $32, %%mm1                \n\t"
2071
                "movd %%mm1, 112(%3)                \n\t"
2072
2073 abd140db Michael Niedermayer
                "leal (%%eax, %1, 4), %%eax        \n\t"
2074
                
2075 4e4dcbc5 Michael Niedermayer
                "movq (%0, %1, 4), %%mm0        \n\t" // 12345678
2076 abd140db Michael Niedermayer
                "movq (%%eax), %%mm1                \n\t" // abcdefgh
2077 4e4dcbc5 Michael Niedermayer
                "movq %%mm0, %%mm2                \n\t" // 12345678
2078
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
2079
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
2080
2081 abd140db Michael Niedermayer
                "movq (%%eax, %1), %%mm1        \n\t"
2082
                "movq (%%eax, %1, 2), %%mm3        \n\t"
2083 4e4dcbc5 Michael Niedermayer
                "movq %%mm1, %%mm4                \n\t"
2084
                "punpcklbw %%mm3, %%mm1                \n\t"
2085
                "punpckhbw %%mm3, %%mm4                \n\t"
2086
2087
                "movq %%mm0, %%mm3                \n\t"
2088
                "punpcklwd %%mm1, %%mm0                \n\t"
2089
                "punpckhwd %%mm1, %%mm3                \n\t"
2090
                "movq %%mm2, %%mm1                \n\t"
2091
                "punpcklwd %%mm4, %%mm2                \n\t"
2092
                "punpckhwd %%mm4, %%mm1                \n\t"
2093
2094
                "movd %%mm0, 132(%2)                \n\t"
2095
                "psrlq $32, %%mm0                \n\t"
2096
                "movd %%mm0, 148(%2)                \n\t"
2097
                "movd %%mm3, 164(%2)                \n\t"
2098
                "psrlq $32, %%mm3                \n\t"
2099
                "movd %%mm3, 180(%2)                \n\t"
2100
                "movd %%mm3, 52(%3)                \n\t"
2101
                "movd %%mm2, 196(%2)                \n\t"
2102
                "movd %%mm2, 68(%3)                \n\t"
2103
                "psrlq $32, %%mm2                \n\t"
2104
                "movd %%mm2, 84(%3)                \n\t"
2105
                "movd %%mm1, 100(%3)                \n\t"
2106
                "psrlq $32, %%mm1                \n\t"
2107
                "movd %%mm1, 116(%3)                \n\t"
2108
2109
2110
        :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
2111 abd140db Michael Niedermayer
        : "%eax"
2112 4e4dcbc5 Michael Niedermayer
        );
2113
}
2114
2115
/**
2116
 * transposes the given 8x8 block
2117
 */
2118 cc9b0679 Michael Niedermayer
static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
2119 4e4dcbc5 Michael Niedermayer
{
2120
        asm(
2121
                "leal (%0, %1), %%eax                                \n\t"
2122 9c9e467d Michael Niedermayer
                "leal (%%eax, %1, 4), %%edx                        \n\t"
2123 4e4dcbc5 Michael Niedermayer
//        0        1        2        3        4        5        6        7        8        9
2124 9c9e467d Michael Niedermayer
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1
2125 4e4dcbc5 Michael Niedermayer
                "movq (%2), %%mm0                \n\t" // 12345678
2126
                "movq 16(%2), %%mm1                \n\t" // abcdefgh
2127
                "movq %%mm0, %%mm2                \n\t" // 12345678
2128
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
2129
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
2130
2131
                "movq 32(%2), %%mm1                \n\t"
2132
                "movq 48(%2), %%mm3                \n\t"
2133
                "movq %%mm1, %%mm4                \n\t"
2134
                "punpcklbw %%mm3, %%mm1                \n\t"
2135
                "punpckhbw %%mm3, %%mm4                \n\t"
2136
2137
                "movq %%mm0, %%mm3                \n\t"
2138
                "punpcklwd %%mm1, %%mm0                \n\t"
2139
                "punpckhwd %%mm1, %%mm3                \n\t"
2140
                "movq %%mm2, %%mm1                \n\t"
2141
                "punpcklwd %%mm4, %%mm2                \n\t"
2142
                "punpckhwd %%mm4, %%mm1                \n\t"
2143
2144
                "movd %%mm0, (%0)                \n\t"
2145
                "psrlq $32, %%mm0                \n\t"
2146
                "movd %%mm0, (%%eax)                \n\t"
2147
                "movd %%mm3, (%%eax, %1)        \n\t"
2148
                "psrlq $32, %%mm3                \n\t"
2149
                "movd %%mm3, (%%eax, %1, 2)        \n\t"
2150
                "movd %%mm2, (%0, %1, 4)        \n\t"
2151
                "psrlq $32, %%mm2                \n\t"
2152 9c9e467d Michael Niedermayer
                "movd %%mm2, (%%edx)                \n\t"
2153
                "movd %%mm1, (%%edx, %1)        \n\t"
2154 4e4dcbc5 Michael Niedermayer
                "psrlq $32, %%mm1                \n\t"
2155 9c9e467d Michael Niedermayer
                "movd %%mm1, (%%edx, %1, 2)        \n\t"
2156 4e4dcbc5 Michael Niedermayer
2157
2158
                "movq 64(%2), %%mm0                \n\t" // 12345678
2159
                "movq 80(%2), %%mm1                \n\t" // abcdefgh
2160
                "movq %%mm0, %%mm2                \n\t" // 12345678
2161
                "punpcklbw %%mm1, %%mm0                \n\t" // 1a2b3c4d
2162
                "punpckhbw %%mm1, %%mm2                \n\t" // 5e6f7g8h
2163
2164
                "movq 96(%2), %%mm1                \n\t"
2165
                "movq 112(%2), %%mm3                \n\t"
2166
                "movq %%mm1, %%mm4                \n\t"
2167
                "punpcklbw %%mm3, %%mm1                \n\t"
2168
                "punpckhbw %%mm3, %%mm4                \n\t"
2169
2170
                "movq %%mm0, %%mm3                \n\t"
2171
                "punpcklwd %%mm1, %%mm0                \n\t"
2172
                "punpckhwd %%mm1, %%mm3                \n\t"
2173
                "movq %%mm2, %%mm1                \n\t"
2174
                "punpcklwd %%mm4, %%mm2                \n\t"
2175
                "punpckhwd %%mm4, %%mm1                \n\t"
2176
2177
                "movd %%mm0, 4(%0)                \n\t"
2178
                "psrlq $32, %%mm0                \n\t"
2179
                "movd %%mm0, 4(%%eax)                \n\t"
2180
                "movd %%mm3, 4(%%eax, %1)        \n\t"
2181
                "psrlq $32, %%mm3                \n\t"
2182
                "movd %%mm3, 4(%%eax, %1, 2)        \n\t"
2183
                "movd %%mm2, 4(%0, %1, 4)        \n\t"
2184
                "psrlq $32, %%mm2                \n\t"
2185 9c9e467d Michael Niedermayer
                "movd %%mm2, 4(%%edx)                \n\t"
2186
                "movd %%mm1, 4(%%edx, %1)        \n\t"
2187 4e4dcbc5 Michael Niedermayer
                "psrlq $32, %%mm1                \n\t"
2188 9c9e467d Michael Niedermayer
                "movd %%mm1, 4(%%edx, %1, 2)        \n\t"
2189 4e4dcbc5 Michael Niedermayer
2190
        :: "r" (dst), "r" (dstStride), "r" (src)
2191 9c9e467d Michael Niedermayer
        : "%eax", "%edx"
2192 4e4dcbc5 Michael Niedermayer
        );
2193
}
2194 e5c30e06 Michael Niedermayer
#endif
2195 be44a4d7 Michael Niedermayer
//static int test=0;
2196 4e4dcbc5 Michael Niedermayer
2197 a2596758 Michael Niedermayer
static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
2198 a9c77978 Michael Niedermayer
                                    uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
2199 117e45b0 Michael Niedermayer
{
2200 9c9e467d Michael Niedermayer
        // to save a register (FIXME do this outside of the loops)
2201
        tempBluredPast[127]= maxNoise[0];
2202
        tempBluredPast[128]= maxNoise[1];
2203
        tempBluredPast[129]= maxNoise[2];
2204
        
2205 be44a4d7 Michael Niedermayer
#define FAST_L2_DIFF
2206
//#define L1_DIFF //u should change the thresholds too if u try that one
2207
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2208
        asm volatile(
2209
                "leal (%2, %2, 2), %%eax                        \n\t" // 3*stride
2210 9c9e467d Michael Niedermayer
                "leal (%2, %2, 4), %%edx                        \n\t" // 5*stride
2211
                "leal (%%edx, %2, 2), %%ecx                        \n\t" // 7*stride
2212 be44a4d7 Michael Niedermayer
//        0        1        2        3        4        5        6        7        8        9
2213 9c9e467d Michael Niedermayer
//        %x        %x+%2        %x+2%2        %x+eax        %x+4%2        %x+edx        %x+2eax        %x+ecx        %x+8%2
2214 be44a4d7 Michael Niedermayer
//FIXME reorder?
2215
#ifdef L1_DIFF //needs mmx2
2216
                "movq (%0), %%mm0                                \n\t" // L0
2217
                "psadbw (%1), %%mm0                                \n\t" // |L0-R0|
2218
                "movq (%0, %2), %%mm1                                \n\t" // L1
2219
                "psadbw (%1, %2), %%mm1                                \n\t" // |L1-R1|
2220
                "movq (%0, %2, 2), %%mm2                        \n\t" // L2
2221
                "psadbw (%1, %2, 2), %%mm2                        \n\t" // |L2-R2|
2222
                "movq (%0, %%eax), %%mm3                        \n\t" // L3
2223
                "psadbw (%1, %%eax), %%mm3                        \n\t" // |L3-R3|
2224
2225
                "movq (%0, %2, 4), %%mm4                        \n\t" // L4
2226
                "paddw %%mm1, %%mm0                                \n\t"
2227
                "psadbw (%1, %2, 4), %%mm4                        \n\t" // |L4-R4|
2228 9c9e467d Michael Niedermayer
                "movq (%0, %%edx), %%mm5                        \n\t" // L5
2229 be44a4d7 Michael Niedermayer
                "paddw %%mm2, %%mm0                                \n\t"
2230 9c9e467d Michael Niedermayer
                "psadbw (%1, %%edx), %%mm5                        \n\t" // |L5-R5|
2231 be44a4d7 Michael Niedermayer
                "movq (%0, %%eax, 2), %%mm6                        \n\t" // L6
2232
                "paddw %%mm3, %%mm0                                \n\t"
2233
                "psadbw (%1, %%eax, 2), %%mm6                        \n\t" // |L6-R6|
2234
                "movq (%0, %%ecx), %%mm7                        \n\t" // L7
2235
                "paddw %%mm4, %%mm0                                \n\t"
2236
                "psadbw (%1, %%ecx), %%mm7                        \n\t" // |L7-R7|
2237
                "paddw %%mm5, %%mm6                                \n\t"
2238
                "paddw %%mm7, %%mm6                                \n\t"
2239
                "paddw %%mm6, %%mm0                                \n\t"
2240
#elif defined (FAST_L2_DIFF)
2241
                "pcmpeqb %%mm7, %%mm7                                \n\t"
2242 9b464428 Felix Bünemann
                "movq "MANGLE(b80)", %%mm6                        \n\t"
2243 be44a4d7 Michael Niedermayer
                "pxor %%mm0, %%mm0                                \n\t"
2244
#define L2_DIFF_CORE(a, b)\
2245
                "movq " #a ", %%mm5                                \n\t"\
2246
                "movq " #b ", %%mm2                                \n\t"\
2247
                "pxor %%mm7, %%mm2                                \n\t"\
2248
                PAVGB(%%mm2, %%mm5)\
2249
                "paddb %%mm6, %%mm5                                \n\t"\
2250
                "movq %%mm5, %%mm2                                \n\t"\
2251
                "psllw $8, %%mm5                                \n\t"\
2252
                "pmaddwd %%mm5, %%mm5                                \n\t"\
2253
                "pmaddwd %%mm2, %%mm2                                \n\t"\
2254
                "paddd %%mm2, %%mm5                                \n\t"\
2255
                "psrld $14, %%mm5                                \n\t"\
2256
                "paddd %%mm5, %%mm0                                \n\t"
2257
2258
L2_DIFF_CORE((%0), (%1))
2259
L2_DIFF_CORE((%0, %2), (%1, %2))
2260
L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
2261
L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
</