Statistics
| Branch: | Revision:

ffmpeg / libpostproc / postprocess_template.c @ 1cee4eaf

History | View | Annotate | Download (161 KB)

1 3057fa66 Arpi
/*
2 b78e7197 Diego Biurrun
 * Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or modify
7
 * it under the terms of the GNU General Public License as published by
8
 * the Free Software Foundation; either version 2 of the License, or
9
 * (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
 * GNU General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU General Public License
17
 * along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 e5a389a1 Diego Biurrun
 */
20 3057fa66 Arpi
21 b304569a Michael Niedermayer
/**
22 bad5537e Diego Biurrun
 * @file libpostproc/postprocess_template.c
23 b304569a Michael Niedermayer
 * mmx/mmx2/3dnow postprocess code.
24
 */
25
26 245976da Diego Biurrun
#include "libavutil/x86_cpu.h"
27 b304569a Michael Niedermayer
28 6fcc9af0 Måns Rullgård
#define ALIGN_MASK "$-8"

/*
 * Helper macros that expand to MMX instruction strings for use inside the
 * __asm__ blocks below. Their bodies depend on the HAVE_* configuration, so
 * any definition left over from an earlier inclusion of this template
 * (presumably it is included once per CPU flavour, hence RENAME()) is dropped
 * first. REAL_PAVGB has to be dropped as well: otherwise it would be
 * redefined with a different body, or a stale definition would survive into
 * a configuration that defines neither MMX2 nor 3DNow.
 */
#undef REAL_PAVGB
#undef PAVGB
#undef PMINUB
#undef PMAXUB

/* PAVGB(a,b): b = rounded unsigned byte average of a and b. */
#if   HAVE_MMX2
#define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif HAVE_AMD3DNOW
#define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif
/* The extra level lets macro arguments be expanded before REAL_PAVGB
 * stringifies them (arguments are not expanded when they are operands of #). */
#define PAVGB(a,b)  REAL_PAVGB(a,b)

/* PMINUB(a,b,t): b = unsigned byte minimum of a and b.
 * The plain-MMX fallback names its parameters (b,a,t) and computes
 * a -= max(a - b, 0) using t as scratch, which is the same result. */
#if   HAVE_MMX2
#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
#elif HAVE_MMX
#define PMINUB(b,a,t) \
    "movq " #a ", " #t " \n\t"\
    "psubusb " #b ", " #t " \n\t"\
    "psubb " #t ", " #a " \n\t"
#endif

/* PMAXUB(a,b): b = unsigned byte maximum of a and b.
 * The plain-MMX fallback computes b = max(b - a, 0) + a. */
#if   HAVE_MMX2
#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
#elif HAVE_MMX
#define PMAXUB(a,b) \
    "psubusb " #a ", " #b " \n\t"\
    "paddb " #a ", " #b " \n\t"
#endif
57
58 755bfeab Diego Biurrun
//FIXME? |255-0| = 1 (should not be a problem ...)
59 b250f9c6 Aurelien Jacobs
#if HAVE_MMX
60 3057fa66 Arpi
/**
61 acced553 Michael Niedermayer
 * Check if the middle 8x8 Block in the given 8x16 block is flat
62 3057fa66 Arpi
 */
63 cb482d25 Michael Niedermayer
static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){
64 16e0bf73 Diego Biurrun
    int numEq= 0, dcOk;
65
    src+= stride*4; // src points to begin of the 8x8 Block
66 be449fca Diego Pettenò
    __asm__ volatile(
67 16e0bf73 Diego Biurrun
        "movq %0, %%mm7                         \n\t"
68
        "movq %1, %%mm6                         \n\t"
69
        : : "m" (c->mmxDcOffset[c->nonBQP]),  "m" (c->mmxDcThreshold[c->nonBQP])
70
        );
71
72 be449fca Diego Pettenò
    __asm__ volatile(
73 16e0bf73 Diego Biurrun
        "lea (%2, %3), %%"REG_a"                \n\t"
74 bb270c08 Diego Biurrun
//      0       1       2       3       4       5       6       7       8       9
75
//      %1      eax     eax+%2  eax+2%2 %1+4%2  ecx     ecx+%2  ecx+2%2 %1+8%2  ecx+4%2
76
77 16e0bf73 Diego Biurrun
        "movq (%2), %%mm0                       \n\t"
78
        "movq (%%"REG_a"), %%mm1                \n\t"
79
        "movq %%mm0, %%mm3                      \n\t"
80
        "movq %%mm0, %%mm4                      \n\t"
81
        PMAXUB(%%mm1, %%mm4)
82
        PMINUB(%%mm1, %%mm3, %%mm5)
83
        "psubb %%mm1, %%mm0                     \n\t" // mm0 = differnece
84
        "paddb %%mm7, %%mm0                     \n\t"
85
        "pcmpgtb %%mm6, %%mm0                   \n\t"
86
87
        "movq (%%"REG_a",%3), %%mm2             \n\t"
88
        PMAXUB(%%mm2, %%mm4)
89
        PMINUB(%%mm2, %%mm3, %%mm5)
90
        "psubb %%mm2, %%mm1                     \n\t"
91
        "paddb %%mm7, %%mm1                     \n\t"
92
        "pcmpgtb %%mm6, %%mm1                   \n\t"
93
        "paddb %%mm1, %%mm0                     \n\t"
94
95
        "movq (%%"REG_a", %3, 2), %%mm1         \n\t"
96
        PMAXUB(%%mm1, %%mm4)
97
        PMINUB(%%mm1, %%mm3, %%mm5)
98
        "psubb %%mm1, %%mm2                     \n\t"
99
        "paddb %%mm7, %%mm2                     \n\t"
100
        "pcmpgtb %%mm6, %%mm2                   \n\t"
101
        "paddb %%mm2, %%mm0                     \n\t"
102
103
        "lea (%%"REG_a", %3, 4), %%"REG_a"      \n\t"
104
105
        "movq (%2, %3, 4), %%mm2                \n\t"
106
        PMAXUB(%%mm2, %%mm4)
107
        PMINUB(%%mm2, %%mm3, %%mm5)
108
        "psubb %%mm2, %%mm1                     \n\t"
109
        "paddb %%mm7, %%mm1                     \n\t"
110
        "pcmpgtb %%mm6, %%mm1                   \n\t"
111
        "paddb %%mm1, %%mm0                     \n\t"
112
113
        "movq (%%"REG_a"), %%mm1                \n\t"
114
        PMAXUB(%%mm1, %%mm4)
115
        PMINUB(%%mm1, %%mm3, %%mm5)
116
        "psubb %%mm1, %%mm2                     \n\t"
117
        "paddb %%mm7, %%mm2                     \n\t"
118
        "pcmpgtb %%mm6, %%mm2                   \n\t"
119
        "paddb %%mm2, %%mm0                     \n\t"
120
121
        "movq (%%"REG_a", %3), %%mm2            \n\t"
122
        PMAXUB(%%mm2, %%mm4)
123
        PMINUB(%%mm2, %%mm3, %%mm5)
124
        "psubb %%mm2, %%mm1                     \n\t"
125
        "paddb %%mm7, %%mm1                     \n\t"
126
        "pcmpgtb %%mm6, %%mm1                   \n\t"
127
        "paddb %%mm1, %%mm0                     \n\t"
128
129
        "movq (%%"REG_a", %3, 2), %%mm1         \n\t"
130
        PMAXUB(%%mm1, %%mm4)
131
        PMINUB(%%mm1, %%mm3, %%mm5)
132
        "psubb %%mm1, %%mm2                     \n\t"
133
        "paddb %%mm7, %%mm2                     \n\t"
134
        "pcmpgtb %%mm6, %%mm2                   \n\t"
135
        "paddb %%mm2, %%mm0                     \n\t"
136
        "psubusb %%mm3, %%mm4                   \n\t"
137
138
        "                                       \n\t"
139 b250f9c6 Aurelien Jacobs
#if HAVE_MMX2
140 16e0bf73 Diego Biurrun
        "pxor %%mm7, %%mm7                      \n\t"
141
        "psadbw %%mm7, %%mm0                    \n\t"
142 cd38e322 Michael Niedermayer
#else
143 16e0bf73 Diego Biurrun
        "movq %%mm0, %%mm1                      \n\t"
144
        "psrlw $8, %%mm0                        \n\t"
145
        "paddb %%mm1, %%mm0                     \n\t"
146
        "movq %%mm0, %%mm1                      \n\t"
147
        "psrlq $16, %%mm0                       \n\t"
148
        "paddb %%mm1, %%mm0                     \n\t"
149
        "movq %%mm0, %%mm1                      \n\t"
150
        "psrlq $32, %%mm0                       \n\t"
151
        "paddb %%mm1, %%mm0                     \n\t"
152 cd38e322 Michael Niedermayer
#endif
153 16e0bf73 Diego Biurrun
        "movq %4, %%mm7                         \n\t" // QP,..., QP
154
        "paddusb %%mm7, %%mm7                   \n\t" // 2QP ... 2QP
155
        "psubusb %%mm7, %%mm4                   \n\t" // Diff <= 2QP -> 0
156
        "packssdw %%mm4, %%mm4                  \n\t"
157
        "movd %%mm0, %0                         \n\t"
158
        "movd %%mm4, %1                         \n\t"
159
160
        : "=r" (numEq), "=r" (dcOk)
161 7cebed70 Reimar Döffinger
        : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
162 16e0bf73 Diego Biurrun
        : "%"REG_a
163
        );
164
165
    numEq= (-numEq) &0xFF;
166
    if(numEq > c->ppMode.flatnessThreshold){
167
        if(dcOk) return 0;
168
        else     return 1;
169
    }else{
170
        return 2;
171
    }
172 3057fa66 Arpi
}
173 64c968da Diego Biurrun
#endif //HAVE_MMX
174 3057fa66 Arpi
175
/**
176 acced553 Michael Niedermayer
 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
177 a6be8111 Michael Niedermayer
 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
178 3057fa66 Arpi
 */
179 b250f9c6 Aurelien Jacobs
#if !HAVE_ALTIVEC
180 9c9e467d Michael Niedermayer
static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
181 3057fa66 Arpi
{
182 ebc3209a Diego Biurrun
#if HAVE_MMX2 || HAVE_AMD3DNOW
183 16e0bf73 Diego Biurrun
    src+= stride*3;
184 be449fca Diego Pettenò
    __asm__ volatile(        //"movv %0 %1 %2\n\t"
185 16e0bf73 Diego Biurrun
        "movq %2, %%mm0                         \n\t"  // QP,..., QP
186
        "pxor %%mm4, %%mm4                      \n\t"
187
188
        "movq (%0), %%mm6                       \n\t"
189
        "movq (%0, %1), %%mm5                   \n\t"
190
        "movq %%mm5, %%mm1                      \n\t"
191
        "movq %%mm6, %%mm2                      \n\t"
192
        "psubusb %%mm6, %%mm5                   \n\t"
193
        "psubusb %%mm1, %%mm2                   \n\t"
194
        "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
195
        "psubusb %%mm0, %%mm2                   \n\t" // diff <= QP -> 0
196
        "pcmpeqb %%mm4, %%mm2                   \n\t" // diff <= QP -> FF
197
198
        "pand %%mm2, %%mm6                      \n\t"
199
        "pandn %%mm1, %%mm2                     \n\t"
200
        "por %%mm2, %%mm6                       \n\t"// First Line to Filter
201
202
        "movq (%0, %1, 8), %%mm5                \n\t"
203
        "lea (%0, %1, 4), %%"REG_a"             \n\t"
204
        "lea (%0, %1, 8), %%"REG_c"             \n\t"
205
        "sub %1, %%"REG_c"                      \n\t"
206
        "add %1, %0                             \n\t" // %0 points to line 1 not 0
207
        "movq (%0, %1, 8), %%mm7                \n\t"
208
        "movq %%mm5, %%mm1                      \n\t"
209
        "movq %%mm7, %%mm2                      \n\t"
210
        "psubusb %%mm7, %%mm5                   \n\t"
211
        "psubusb %%mm1, %%mm2                   \n\t"
212
        "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
213
        "psubusb %%mm0, %%mm2                   \n\t" // diff <= QP -> 0
214
        "pcmpeqb %%mm4, %%mm2                   \n\t" // diff <= QP -> FF
215
216
        "pand %%mm2, %%mm7                      \n\t"
217
        "pandn %%mm1, %%mm2                     \n\t"
218
        "por %%mm2, %%mm7                       \n\t" // First Line to Filter
219
220
221
        //      1       2       3       4       5       6       7       8
222
        //      %0      %0+%1   %0+2%1  eax     %0+4%1  eax+2%1 ecx     eax+4%1
223
        // 6 4 2 2 1 1
224
        // 6 4 4 2
225
        // 6 8 2
226
227
        "movq (%0, %1), %%mm0                   \n\t" //  1
228
        "movq %%mm0, %%mm1                      \n\t" //  1
229
        PAVGB(%%mm6, %%mm0)                           //1 1        /2
230
        PAVGB(%%mm6, %%mm0)                           //3 1        /4
231
232
        "movq (%0, %1, 4), %%mm2                \n\t" //     1
233
        "movq %%mm2, %%mm5                      \n\t" //     1
234
        PAVGB((%%REGa), %%mm2)                        //    11        /2
235
        PAVGB((%0, %1, 2), %%mm2)                     //   211        /4
236
        "movq %%mm2, %%mm3                      \n\t" //   211        /4
237
        "movq (%0), %%mm4                       \n\t" // 1
238
        PAVGB(%%mm4, %%mm3)                           // 4 211        /8
239
        PAVGB(%%mm0, %%mm3)                           //642211        /16
240
        "movq %%mm3, (%0)                       \n\t" // X
241
        // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
242
        "movq %%mm1, %%mm0                      \n\t" //  1
243
        PAVGB(%%mm6, %%mm0)                           //1 1        /2
244
        "movq %%mm4, %%mm3                      \n\t" // 1
245
        PAVGB((%0,%1,2), %%mm3)                       // 1 1        /2
246
        PAVGB((%%REGa,%1,2), %%mm5)                   //     11        /2
247
        PAVGB((%%REGa), %%mm5)                        //    211 /4
248
        PAVGB(%%mm5, %%mm3)                           // 2 2211 /8
249
        PAVGB(%%mm0, %%mm3)                           //4242211 /16
250
        "movq %%mm3, (%0,%1)                    \n\t" //  X
251
        // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
252
        PAVGB(%%mm4, %%mm6)                                   //11        /2
253
        "movq (%%"REG_c"), %%mm0                \n\t" //       1
254
        PAVGB((%%REGa, %1, 2), %%mm0)                 //      11/2
255
        "movq %%mm0, %%mm3                      \n\t" //      11/2
256
        PAVGB(%%mm1, %%mm0)                           //  2   11/4
257
        PAVGB(%%mm6, %%mm0)                           //222   11/8
258
        PAVGB(%%mm2, %%mm0)                           //22242211/16
259
        "movq (%0, %1, 2), %%mm2                \n\t" //   1
260
        "movq %%mm0, (%0, %1, 2)                \n\t" //   X
261
        // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
262
        "movq (%%"REG_a", %1, 4), %%mm0         \n\t" //        1
263
        PAVGB((%%REGc), %%mm0)                        //       11        /2
264
        PAVGB(%%mm0, %%mm6)                           //11     11        /4
265
        PAVGB(%%mm1, %%mm4)                           // 11                /2
266
        PAVGB(%%mm2, %%mm1)                           //  11                /2
267
        PAVGB(%%mm1, %%mm6)                           //1122   11        /8
268
        PAVGB(%%mm5, %%mm6)                           //112242211        /16
269
        "movq (%%"REG_a"), %%mm5                \n\t" //    1
270
        "movq %%mm6, (%%"REG_a")                \n\t" //    X
271
        // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
272
        "movq (%%"REG_a", %1, 4), %%mm6         \n\t" //        1
273
        PAVGB(%%mm7, %%mm6)                           //        11        /2
274
        PAVGB(%%mm4, %%mm6)                           // 11     11        /4
275
        PAVGB(%%mm3, %%mm6)                           // 11   2211        /8
276
        PAVGB(%%mm5, %%mm2)                           //   11                /2
277
        "movq (%0, %1, 4), %%mm4                \n\t" //     1
278
        PAVGB(%%mm4, %%mm2)                           //   112                /4
279
        PAVGB(%%mm2, %%mm6)                           // 112242211        /16
280
        "movq %%mm6, (%0, %1, 4)                \n\t" //     X
281
        // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
282
        PAVGB(%%mm7, %%mm1)                           //  11     2        /4
283
        PAVGB(%%mm4, %%mm5)                           //    11                /2
284
        PAVGB(%%mm5, %%mm0)                           //    11 11        /4
285
        "movq (%%"REG_a", %1, 2), %%mm6         \n\t" //      1
286
        PAVGB(%%mm6, %%mm1)                           //  11  4  2        /8
287
        PAVGB(%%mm0, %%mm1)                           //  11224222        /16
288
        "movq %%mm1, (%%"REG_a", %1, 2)         \n\t" //      X
289
        // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
290
        PAVGB((%%REGc), %%mm2)                        //   112 4        /8
291
        "movq (%%"REG_a", %1, 4), %%mm0         \n\t" //        1
292
        PAVGB(%%mm0, %%mm6)                           //      1 1        /2
293
        PAVGB(%%mm7, %%mm6)                           //      1 12        /4
294
        PAVGB(%%mm2, %%mm6)                           //   1122424        /4
295
        "movq %%mm6, (%%"REG_c")                \n\t" //       X
296
        // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
297
        PAVGB(%%mm7, %%mm5)                           //    11   2        /4
298
        PAVGB(%%mm7, %%mm5)                           //    11   6        /8
299
300
        PAVGB(%%mm3, %%mm0)                           //      112        /4
301
        PAVGB(%%mm0, %%mm5)                           //    112246        /16
302
        "movq %%mm5, (%%"REG_a", %1, 4)         \n\t" //        X
303
        "sub %1, %0                             \n\t"
304
305
        :
306 7cebed70 Reimar Döffinger
        : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
307 16e0bf73 Diego Biurrun
        : "%"REG_a, "%"REG_c
308
    );
309 ebc3209a Diego Biurrun
#else //HAVE_MMX2 || HAVE_AMD3DNOW
310 16e0bf73 Diego Biurrun
    const int l1= stride;
311
    const int l2= stride + l1;
312
    const int l3= stride + l2;
313
    const int l4= stride + l3;
314
    const int l5= stride + l4;
315
    const int l6= stride + l5;
316
    const int l7= stride + l6;
317
    const int l8= stride + l7;
318
    const int l9= stride + l8;
319
    int x;
320
    src+= stride*3;
321
    for(x=0; x<BLOCK_SIZE; x++){
322
        const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
323
        const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
324
325
        int sums[10];
326
        sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
327
        sums[1] = sums[0] - first  + src[l4];
328
        sums[2] = sums[1] - first  + src[l5];
329
        sums[3] = sums[2] - first  + src[l6];
330
        sums[4] = sums[3] - first  + src[l7];
331
        sums[5] = sums[4] - src[l1] + src[l8];
332
        sums[6] = sums[5] - src[l2] + last;
333
        sums[7] = sums[6] - src[l3] + last;
334
        sums[8] = sums[7] - src[l4] + last;
335
        sums[9] = sums[8] - src[l5] + last;
336
337
        src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
338
        src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
339
        src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
340
        src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
341
        src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
342
        src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
343
        src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
344
        src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;
345
346
        src++;
347
    }
348 ebc3209a Diego Biurrun
#endif //HAVE_MMX2 || HAVE_AMD3DNOW
349 3057fa66 Arpi
}
350 b0ac780a Michael Niedermayer
#endif //HAVE_ALTIVEC
351 3057fa66 Arpi
352 9c9e467d Michael Niedermayer
#if 0
/**
 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
 * values are correctly clipped (MMX2)
 * values are wraparound (C)
 * Conclusion: It is fast, but introduces ugly horizontal patterns
 * if there is a continuous gradient.
        0 8 16 24
        x = 8
        x/2 = 4
        x/8 = 1
        1 12 12 23
 *
 * Disabled code, kept for reference only. Rows are numbered relative to
 * src + 3*stride; rows 3..6 are modified when |row5 - row4| is small enough
 * compared to QP + QP/4.
 */
static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
{
#if HAVE_MMX2 || HAVE_AMD3DNOW
    src+= stride*3;
// FIXME rounding
    __asm__ volatile(
        "pxor %%mm7, %%mm7                      \n\t" // 0
        "movq "MANGLE(b80)", %%mm6              \n\t" // MIN_SIGNED_BYTE
        // plain "lea" (not "leal") so the operand size follows the registers,
        // as in the other filters of this file
        "lea (%0, %1), %%"REG_a"                \n\t"
        "lea (%%"REG_a", %1, 4), %%"REG_c"      \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1 %0+8%1  ecx+4%1
        "movq "MANGLE(pQPb)", %%mm0             \n\t" // QP,..., QP
        "movq %%mm0, %%mm1                      \n\t" // QP,..., QP
        "paddusb "MANGLE(b02)", %%mm0           \n\t"
        "psrlw $2, %%mm0                        \n\t"
        "pand "MANGLE(b3F)", %%mm0              \n\t" // QP/4,..., QP/4
        "paddusb %%mm1, %%mm0                   \n\t" // QP*1.25 ...
        "movq (%0, %1, 4), %%mm2                \n\t" // line 4
        "movq (%%"REG_c"), %%mm3                \n\t" // line 5
        "movq %%mm2, %%mm4                      \n\t" // line 4
        "pcmpeqb %%mm5, %%mm5                   \n\t" // -1
        "pxor %%mm2, %%mm5                      \n\t" // -line 4 - 1
        PAVGB(%%mm3, %%mm5)
        "paddb %%mm6, %%mm5                     \n\t" // (l5-l4)/2
        "psubusb %%mm3, %%mm4                   \n\t"
        "psubusb %%mm2, %%mm3                   \n\t"
        "por %%mm3, %%mm4                       \n\t" // |l4 - l5|
        "psubusb %%mm0, %%mm4                   \n\t"
        "pcmpeqb %%mm7, %%mm4                   \n\t"
        "pand %%mm4, %%mm5                      \n\t" // d/2

//        "paddb %%mm6, %%mm2                     \n\t" // line 4 + 0x80
        "paddb %%mm5, %%mm2                     \n\t"
//        "psubb %%mm6, %%mm2                     \n\t"
        "movq %%mm2, (%0,%1, 4)                 \n\t"

        "movq (%%"REG_c"), %%mm2                \n\t"
//        "paddb %%mm6, %%mm2                     \n\t" // line 5 + 0x80
        "psubb %%mm5, %%mm2                     \n\t"
//        "psubb %%mm6, %%mm2                     \n\t"
        "movq %%mm2, (%%"REG_c")                \n\t"

        "paddb %%mm6, %%mm5                     \n\t"
        "psrlw $2, %%mm5                        \n\t"
        "pand "MANGLE(b3F)", %%mm5              \n\t"
        "psubb "MANGLE(b20)", %%mm5             \n\t" // (l5-l4)/8

        "movq (%%"REG_a", %1, 2), %%mm2         \n\t"
        "paddb %%mm6, %%mm2                     \n\t" // line 3 + 0x80
        "paddsb %%mm5, %%mm2                    \n\t"
        "psubb %%mm6, %%mm2                     \n\t"
        "movq %%mm2, (%%"REG_a", %1, 2)         \n\t"

        "movq (%%"REG_c", %1), %%mm2            \n\t"
        "paddb %%mm6, %%mm2                     \n\t" // line 6 + 0x80
        "psubsb %%mm5, %%mm2                    \n\t"
        "psubb %%mm6, %%mm2                     \n\t"
        "movq %%mm2, (%%"REG_c", %1)            \n\t"

        :
        : "r" (src), "r" ((x86_reg)stride)
        // "memory": picture rows are read and written behind the compiler's back
        : "%"REG_a, "%"REG_c, "memory"
    );
#else //HAVE_MMX2 || HAVE_AMD3DNOW
    const int l1= stride;
    const int l2= stride + l1;
    const int l3= stride + l2;
    const int l4= stride + l3;
    const int l5= stride + l4;
    const int l6= stride + l5;
//    const int l7= stride + l6;
//    const int l8= stride + l7;
//    const int l9= stride + l8;
    int x;
    const int QP15= QP + (QP>>2);
    src+= stride*3;
    for(x=0; x<BLOCK_SIZE; x++){
        const int v = (src[x+l5] - src[x+l4]);
        if(FFABS(v) < QP15){
            src[x+l3] +=v>>3;
            src[x+l4] +=v>>1;
            src[x+l5] -=v>>1;
            src[x+l6] -=v>>3;
        }
    }

#endif //HAVE_MMX2 || HAVE_AMD3DNOW
}
#endif //0
455 13e00528 Arpi
456
/**
457
 * Experimental Filter 1
458 9f45d04d Michael Niedermayer
 * will not damage linear gradients
459 bd107136 Diego Biurrun
 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
460 755bfeab Diego Biurrun
 * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
461
 * MMX2 version does correct clipping C version does not
462 13e00528 Arpi
 */
463 9c9e467d Michael Niedermayer
static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
464 13e00528 Arpi
{
465 ebc3209a Diego Biurrun
#if HAVE_MMX2 || HAVE_AMD3DNOW
466 16e0bf73 Diego Biurrun
    src+= stride*3;
467 bb270c08 Diego Biurrun
468 be449fca Diego Pettenò
    __asm__ volatile(
469 16e0bf73 Diego Biurrun
        "pxor %%mm7, %%mm7                      \n\t" // 0
470
        "lea (%0, %1), %%"REG_a"                \n\t"
471
        "lea (%%"REG_a", %1, 4), %%"REG_c"      \n\t"
472 bb270c08 Diego Biurrun
//      0       1       2       3       4       5       6       7       8       9
473
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1 %0+8%1  ecx+4%1
474 16e0bf73 Diego Biurrun
        "movq (%%"REG_a", %1, 2), %%mm0         \n\t" // line 3
475
        "movq (%0, %1, 4), %%mm1                \n\t" // line 4
476
        "movq %%mm1, %%mm2                      \n\t" // line 4
477
        "psubusb %%mm0, %%mm1                   \n\t"
478
        "psubusb %%mm2, %%mm0                   \n\t"
479
        "por %%mm1, %%mm0                       \n\t" // |l2 - l3|
480
        "movq (%%"REG_c"), %%mm3                \n\t" // line 5
481
        "movq (%%"REG_c", %1), %%mm4            \n\t" // line 6
482
        "movq %%mm3, %%mm5                      \n\t" // line 5
483
        "psubusb %%mm4, %%mm3                   \n\t"
484
        "psubusb %%mm5, %%mm4                   \n\t"
485
        "por %%mm4, %%mm3                       \n\t" // |l5 - l6|
486
        PAVGB(%%mm3, %%mm0)                           // (|l2 - l3| + |l5 - l6|)/2
487
        "movq %%mm2, %%mm1                      \n\t" // line 4
488
        "psubusb %%mm5, %%mm2                   \n\t"
489
        "movq %%mm2, %%mm4                      \n\t"
490
        "pcmpeqb %%mm7, %%mm2                   \n\t" // (l4 - l5) <= 0 ? -1 : 0
491
        "psubusb %%mm1, %%mm5                   \n\t"
492
        "por %%mm5, %%mm4                       \n\t" // |l4 - l5|
493
        "psubusb %%mm0, %%mm4                   \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
494
        "movq %%mm4, %%mm3                      \n\t" // d
495
        "movq %2, %%mm0                         \n\t"
496
        "paddusb %%mm0, %%mm0                   \n\t"
497
        "psubusb %%mm0, %%mm4                   \n\t"
498
        "pcmpeqb %%mm7, %%mm4                   \n\t" // d <= QP ? -1 : 0
499
        "psubusb "MANGLE(b01)", %%mm3           \n\t"
500
        "pand %%mm4, %%mm3                      \n\t" // d <= QP ? d : 0
501
502
        PAVGB(%%mm7, %%mm3)                           // d/2
503
        "movq %%mm3, %%mm1                      \n\t" // d/2
504
        PAVGB(%%mm7, %%mm3)                           // d/4
505
        PAVGB(%%mm1, %%mm3)                           // 3*d/8
506
507
        "movq (%0, %1, 4), %%mm0                \n\t" // line 4
508
        "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
509
        "psubusb %%mm3, %%mm0                   \n\t"
510
        "pxor %%mm2, %%mm0                      \n\t"
511
        "movq %%mm0, (%0, %1, 4)                \n\t" // line 4
512
513
        "movq (%%"REG_c"), %%mm0                \n\t" // line 5
514
        "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
515
        "paddusb %%mm3, %%mm0                   \n\t"
516
        "pxor %%mm2, %%mm0                      \n\t"
517
        "movq %%mm0, (%%"REG_c")                \n\t" // line 5
518
519
        PAVGB(%%mm7, %%mm1)                           // d/4
520
521
        "movq (%%"REG_a", %1, 2), %%mm0         \n\t" // line 3
522
        "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
523
        "psubusb %%mm1, %%mm0                   \n\t"
524
        "pxor %%mm2, %%mm0                      \n\t"
525
        "movq %%mm0, (%%"REG_a", %1, 2)         \n\t" // line 3
526
527
        "movq (%%"REG_c", %1), %%mm0            \n\t" // line 6
528
        "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
529
        "paddusb %%mm1, %%mm0                   \n\t"
530
        "pxor %%mm2, %%mm0                      \n\t"
531
        "movq %%mm0, (%%"REG_c", %1)            \n\t" // line 6
532
533
        PAVGB(%%mm7, %%mm1)                           // d/8
534
535
        "movq (%%"REG_a", %1), %%mm0            \n\t" // line 2
536
        "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
537
        "psubusb %%mm1, %%mm0                   \n\t"
538
        "pxor %%mm2, %%mm0                      \n\t"
539
        "movq %%mm0, (%%"REG_a", %1)            \n\t" // line 2
540
541
        "movq (%%"REG_c", %1, 2), %%mm0         \n\t" // line 7
542
        "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
543
        "paddusb %%mm1, %%mm0                   \n\t"
544
        "pxor %%mm2, %%mm0                      \n\t"
545
        "movq %%mm0, (%%"REG_c", %1, 2)         \n\t" // line 7
546
547
        :
548 7cebed70 Reimar Döffinger
        : "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb)
549 16e0bf73 Diego Biurrun
        : "%"REG_a, "%"REG_c
550
    );
551 ebc3209a Diego Biurrun
#else //HAVE_MMX2 || HAVE_AMD3DNOW
552 d5a1a995 Michael Niedermayer
553 16e0bf73 Diego Biurrun
    const int l1= stride;
554
    const int l2= stride + l1;
555
    const int l3= stride + l2;
556
    const int l4= stride + l3;
557
    const int l5= stride + l4;
558
    const int l6= stride + l5;
559
    const int l7= stride + l6;
560
//    const int l8= stride + l7;
561
//    const int l9= stride + l8;
562
    int x;
563
564
    src+= stride*3;
565
    for(x=0; x<BLOCK_SIZE; x++){
566
        int a= src[l3] - src[l4];
567
        int b= src[l4] - src[l5];
568
        int c= src[l5] - src[l6];
569
570
        int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1);
571
        d= FFMAX(d, 0);
572
573
        if(d < co->QP*2){
574
            int v = d * FFSIGN(-b);
575
576
            src[l2] +=v>>3;
577
            src[l3] +=v>>2;
578
            src[l4] +=(3*v)>>3;
579
            src[l5] -=(3*v)>>3;
580
            src[l6] -=v>>2;
581
            src[l7] -=v>>3;
582 bb270c08 Diego Biurrun
        }
583 16e0bf73 Diego Biurrun
        src++;
584
    }
585 ebc3209a Diego Biurrun
#endif //HAVE_MMX2 || HAVE_AMD3DNOW
586 13e00528 Arpi
}
587
588 b250f9c6 Aurelien Jacobs
#if !HAVE_ALTIVEC
589 9c9e467d Michael Niedermayer
/**
 * Default deblocking filter applied across a horizontal block boundary.
 *
 * Rows are counted from the incoming src pointer. Rows 4..11 are read and only
 * the two rows next to the boundary (rows 7 and 8) are modified. Per column the
 * C reference version below does:
 *   middleEnergy = 5*(r8 - r7) + 2*(r6 - r9)
 *   if |middleEnergy| < 8*QP:
 *       d = max(|middleEnergy| - min(|leftEnergy|, |rightEnergy|), 0)
 *       d = ((5*d + 32) >> 6) * sign(-middleEnergy)
 *       d is clamped between 0 and q = (r7 - r8)/2
 *       r7 -= d;  r8 += d
 *
 * The HAVE_MMX path evaluates the same expressions on 16 bit words for 8
 * columns at once. The HAVE_MMX2 / HAVE_AMD3DNOW path works on bytes with
 * PAVGB and, as its own "~" comments indicate, only approximates them.
 *
 * @param src    top-left pixel of the area; the boundary lies between rows 7 and 8
 * @param stride distance in bytes between two consecutive rows
 * @param c      context; the C path reads c->QP, the asm paths read c->pQPb
 *               (presumably QP replicated into each byte -- confirm against the
 *               code that fills PPContext)
 */
static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
{
#if HAVE_MMX2 || HAVE_AMD3DNOW
    src+= stride*4;
    __asm__ volatile(

#if 0 //slightly more accurate and slightly slower
        "pxor %%mm7, %%mm7                      \n\t" // 0
        "lea (%0, %1), %%"REG_a"                \n\t"
        "lea (%%"REG_a", %1, 4), %%"REG_c"      \n\t"
//      0       1       2       3       4       5       6       7
//      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ecx+%1  ecx+2%1
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1


        "movq (%0, %1, 2), %%mm0                \n\t" // l2
        "movq (%0), %%mm1                       \n\t" // l0
        "movq %%mm0, %%mm2                      \n\t" // l2
        PAVGB(%%mm7, %%mm0)                           // ~l2/2
        PAVGB(%%mm1, %%mm0)                           // ~(l2 + 2l0)/4
        PAVGB(%%mm2, %%mm0)                           // ~(5l2 + 2l0)/8

        "movq (%%"REG_a"), %%mm1                \n\t" // l1
        "movq (%%"REG_a", %1, 2), %%mm3         \n\t" // l3
        "movq %%mm1, %%mm4                      \n\t" // l1
        PAVGB(%%mm7, %%mm1)                           // ~l1/2
        PAVGB(%%mm3, %%mm1)                           // ~(l1 + 2l3)/4
        PAVGB(%%mm4, %%mm1)                           // ~(5l1 + 2l3)/8

        "movq %%mm0, %%mm4                      \n\t" // ~(5l2 + 2l0)/8
        "psubusb %%mm1, %%mm0                   \n\t"
        "psubusb %%mm4, %%mm1                   \n\t"
        "por %%mm0, %%mm1                       \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0

        "movq (%0, %1, 4), %%mm0                \n\t" // l4
        "movq %%mm0, %%mm4                      \n\t" // l4
        PAVGB(%%mm7, %%mm0)                           // ~l4/2
        PAVGB(%%mm2, %%mm0)                           // ~(l4 + 2l2)/4
        PAVGB(%%mm4, %%mm0)                           // ~(5l4 + 2l2)/8

        "movq (%%"REG_c"), %%mm2                \n\t" // l5
        "movq %%mm3, %%mm5                      \n\t" // l3
        PAVGB(%%mm7, %%mm3)                           // ~l3/2
        PAVGB(%%mm2, %%mm3)                           // ~(l3 + 2l5)/4
        PAVGB(%%mm5, %%mm3)                           // ~(5l3 + 2l5)/8

        "movq %%mm0, %%mm6                      \n\t" // ~(5l4 + 2l2)/8
        "psubusb %%mm3, %%mm0                   \n\t"
        "psubusb %%mm6, %%mm3                   \n\t"
        "por %%mm0, %%mm3                       \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
        "pcmpeqb %%mm7, %%mm0                   \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0

        "movq (%%"REG_c", %1), %%mm6            \n\t" // l6
        "movq %%mm6, %%mm5                      \n\t" // l6
        PAVGB(%%mm7, %%mm6)                           // ~l6/2
        PAVGB(%%mm4, %%mm6)                           // ~(l6 + 2l4)/4
        PAVGB(%%mm5, %%mm6)                           // ~(5l6 + 2l4)/8

        "movq (%%"REG_c", %1, 2), %%mm5         \n\t" // l7
        "movq %%mm2, %%mm4                      \n\t" // l5
        PAVGB(%%mm7, %%mm2)                           // ~l5/2
        PAVGB(%%mm5, %%mm2)                           // ~(l5 + 2l7)/4
        PAVGB(%%mm4, %%mm2)                           // ~(5l5 + 2l7)/8

        "movq %%mm6, %%mm4                      \n\t" // ~(5l6 + 2l4)/8
        "psubusb %%mm2, %%mm6                   \n\t"
        "psubusb %%mm4, %%mm2                   \n\t"
        "por %%mm6, %%mm2                       \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0


        PMINUB(%%mm2, %%mm1, %%mm4)                   // MIN(|lenergy|,|renergy|)/8
        "movq %2, %%mm4                         \n\t" // QP //FIXME QP+1 ?
        "paddusb "MANGLE(b01)", %%mm4           \n\t"
        "pcmpgtb %%mm3, %%mm4                   \n\t" // |menergy|/8 < QP
        "psubusb %%mm1, %%mm3                   \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
        "pand %%mm4, %%mm3                      \n\t"

        "movq %%mm3, %%mm1                      \n\t"
//        "psubusb "MANGLE(b01)", %%mm3           \n\t"
        PAVGB(%%mm7, %%mm3)
        PAVGB(%%mm7, %%mm3)
        "paddusb %%mm1, %%mm3                   \n\t"
//        "paddusb "MANGLE(b01)", %%mm3           \n\t"

        "movq (%%"REG_a", %1, 2), %%mm6         \n\t" //l3
        "movq (%0, %1, 4), %%mm5                \n\t" //l4
        "movq (%0, %1, 4), %%mm4                \n\t" //l4
        "psubusb %%mm6, %%mm5                   \n\t"
        "psubusb %%mm4, %%mm6                   \n\t"
        "por %%mm6, %%mm5                       \n\t" // |l3-l4|
        "pcmpeqb %%mm7, %%mm6                   \n\t" // SIGN(l3-l4)
        "pxor %%mm6, %%mm0                      \n\t"
        "pand %%mm0, %%mm3                      \n\t"
        PMINUB(%%mm5, %%mm3, %%mm0)

        "psubusb "MANGLE(b01)", %%mm3           \n\t"
        PAVGB(%%mm7, %%mm3)

        "movq (%%"REG_a", %1, 2), %%mm0         \n\t"
        "movq (%0, %1, 4), %%mm2                \n\t"
        "pxor %%mm6, %%mm0                      \n\t"
        "pxor %%mm6, %%mm2                      \n\t"
        "psubb %%mm3, %%mm0                     \n\t"
        "paddb %%mm3, %%mm2                     \n\t"
        "pxor %%mm6, %%mm0                      \n\t"
        "pxor %%mm6, %%mm2                      \n\t"
        "movq %%mm0, (%%"REG_a", %1, 2)         \n\t"
        "movq %%mm2, (%0, %1, 4)                \n\t"
#endif //0

        "lea (%0, %1), %%"REG_a"                \n\t"
        "pcmpeqb %%mm6, %%mm6                   \n\t" // -1
//      0       1       2       3       4       5       6       7
//      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ecx+%1  ecx+2%1
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1


        "movq (%%"REG_a", %1, 2), %%mm1         \n\t" // l3
        "movq (%0, %1, 4), %%mm0                \n\t" // l4
        "pxor %%mm6, %%mm1                      \n\t" // -l3-1
        PAVGB(%%mm1, %%mm0)                           // -q+128 = (l4-l3+256)/2
// mm1=-l3-1, mm0=128-q

        "movq (%%"REG_a", %1, 4), %%mm2         \n\t" // l5
        "movq (%%"REG_a", %1), %%mm3            \n\t" // l2
        "pxor %%mm6, %%mm2                      \n\t" // -l5-1
        "movq %%mm2, %%mm5                      \n\t" // -l5-1
        "movq "MANGLE(b80)", %%mm4              \n\t" // 128
        "lea (%%"REG_a", %1, 4), %%"REG_c"      \n\t"
        PAVGB(%%mm3, %%mm2)                           // (l2-l5+256)/2
        PAVGB(%%mm0, %%mm4)                           // ~(l4-l3)/4 + 128
        PAVGB(%%mm2, %%mm4)                           // ~(l2-l5)/4 +(l4-l3)/8 + 128
        PAVGB(%%mm0, %%mm4)                           // ~(l2-l5)/8 +5(l4-l3)/16 + 128
// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1

        "movq (%%"REG_a"), %%mm2                \n\t" // l1
        "pxor %%mm6, %%mm2                      \n\t" // -l1-1
        PAVGB(%%mm3, %%mm2)                           // (l2-l1+256)/2
        PAVGB((%0), %%mm1)                            // (l0-l3+256)/2
        "movq "MANGLE(b80)", %%mm3              \n\t" // 128
        PAVGB(%%mm2, %%mm3)                           // ~(l2-l1)/4 + 128
        PAVGB(%%mm1, %%mm3)                           // ~(l0-l3)/4 +(l2-l1)/8 + 128
        PAVGB(%%mm2, %%mm3)                           // ~(l0-l3)/8 +5(l2-l1)/16 + 128
// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1

        PAVGB((%%REGc, %1), %%mm5)                    // (l6-l5+256)/2
        "movq (%%"REG_c", %1, 2), %%mm1         \n\t" // l7
        "pxor %%mm6, %%mm1                      \n\t" // -l7-1
        PAVGB((%0, %1, 4), %%mm1)                     // (l4-l7+256)/2
        "movq "MANGLE(b80)", %%mm2              \n\t" // 128
        PAVGB(%%mm5, %%mm2)                           // ~(l6-l5)/4 + 128
        PAVGB(%%mm1, %%mm2)                           // ~(l4-l7)/4 +(l6-l5)/8 + 128
        PAVGB(%%mm5, %%mm2)                           // ~(l4-l7)/8 +5(l6-l5)/16 + 128
// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128

        "movq "MANGLE(b00)", %%mm1              \n\t" // 0
        "movq "MANGLE(b00)", %%mm5              \n\t" // 0
        "psubb %%mm2, %%mm1                     \n\t" // 128 - renergy/16
        "psubb %%mm3, %%mm5                     \n\t" // 128 - lenergy/16
        PMAXUB(%%mm1, %%mm2)                          // 128 + |renergy/16|
        PMAXUB(%%mm5, %%mm3)                          // 128 + |lenergy/16|
        PMINUB(%%mm2, %%mm3, %%mm1)                   // 128 + MIN(|lenergy|,|renergy|)/16

// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128

        "movq "MANGLE(b00)", %%mm7              \n\t" // 0
        "movq %2, %%mm2                         \n\t" // QP
        PAVGB(%%mm6, %%mm2)                           // 128 + QP/2
        "psubb %%mm6, %%mm2                     \n\t"

        "movq %%mm4, %%mm1                      \n\t"
        "pcmpgtb %%mm7, %%mm1                   \n\t" // SIGN(menergy)
        "pxor %%mm1, %%mm4                      \n\t"
        "psubb %%mm1, %%mm4                     \n\t" // 128 + |menergy|/16
        "pcmpgtb %%mm4, %%mm2                   \n\t" // |menergy|/16 < QP/2
        "psubusb %%mm3, %%mm4                   \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16

        "movq %%mm4, %%mm3                      \n\t" // d
        "psubusb "MANGLE(b01)", %%mm4           \n\t"
        PAVGB(%%mm7, %%mm4)                           // d/32
        PAVGB(%%mm7, %%mm4)                           // (d + 32)/64
        "paddb %%mm3, %%mm4                     \n\t" // 5d/64
        "pand %%mm2, %%mm4                      \n\t"

        "movq "MANGLE(b80)", %%mm5              \n\t" // 128
        "psubb %%mm0, %%mm5                     \n\t" // q
        "paddsb %%mm6, %%mm5                    \n\t" // fix bad rounding
        "pcmpgtb %%mm5, %%mm7                   \n\t" // SIGN(q)
        "pxor %%mm7, %%mm5                      \n\t"

        PMINUB(%%mm5, %%mm4, %%mm3)                   // MIN(|q|, 5d/64)
        "pxor %%mm1, %%mm7                      \n\t" // SIGN(d*q)

        "pand %%mm7, %%mm4                      \n\t"
        "movq (%%"REG_a", %1, 2), %%mm0         \n\t"
        "movq (%0, %1, 4), %%mm2                \n\t"
        "pxor %%mm1, %%mm0                      \n\t"
        "pxor %%mm1, %%mm2                      \n\t"
        "paddb %%mm4, %%mm0                     \n\t"
        "psubb %%mm4, %%mm2                     \n\t"
        "pxor %%mm1, %%mm0                      \n\t"
        "pxor %%mm1, %%mm2                      \n\t"
        "movq %%mm0, (%%"REG_a", %1, 2)         \n\t"
        "movq %%mm2, (%0, %1, 4)                \n\t"

        :
        : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
        // "memory": rows 3 and 4 of the picture are stored through src
        : "%"REG_a, "%"REG_c, "memory"
    );

#elif HAVE_MMX
    // Spill area for four 8-byte intermediates. It is a real local passed to the
    // asm as operand %3 instead of an address computed below the stack pointer:
    // memory under the stack pointer is not owned by this function and may be
    // overwritten asynchronously on ABIs without a red zone.
    uint64_t __attribute__((aligned(8))) scratch[4];
    src+= stride*4;
    __asm__ volatile(
        "pxor %%mm7, %%mm7                      \n\t"
//      row:    0       1       2       3       4       5       6       7
//      before %0 is advanced:
//              %0      %0+%1   eax     eax+%1  eax+2%1 -       eax+4%1 -
//      after "lea (eax, %1), %0":
//              -       -       eax     %0      %0+%1   %0+2%1  eax+4%1 %0+4%1

        "movq (%0), %%mm0                       \n\t"
        "movq %%mm0, %%mm1                      \n\t"
        "punpcklbw %%mm7, %%mm0                 \n\t" // low part of line 0
        "punpckhbw %%mm7, %%mm1                 \n\t" // high part of line 0

        "movq (%0, %1), %%mm2                   \n\t"
        "lea (%0, %1, 2), %%"REG_a"             \n\t"
        "movq %%mm2, %%mm3                      \n\t"
        "punpcklbw %%mm7, %%mm2                 \n\t" // low part of line 1
        "punpckhbw %%mm7, %%mm3                 \n\t" // high part of line 1

        "movq (%%"REG_a"), %%mm4                \n\t"
        "movq %%mm4, %%mm5                      \n\t"
        "punpcklbw %%mm7, %%mm4                 \n\t" // low part of line 2
        "punpckhbw %%mm7, %%mm5                 \n\t" // high part of line 2

        "paddw %%mm0, %%mm0                     \n\t" // 2L0
        "paddw %%mm1, %%mm1                     \n\t" // 2H0
        "psubw %%mm4, %%mm2                     \n\t" // L1 - L2
        "psubw %%mm5, %%mm3                     \n\t" // H1 - H2
        "psubw %%mm2, %%mm0                     \n\t" // 2L0 - L1 + L2
        "psubw %%mm3, %%mm1                     \n\t" // 2H0 - H1 + H2

        "psllw $2, %%mm2                        \n\t" // 4L1 - 4L2
        "psllw $2, %%mm3                        \n\t" // 4H1 - 4H2
        "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2
        "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2

        "movq (%%"REG_a", %1), %%mm2            \n\t"
        "movq %%mm2, %%mm3                      \n\t"
        "punpcklbw %%mm7, %%mm2                 \n\t" // L3
        "punpckhbw %%mm7, %%mm3                 \n\t" // H3

        "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2 - L3
        "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2 - H3
        "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2 - 2L3
        "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2 - 2H3
        "movq %%mm0, (%3)                       \n\t" // 2L0 - 5L1 + 5L2 - 2L3
        "movq %%mm1, 8(%3)                      \n\t" // 2H0 - 5H1 + 5H2 - 2H3

        "movq (%%"REG_a", %1, 2), %%mm0         \n\t"
        "movq %%mm0, %%mm1                      \n\t"
        "punpcklbw %%mm7, %%mm0                 \n\t" // L4
        "punpckhbw %%mm7, %%mm1                 \n\t" // H4

        "psubw %%mm0, %%mm2                     \n\t" // L3 - L4
        "psubw %%mm1, %%mm3                     \n\t" // H3 - H4
        "movq %%mm2, 16(%3)                     \n\t" // L3 - L4
        "movq %%mm3, 24(%3)                     \n\t" // H3 - H4
        "paddw %%mm4, %%mm4                     \n\t" // 2L2
        "paddw %%mm5, %%mm5                     \n\t" // 2H2
        "psubw %%mm2, %%mm4                     \n\t" // 2L2 - L3 + L4
        "psubw %%mm3, %%mm5                     \n\t" // 2H2 - H3 + H4

        "lea (%%"REG_a", %1), %0                \n\t" // %0 now points at line 3
        "psllw $2, %%mm2                        \n\t" // 4L3 - 4L4
        "psllw $2, %%mm3                        \n\t" // 4H3 - 4H4
        "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4
        "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4
//about 50 opcodes so far
        "movq (%0, %1, 2), %%mm2                \n\t"
        "movq %%mm2, %%mm3                      \n\t"
        "punpcklbw %%mm7, %%mm2                 \n\t" // L5
        "punpckhbw %%mm7, %%mm3                 \n\t" // H5
        "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4 - L5
        "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4 - H5
        "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4 - 2L5
        "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4 - 2H5

        "movq (%%"REG_a", %1, 4), %%mm6         \n\t"
        "punpcklbw %%mm7, %%mm6                 \n\t" // L6
        "psubw %%mm6, %%mm2                     \n\t" // L5 - L6
        "movq (%%"REG_a", %1, 4), %%mm6         \n\t"
        "punpckhbw %%mm7, %%mm6                 \n\t" // H6
        "psubw %%mm6, %%mm3                     \n\t" // H5 - H6

        "paddw %%mm0, %%mm0                     \n\t" // 2L4
        "paddw %%mm1, %%mm1                     \n\t" // 2H4
        "psubw %%mm2, %%mm0                     \n\t" // 2L4 - L5 + L6
        "psubw %%mm3, %%mm1                     \n\t" // 2H4 - H5 + H6

        "psllw $2, %%mm2                        \n\t" // 4L5 - 4L6
        "psllw $2, %%mm3                        \n\t" // 4H5 - 4H6
        "psubw %%mm2, %%mm0                     \n\t" // 2L4 - 5L5 + 5L6
        "psubw %%mm3, %%mm1                     \n\t" // 2H4 - 5H5 + 5H6

        "movq (%0, %1, 4), %%mm2                \n\t"
        "movq %%mm2, %%mm3                      \n\t"
        "punpcklbw %%mm7, %%mm2                 \n\t" // L7
        "punpckhbw %%mm7, %%mm3                 \n\t" // H7

        "paddw %%mm2, %%mm2                     \n\t" // 2L7
        "paddw %%mm3, %%mm3                     \n\t" // 2H7
        "psubw %%mm2, %%mm0                     \n\t" // 2L4 - 5L5 + 5L6 - 2L7
        "psubw %%mm3, %%mm1                     \n\t" // 2H4 - 5H5 + 5H6 - 2H7

        "movq (%3), %%mm2                       \n\t" // 2L0 - 5L1 + 5L2 - 2L3
        "movq 8(%3), %%mm3                      \n\t" // 2H0 - 5H1 + 5H2 - 2H3

#if HAVE_MMX2
        "movq %%mm7, %%mm6                      \n\t" // 0
        "psubw %%mm0, %%mm6                     \n\t"
        "pmaxsw %%mm6, %%mm0                    \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
        "movq %%mm7, %%mm6                      \n\t" // 0
        "psubw %%mm1, %%mm6                     \n\t"
        "pmaxsw %%mm6, %%mm1                    \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
        "movq %%mm7, %%mm6                      \n\t" // 0
        "psubw %%mm2, %%mm6                     \n\t"
        "pmaxsw %%mm6, %%mm2                    \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
        "movq %%mm7, %%mm6                      \n\t" // 0
        "psubw %%mm3, %%mm6                     \n\t"
        "pmaxsw %%mm6, %%mm3                    \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
#else
        "movq %%mm7, %%mm6                      \n\t" // 0
        "pcmpgtw %%mm0, %%mm6                   \n\t"
        "pxor %%mm6, %%mm0                      \n\t"
        "psubw %%mm6, %%mm0                     \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
        "movq %%mm7, %%mm6                      \n\t" // 0
        "pcmpgtw %%mm1, %%mm6                   \n\t"
        "pxor %%mm6, %%mm1                      \n\t"
        "psubw %%mm6, %%mm1                     \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
        "movq %%mm7, %%mm6                      \n\t" // 0
        "pcmpgtw %%mm2, %%mm6                   \n\t"
        "pxor %%mm6, %%mm2                      \n\t"
        "psubw %%mm6, %%mm2                     \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
        "movq %%mm7, %%mm6                      \n\t" // 0
        "pcmpgtw %%mm3, %%mm6                   \n\t"
        "pxor %%mm6, %%mm3                      \n\t"
        "psubw %%mm6, %%mm3                     \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
#endif

#if HAVE_MMX2
        "pminsw %%mm2, %%mm0                    \n\t"
        "pminsw %%mm3, %%mm1                    \n\t"
#else
        "movq %%mm0, %%mm6                      \n\t"
        "psubusw %%mm2, %%mm6                   \n\t"
        "psubw %%mm6, %%mm0                     \n\t"
        "movq %%mm1, %%mm6                      \n\t"
        "psubusw %%mm3, %%mm6                   \n\t"
        "psubw %%mm6, %%mm1                     \n\t"
#endif

        "movd %2, %%mm2                         \n\t" // QP
        "punpcklbw %%mm7, %%mm2                 \n\t"

        "movq %%mm7, %%mm6                      \n\t" // 0
        "pcmpgtw %%mm4, %%mm6                   \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
        "pxor %%mm6, %%mm4                      \n\t"
        "psubw %%mm6, %%mm4                     \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
        "pcmpgtw %%mm5, %%mm7                   \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
        "pxor %%mm7, %%mm5                      \n\t"
        "psubw %%mm7, %%mm5                     \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
// about 100 opcodes
        "psllw $3, %%mm2                        \n\t" // 8QP
        "movq %%mm2, %%mm3                      \n\t" // 8QP
        "pcmpgtw %%mm4, %%mm2                   \n\t"
        "pcmpgtw %%mm5, %%mm3                   \n\t"
        "pand %%mm2, %%mm4                      \n\t"
        "pand %%mm3, %%mm5                      \n\t"


        "psubusw %%mm0, %%mm4                   \n\t" // ld (low half)
        "psubusw %%mm1, %%mm5                   \n\t" // hd (high half)


        "movq "MANGLE(w05)", %%mm2              \n\t" // 5
        "pmullw %%mm2, %%mm4                    \n\t"
        "pmullw %%mm2, %%mm5                    \n\t"
        "movq "MANGLE(w20)", %%mm2              \n\t" // 32
        "paddw %%mm2, %%mm4                     \n\t"
        "paddw %%mm2, %%mm5                     \n\t"
        "psrlw $6, %%mm4                        \n\t"
        "psrlw $6, %%mm5                        \n\t"

        "movq 16(%3), %%mm0                     \n\t" // L3 - L4
        "movq 24(%3), %%mm1                     \n\t" // H3 - H4

        "pxor %%mm2, %%mm2                      \n\t"
        "pxor %%mm3, %%mm3                      \n\t"

        "pcmpgtw %%mm0, %%mm2                   \n\t" // sign (L3-L4)
        "pcmpgtw %%mm1, %%mm3                   \n\t" // sign (H3-H4)
        "pxor %%mm2, %%mm0                      \n\t"
        "pxor %%mm3, %%mm1                      \n\t"
        "psubw %%mm2, %%mm0                     \n\t" // |L3-L4|
        "psubw %%mm3, %%mm1                     \n\t" // |H3-H4|
        "psrlw $1, %%mm0                        \n\t" // |L3 - L4|/2
        "psrlw $1, %%mm1                        \n\t" // |H3 - H4|/2

        "pxor %%mm6, %%mm2                      \n\t"
        "pxor %%mm7, %%mm3                      \n\t"
        "pand %%mm2, %%mm4                      \n\t"
        "pand %%mm3, %%mm5                      \n\t"

#if HAVE_MMX2
        "pminsw %%mm0, %%mm4                    \n\t"
        "pminsw %%mm1, %%mm5                    \n\t"
#else
        "movq %%mm4, %%mm2                      \n\t"
        "psubusw %%mm0, %%mm2                   \n\t"
        "psubw %%mm2, %%mm4                     \n\t"
        "movq %%mm5, %%mm2                      \n\t"
        "psubusw %%mm1, %%mm2                   \n\t"
        "psubw %%mm2, %%mm5                     \n\t"
#endif
        "pxor %%mm6, %%mm4                      \n\t"
        "pxor %%mm7, %%mm5                      \n\t"
        "psubw %%mm6, %%mm4                     \n\t"
        "psubw %%mm7, %%mm5                     \n\t"
        "packsswb %%mm5, %%mm4                  \n\t"
        "movq (%0), %%mm0                       \n\t"
        "paddb   %%mm4, %%mm0                   \n\t"
        "movq %%mm0, (%0)                       \n\t"
        "movq (%0, %1), %%mm0                   \n\t"
        "psubb %%mm4, %%mm0                     \n\t"
        "movq %%mm0, (%0, %1)                   \n\t"

        : "+r" (src)
        : "r" ((x86_reg)stride), "m" (c->pQPb), "r" (scratch)
        // "memory": the asm writes scratch[] and two picture rows through src
        : "%"REG_a, "memory"
    );
#else //!(HAVE_MMX2 || HAVE_AMD3DNOW || HAVE_MMX): plain C reference version
    const int l1= stride;
    const int l2= stride + l1;
    const int l3= stride + l2;
    const int l4= stride + l3;
    const int l5= stride + l4;
    const int l6= stride + l5;
    const int l7= stride + l6;
    const int l8= stride + l7;
//    const int l9= stride + l8;
    int x;
    src+= stride*3;
    for(x=0; x<BLOCK_SIZE; x++){
        const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
        if(FFABS(middleEnergy) < 8*c->QP){
            const int q=(src[l4] - src[l5])/2;
            const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
            const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);

            int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
            d= FFMAX(d, 0);

            d= (5*d + 32) >> 6;
            d*= FFSIGN(-middleEnergy);

            // never move the two boundary pixels past their common mean
            if(q>0){
                d= d<0 ? 0 : d;
                d= d>q ? q : d;
            }else{
                d= d>0 ? 0 : d;
                d= d<q ? q : d;
            }

            src[l4]-= d;
            src[l5]+= d;
        }
        src++;
    }
#endif //HAVE_MMX2 || HAVE_AMD3DNOW / HAVE_MMX / C
}
1144 b0ac780a Michael Niedermayer
#endif //HAVE_ALTIVEC
1145 3057fa66 Arpi
1146 b250f9c6 Aurelien Jacobs
#if !HAVE_ALTIVEC
1147 9c9e467d Michael Niedermayer
static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
1148 3057fa66 Arpi
{
1149 ebc3209a Diego Biurrun
#if HAVE_MMX2 || HAVE_AMD3DNOW
1150 be449fca Diego Pettenò
    __asm__ volatile(
1151 16e0bf73 Diego Biurrun
        "pxor %%mm6, %%mm6                      \n\t"
1152
        "pcmpeqb %%mm7, %%mm7                   \n\t"
1153
        "movq %2, %%mm0                         \n\t"
1154
        "punpcklbw %%mm6, %%mm0                 \n\t"
1155
        "psrlw $1, %%mm0                        \n\t"
1156
        "psubw %%mm7, %%mm0                     \n\t"
1157
        "packuswb %%mm0, %%mm0                  \n\t"
1158
        "movq %%mm0, %3                         \n\t"
1159
1160
        "lea (%0, %1), %%"REG_a"                \n\t"
1161
        "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1162 bb270c08 Diego Biurrun
1163
//        0        1        2        3        4        5        6        7        8        9
1164
//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1
1165 3057fa66 Arpi
1166 cc9b0679 Michael Niedermayer
#undef FIND_MIN_MAX
1167 b250f9c6 Aurelien Jacobs
#if HAVE_MMX2
1168 053dea12 Aurelien Jacobs
#define REAL_FIND_MIN_MAX(addr)\
1169 16e0bf73 Diego Biurrun
        "movq " #addr ", %%mm0                  \n\t"\
1170
        "pminub %%mm0, %%mm7                    \n\t"\
1171
        "pmaxub %%mm0, %%mm6                    \n\t"
1172 e0f8ffae Michael Niedermayer
#else
1173 053dea12 Aurelien Jacobs
#define REAL_FIND_MIN_MAX(addr)\
1174 16e0bf73 Diego Biurrun
        "movq " #addr ", %%mm0                  \n\t"\
1175
        "movq %%mm7, %%mm1                      \n\t"\
1176
        "psubusb %%mm0, %%mm6                   \n\t"\
1177
        "paddb %%mm0, %%mm6                     \n\t"\
1178
        "psubusb %%mm0, %%mm1                   \n\t"\
1179
        "psubb %%mm1, %%mm7                     \n\t"
1180 e0f8ffae Michael Niedermayer
#endif
1181 053dea12 Aurelien Jacobs
#define FIND_MIN_MAX(addr)  REAL_FIND_MIN_MAX(addr)
1182 3057fa66 Arpi
1183 053dea12 Aurelien Jacobs
FIND_MIN_MAX((%%REGa))
1184
FIND_MIN_MAX((%%REGa, %1))
1185
FIND_MIN_MAX((%%REGa, %1, 2))
1186 70c5ae87 Michael Niedermayer
FIND_MIN_MAX((%0, %1, 4))
1187 053dea12 Aurelien Jacobs
FIND_MIN_MAX((%%REGd))
1188
FIND_MIN_MAX((%%REGd, %1))
1189
FIND_MIN_MAX((%%REGd, %1, 2))
1190 70c5ae87 Michael Niedermayer
FIND_MIN_MAX((%0, %1, 8))
1191 3057fa66 Arpi
1192 16e0bf73 Diego Biurrun
        "movq %%mm7, %%mm4                      \n\t"
1193
        "psrlq $8, %%mm7                        \n\t"
1194 b250f9c6 Aurelien Jacobs
#if HAVE_MMX2
1195 16e0bf73 Diego Biurrun
        "pminub %%mm4, %%mm7                    \n\t" // min of pixels
1196
        "pshufw $0xF9, %%mm7, %%mm4             \n\t"
1197
        "pminub %%mm4, %%mm7                    \n\t" // min of pixels
1198
        "pshufw $0xFE, %%mm7, %%mm4             \n\t"
1199
        "pminub %%mm4, %%mm7                    \n\t"
1200 e5c30e06 Michael Niedermayer
#else
1201 16e0bf73 Diego Biurrun
        "movq %%mm7, %%mm1                      \n\t"
1202
        "psubusb %%mm4, %%mm1                   \n\t"
1203
        "psubb %%mm1, %%mm7                     \n\t"
1204
        "movq %%mm7, %%mm4                      \n\t"
1205
        "psrlq $16, %%mm7                       \n\t"
1206
        "movq %%mm7, %%mm1                      \n\t"
1207
        "psubusb %%mm4, %%mm1                   \n\t"
1208
        "psubb %%mm1, %%mm7                     \n\t"
1209
        "movq %%mm7, %%mm4                      \n\t"
1210
        "psrlq $32, %%mm7                       \n\t"
1211
        "movq %%mm7, %%mm1                      \n\t"
1212
        "psubusb %%mm4, %%mm1                   \n\t"
1213
        "psubb %%mm1, %%mm7                     \n\t"
1214 e5c30e06 Michael Niedermayer
#endif
1215 cd38e322 Michael Niedermayer
1216
1217 16e0bf73 Diego Biurrun
        "movq %%mm6, %%mm4                      \n\t"
1218
        "psrlq $8, %%mm6                        \n\t"
1219 b250f9c6 Aurelien Jacobs
#if HAVE_MMX2
1220 16e0bf73 Diego Biurrun
        "pmaxub %%mm4, %%mm6                    \n\t" // max of pixels
1221
        "pshufw $0xF9, %%mm6, %%mm4             \n\t"
1222
        "pmaxub %%mm4, %%mm6                    \n\t"
1223
        "pshufw $0xFE, %%mm6, %%mm4             \n\t"
1224
        "pmaxub %%mm4, %%mm6                    \n\t"
1225 cd38e322 Michael Niedermayer
#else
1226 16e0bf73 Diego Biurrun
        "psubusb %%mm4, %%mm6                   \n\t"
1227
        "paddb %%mm4, %%mm6                     \n\t"
1228
        "movq %%mm6, %%mm4                      \n\t"
1229
        "psrlq $16, %%mm6                       \n\t"
1230
        "psubusb %%mm4, %%mm6                   \n\t"
1231
        "paddb %%mm4, %%mm6                     \n\t"
1232
        "movq %%mm6, %%mm4                      \n\t"
1233
        "psrlq $32, %%mm6                       \n\t"
1234
        "psubusb %%mm4, %%mm6                   \n\t"
1235
        "paddb %%mm4, %%mm6                     \n\t"
1236 cd38e322 Michael Niedermayer
#endif
1237 16e0bf73 Diego Biurrun
        "movq %%mm6, %%mm0                      \n\t" // max
1238
        "psubb %%mm7, %%mm6                     \n\t" // max - min
1239
        "movd %%mm6, %%ecx                      \n\t"
1240
        "cmpb "MANGLE(deringThreshold)", %%cl   \n\t"
1241
        " jb 1f                                 \n\t"
1242
        "lea -24(%%"REG_SP"), %%"REG_c"         \n\t"
1243
        "and "ALIGN_MASK", %%"REG_c"            \n\t"
1244
        PAVGB(%%mm0, %%mm7)                           // a=(max + min)/2
1245
        "punpcklbw %%mm7, %%mm7                 \n\t"
1246
        "punpcklbw %%mm7, %%mm7                 \n\t"
1247
        "punpcklbw %%mm7, %%mm7                 \n\t"
1248
        "movq %%mm7, (%%"REG_c")                \n\t"
1249
1250
        "movq (%0), %%mm0                       \n\t" // L10
1251
        "movq %%mm0, %%mm1                      \n\t" // L10
1252
        "movq %%mm0, %%mm2                      \n\t" // L10
1253
        "psllq $8, %%mm1                        \n\t"
1254
        "psrlq $8, %%mm2                        \n\t"
1255
        "movd -4(%0), %%mm3                     \n\t"
1256
        "movd 8(%0), %%mm4                      \n\t"
1257
        "psrlq $24, %%mm3                       \n\t"
1258
        "psllq $56, %%mm4                       \n\t"
1259
        "por %%mm3, %%mm1                       \n\t" // L00
1260
        "por %%mm4, %%mm2                       \n\t" // L20
1261
        "movq %%mm1, %%mm3                      \n\t" // L00
1262
        PAVGB(%%mm2, %%mm1)                           // (L20 + L00)/2
1263
        PAVGB(%%mm0, %%mm1)                           // (L20 + L00 + 2L10)/4
1264
        "psubusb %%mm7, %%mm0                   \n\t"
1265
        "psubusb %%mm7, %%mm2                   \n\t"
1266
        "psubusb %%mm7, %%mm3                   \n\t"
1267
        "pcmpeqb "MANGLE(b00)", %%mm0           \n\t" // L10 > a ? 0 : -1
1268
        "pcmpeqb "MANGLE(b00)", %%mm2           \n\t" // L20 > a ? 0 : -1
1269
        "pcmpeqb "MANGLE(b00)", %%mm3           \n\t" // L00 > a ? 0 : -1
1270
        "paddb %%mm2, %%mm0                     \n\t"
1271
        "paddb %%mm3, %%mm0                     \n\t"
1272
1273
        "movq (%%"REG_a"), %%mm2                \n\t" // L11
1274
        "movq %%mm2, %%mm3                      \n\t" // L11
1275
        "movq %%mm2, %%mm4                      \n\t" // L11
1276
        "psllq $8, %%mm3                        \n\t"
1277
        "psrlq $8, %%mm4                        \n\t"
1278
        "movd -4(%%"REG_a"), %%mm5              \n\t"
1279
        "movd 8(%%"REG_a"), %%mm6               \n\t"
1280
        "psrlq $24, %%mm5                       \n\t"
1281
        "psllq $56, %%mm6                       \n\t"
1282
        "por %%mm5, %%mm3                       \n\t" // L01
1283
        "por %%mm6, %%mm4                       \n\t" // L21
1284
        "movq %%mm3, %%mm5                      \n\t" // L01
1285
        PAVGB(%%mm4, %%mm3)                           // (L21 + L01)/2
1286
        PAVGB(%%mm2, %%mm3)                           // (L21 + L01 + 2L11)/4
1287
        "psubusb %%mm7, %%mm2                   \n\t"
1288
        "psubusb %%mm7, %%mm4                   \n\t"
1289
        "psubusb %%mm7, %%mm5                   \n\t"
1290
        "pcmpeqb "MANGLE(b00)", %%mm2           \n\t" // L11 > a ? 0 : -1
1291
        "pcmpeqb "MANGLE(b00)", %%mm4           \n\t" // L21 > a ? 0 : -1
1292
        "pcmpeqb "MANGLE(b00)", %%mm5           \n\t" // L01 > a ? 0 : -1
1293
        "paddb %%mm4, %%mm2                     \n\t"
1294
        "paddb %%mm5, %%mm2                     \n\t"
1295 70c5ae87 Michael Niedermayer
// 0, 2, 3, 1
1296 053dea12 Aurelien Jacobs
#define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1297 16e0bf73 Diego Biurrun
        "movq " #src ", " #sx "                 \n\t" /* src[0] */\
1298
        "movq " #sx ", " #lx "                  \n\t" /* src[0] */\
1299
        "movq " #sx ", " #t0 "                  \n\t" /* src[0] */\
1300
        "psllq $8, " #lx "                      \n\t"\
1301
        "psrlq $8, " #t0 "                      \n\t"\
1302
        "movd -4" #src ", " #t1 "               \n\t"\
1303
        "psrlq $24, " #t1 "                     \n\t"\
1304
        "por " #t1 ", " #lx "                   \n\t" /* src[-1] */\
1305
        "movd 8" #src ", " #t1 "                \n\t"\
1306
        "psllq $56, " #t1 "                     \n\t"\
1307
        "por " #t1 ", " #t0 "                   \n\t" /* src[+1] */\
1308
        "movq " #lx ", " #t1 "                  \n\t" /* src[-1] */\
1309
        PAVGB(t0, lx)                                 /* (src[-1] + src[+1])/2 */\
1310
        PAVGB(sx, lx)                                 /* (src[-1] + 2src[0] + src[+1])/4 */\
1311
        PAVGB(lx, pplx)                                     \
1312
        "movq " #lx ", 8(%%"REG_c")             \n\t"\
1313
        "movq (%%"REG_c"), " #lx "              \n\t"\
1314
        "psubusb " #lx ", " #t1 "               \n\t"\
1315
        "psubusb " #lx ", " #t0 "               \n\t"\
1316
        "psubusb " #lx ", " #sx "               \n\t"\
1317
        "movq "MANGLE(b00)", " #lx "            \n\t"\
1318
        "pcmpeqb " #lx ", " #t1 "               \n\t" /* src[-1] > a ? 0 : -1*/\
1319
        "pcmpeqb " #lx ", " #t0 "               \n\t" /* src[+1] > a ? 0 : -1*/\
1320
        "pcmpeqb " #lx ", " #sx "               \n\t" /* src[0]  > a ? 0 : -1*/\
1321
        "paddb " #t1 ", " #t0 "                 \n\t"\
1322
        "paddb " #t0 ", " #sx "                 \n\t"\
1323 70c5ae87 Michael Niedermayer
\
1324 16e0bf73 Diego Biurrun
        PAVGB(plx, pplx)                              /* filtered */\
1325
        "movq " #dst ", " #t0 "                 \n\t" /* dst */\
1326
        "movq " #t0 ", " #t1 "                  \n\t" /* dst */\
1327
        "psubusb %3, " #t0 "                    \n\t"\
1328
        "paddusb %3, " #t1 "                    \n\t"\
1329
        PMAXUB(t0, pplx)\
1330
        PMINUB(t1, pplx, t0)\
1331
        "paddb " #sx ", " #ppsx "               \n\t"\
1332
        "paddb " #psx ", " #ppsx "              \n\t"\
1333
        "#paddb "MANGLE(b02)", " #ppsx "        \n\t"\
1334
        "pand "MANGLE(b08)", " #ppsx "          \n\t"\
1335
        "pcmpeqb " #lx ", " #ppsx "             \n\t"\
1336
        "pand " #ppsx ", " #pplx "              \n\t"\
1337
        "pandn " #dst ", " #ppsx "              \n\t"\
1338
        "por " #pplx ", " #ppsx "               \n\t"\
1339
        "movq " #ppsx ", " #dst "               \n\t"\
1340
        "movq 8(%%"REG_c"), " #lx "             \n\t"
1341 2e212618 Michael Niedermayer
1342 053dea12 Aurelien Jacobs
#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1343
   REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1)
1344 70c5ae87 Michael Niedermayer
/*
1345
0000000
1346
1111111
1347 e5c30e06 Michael Niedermayer

1348 70c5ae87 Michael Niedermayer
1111110
1349
1111101
1350
1111100
1351
1111011
1352
1111010
1353
1111001
1354 e5c30e06 Michael Niedermayer

1355 70c5ae87 Michael Niedermayer
1111000
1356
1110111
1357 e5c30e06 Michael Niedermayer

1358 70c5ae87 Michael Niedermayer
*/
1359 bb270c08 Diego Biurrun
//DERING_CORE(dst          ,src            ,ppsx ,psx  ,sx   ,pplx ,plx  ,lx   ,t0   ,t1)
1360
DERING_CORE((%%REGa)       ,(%%REGa, %1)   ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1361
DERING_CORE((%%REGa, %1)   ,(%%REGa, %1, 2),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1362
DERING_CORE((%%REGa, %1, 2),(%0, %1, 4)    ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1363
DERING_CORE((%0, %1, 4)    ,(%%REGd)       ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1364
DERING_CORE((%%REGd)       ,(%%REGd, %1)   ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1365
DERING_CORE((%%REGd, %1)   ,(%%REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1366
DERING_CORE((%%REGd, %1, 2),(%0, %1, 8)    ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1367
DERING_CORE((%0, %1, 8)    ,(%%REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1368
1369 16e0bf73 Diego Biurrun
        "1:                        \n\t"
1370 7cebed70 Reimar Döffinger
        : : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb), "m"(c->pQPb2)
1371 16e0bf73 Diego Biurrun
        : "%"REG_a, "%"REG_d, "%"REG_c
1372
    );
1373 ebc3209a Diego Biurrun
#else //HAVE_MMX2 || HAVE_AMD3DNOW
1374 16e0bf73 Diego Biurrun
    int y;
1375
    int min=255;
1376
    int max=0;
1377
    int avg;
1378
    uint8_t *p;
1379
    int s[10];
1380
    const int QP2= c->QP/2 + 1;
1381
1382
    for(y=1; y<9; y++){
1383
        int x;
1384
        p= src + stride*y;
1385
        for(x=1; x<9; x++){
1386
            p++;
1387
            if(*p > max) max= *p;
1388
            if(*p < min) min= *p;
1389 bb270c08 Diego Biurrun
        }
1390 16e0bf73 Diego Biurrun
    }
1391
    avg= (min + max + 1)>>1;
1392
1393
    if(max - min <deringThreshold) return;
1394
1395
    for(y=0; y<10; y++){
1396
        int t = 0;
1397
1398
        if(src[stride*y + 0] > avg) t+= 1;
1399
        if(src[stride*y + 1] > avg) t+= 2;
1400
        if(src[stride*y + 2] > avg) t+= 4;
1401
        if(src[stride*y + 3] > avg) t+= 8;
1402
        if(src[stride*y + 4] > avg) t+= 16;
1403
        if(src[stride*y + 5] > avg) t+= 32;
1404
        if(src[stride*y + 6] > avg) t+= 64;
1405
        if(src[stride*y + 7] > avg) t+= 128;
1406
        if(src[stride*y + 8] > avg) t+= 256;
1407
        if(src[stride*y + 9] > avg) t+= 512;
1408
1409
        t |= (~t)<<16;
1410
        t &= (t<<1) & (t>>1);
1411
        s[y] = t;
1412
    }
1413
1414
    for(y=1; y<9; y++){
1415
        int t = s[y-1] & s[y] & s[y+1];
1416
        t|= t>>16;
1417
        s[y-1]= t;
1418
    }
1419
1420
    for(y=1; y<9; y++){
1421
        int x;
1422
        int t = s[y-1];
1423 bb270c08 Diego Biurrun
1424 16e0bf73 Diego Biurrun
        p= src + stride*y;
1425
        for(x=1; x<9; x++){
1426
            p++;
1427
            if(t & (1<<x)){
1428
                int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1429
                      +2*(*(p     -1)) + 4*(*p         ) + 2*(*(p     +1))
1430
                      +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
1431
                f= (f + 8)>>4;
1432 2e212618 Michael Niedermayer
1433 cd38e322 Michael Niedermayer
#ifdef DEBUG_DERING_THRESHOLD
1434 be449fca Diego Pettenò
                    __asm__ volatile("emms\n\t":);
1435 16e0bf73 Diego Biurrun
                    {
1436
                    static long long numPixels=0;
1437
                    if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1438
//                    if((max-min)<20 || (max-min)*QP<200)
1439
//                    if((max-min)*QP < 500)
1440
//                    if(max-min<QP/2)
1441
                    if(max-min < 20){
1442 a94948d3 Diego Biurrun
                        static int numSkipped=0;
1443 16e0bf73 Diego Biurrun
                        static int errorSum=0;
1444
                        static int worstQP=0;
1445
                        static int worstRange=0;
1446
                        static int worstDiff=0;
1447
                        int diff= (f - *p);
1448
                        int absDiff= FFABS(diff);
1449
                        int error= diff*diff;
1450
1451
                        if(x==1 || x==8 || y==1 || y==8) continue;
1452
1453 a94948d3 Diego Biurrun
                        numSkipped++;
1454 16e0bf73 Diego Biurrun
                        if(absDiff > worstDiff){
1455
                            worstDiff= absDiff;
1456
                            worstQP= QP;
1457
                            worstRange= max-min;
1458 bb270c08 Diego Biurrun
                        }
1459 16e0bf73 Diego Biurrun
                        errorSum+= error;
1460
1461 a94948d3 Diego Biurrun
                        if(1024LL*1024LL*1024LL % numSkipped == 0){
1462 16e0bf73 Diego Biurrun
                            av_log(c, AV_LOG_INFO, "sum:%1.3f, skip:%d, wQP:%d, "
1463
                                   "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1464 a94948d3 Diego Biurrun
                                   (float)errorSum/numSkipped, numSkipped, worstQP, worstRange,
1465
                                   worstDiff, (float)numSkipped/numPixels);
1466 16e0bf73 Diego Biurrun
                        }
1467
                    }
1468
                    }
1469
#endif
1470
                    if     (*p + QP2 < f) *p= *p + QP2;
1471
                    else if(*p - QP2 > f) *p= *p - QP2;
1472
                    else *p=f;
1473
            }
1474 bb270c08 Diego Biurrun
        }
1475 16e0bf73 Diego Biurrun
    }
1476 cd38e322 Michael Niedermayer
#ifdef DEBUG_DERING_THRESHOLD
1477 16e0bf73 Diego Biurrun
    if(max-min < 20){
1478
        for(y=1; y<9; y++){
1479
            int x;
1480
            int t = 0;
1481
            p= src + stride*y;
1482
            for(x=1; x<9; x++){
1483
                p++;
1484
                *p = FFMIN(*p + 20, 255);
1485
            }
1486 bb270c08 Diego Biurrun
        }
1487 16e0bf73 Diego Biurrun
//        src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1488
    }
1489 cd38e322 Michael Niedermayer
#endif
1490 ebc3209a Diego Biurrun
#endif //HAVE_MMX2 || HAVE_AMD3DNOW
1491 3057fa66 Arpi
}
1492 b0ac780a Michael Niedermayer
#endif //HAVE_ALTIVEC
1493 3057fa66 Arpi
1494 3b58b885 Michael Niedermayer
/**
1495 b304569a Michael Niedermayer
 * Deinterlaces the given block by linearly interpolating every second line.
1496 7fb36f6c Michael Niedermayer
 * will be called for every 8x8 block and can read & write from line 4-15
1497 bd107136 Diego Biurrun
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1498 7fb36f6c Michael Niedermayer
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1499 3b58b885 Michael Niedermayer
 */
1500 cc9b0679 Michael Niedermayer
static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
1501 3b58b885 Michael Niedermayer
{
1502 ebc3209a Diego Biurrun
#if HAVE_MMX2 || HAVE_AMD3DNOW
1503 16e0bf73 Diego Biurrun
    src+= 4*stride;
1504 be449fca Diego Pettenò
    __asm__ volatile(
1505 16e0bf73 Diego Biurrun
        "lea (%0, %1), %%"REG_a"                \n\t"
1506
        "lea (%%"REG_a", %1, 4), %%"REG_c"      \n\t"
1507 bb270c08 Diego Biurrun
//      0       1       2       3       4       5       6       7       8       9
1508
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1 %0+8%1  ecx+4%1
1509
1510 16e0bf73 Diego Biurrun
        "movq (%0), %%mm0                       \n\t"
1511
        "movq (%%"REG_a", %1), %%mm1            \n\t"
1512
        PAVGB(%%mm1, %%mm0)
1513
        "movq %%mm0, (%%"REG_a")                \n\t"
1514
        "movq (%0, %1, 4), %%mm0                \n\t"
1515
        PAVGB(%%mm0, %%mm1)
1516
        "movq %%mm1, (%%"REG_a", %1, 2)         \n\t"
1517
        "movq (%%"REG_c", %1), %%mm1            \n\t"
1518
        PAVGB(%%mm1, %%mm0)
1519
        "movq %%mm0, (%%"REG_c")                \n\t"
1520
        "movq (%0, %1, 8), %%mm0                \n\t"
1521
        PAVGB(%%mm0, %%mm1)
1522
        "movq %%mm1, (%%"REG_c", %1, 2)         \n\t"
1523
1524 7cebed70 Reimar Döffinger
        : : "r" (src), "r" ((x86_reg)stride)
1525 16e0bf73 Diego Biurrun
        : "%"REG_a, "%"REG_c
1526
    );
1527 3b58b885 Michael Niedermayer
#else
1528 16e0bf73 Diego Biurrun
    int a, b, x;
1529
    src+= 4*stride;
1530
1531
    for(x=0; x<2; x++){
1532
        a= *(uint32_t*)&src[stride*0];
1533
        b= *(uint32_t*)&src[stride*2];
1534
        *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1535
        a= *(uint32_t*)&src[stride*4];
1536
        *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1537
        b= *(uint32_t*)&src[stride*6];
1538
        *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1539
        a= *(uint32_t*)&src[stride*8];
1540
        *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1541
        src += 4;
1542
    }
1543 3b58b885 Michael Niedermayer
#endif
1544
}
1545
1546
/**
1547 b304569a Michael Niedermayer
 * Deinterlaces the given block by cubic interpolating every second line.
1548 7fb36f6c Michael Niedermayer
 * will be called for every 8x8 block and can read & write from line 4-15
1549 bd107136 Diego Biurrun
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1550 7fb36f6c Michael Niedermayer
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1551
 * this filter will read lines 3-15 and write 7-13
1552 3b58b885 Michael Niedermayer
 */
1553 cc9b0679 Michael Niedermayer
static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
1554 3b58b885 Michael Niedermayer
{
1555 ebc3209a Diego Biurrun
#if HAVE_MMX2 || HAVE_AMD3DNOW
1556 16e0bf73 Diego Biurrun
    src+= stride*3;
1557 be449fca Diego Pettenò
    __asm__ volatile(
1558 16e0bf73 Diego Biurrun
        "lea (%0, %1), %%"REG_a"                \n\t"
1559
        "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1560
        "lea (%%"REG_d", %1, 4), %%"REG_c"      \n\t"
1561
        "add %1, %%"REG_c"                      \n\t"
1562
        "pxor %%mm7, %%mm7                      \n\t"
1563 bb270c08 Diego Biurrun
//      0       1       2       3       4       5       6       7       8       9       10
1564
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1 ecx
1565 3b58b885 Michael Niedermayer
1566 053dea12 Aurelien Jacobs
#define REAL_DEINT_CUBIC(a,b,c,d,e)\
1567 16e0bf73 Diego Biurrun
        "movq " #a ", %%mm0                     \n\t"\
1568
        "movq " #b ", %%mm1                     \n\t"\
1569
        "movq " #d ", %%mm2                     \n\t"\
1570
        "movq " #e ", %%mm3                     \n\t"\
1571
        PAVGB(%%mm2, %%mm1)                             /* (b+d) /2 */\
1572
        PAVGB(%%mm3, %%mm0)                             /* a(a+e) /2 */\
1573
        "movq %%mm0, %%mm2                      \n\t"\
1574
        "punpcklbw %%mm7, %%mm0                 \n\t"\
1575
        "punpckhbw %%mm7, %%mm2                 \n\t"\
1576
        "movq %%mm1, %%mm3                      \n\t"\
1577
        "punpcklbw %%mm7, %%mm1                 \n\t"\
1578
        "punpckhbw %%mm7, %%mm3                 \n\t"\
1579
        "psubw %%mm1, %%mm0                     \n\t"   /* L(a+e - (b+d))/2 */\
1580
        "psubw %%mm3, %%mm2                     \n\t"   /* H(a+e - (b+d))/2 */\
1581
        "psraw $3, %%mm0                        \n\t"   /* L(a+e - (b+d))/16 */\
1582
        "psraw $3, %%mm2                        \n\t"   /* H(a+e - (b+d))/16 */\
1583
        "psubw %%mm0, %%mm1                     \n\t"   /* L(9b + 9d - a - e)/16 */\
1584
        "psubw %%mm2, %%mm3                     \n\t"   /* H(9b + 9d - a - e)/16 */\
1585
        "packuswb %%mm3, %%mm1                  \n\t"\
1586
        "movq %%mm1, " #c "                     \n\t"
1587 053dea12 Aurelien Jacobs
#define DEINT_CUBIC(a,b,c,d,e)  REAL_DEINT_CUBIC(a,b,c,d,e)
1588 acced553 Michael Niedermayer
1589 bb270c08 Diego Biurrun
DEINT_CUBIC((%0)        , (%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd, %1))
1590
DEINT_CUBIC((%%REGa, %1), (%0, %1, 4) , (%%REGd)       , (%%REGd, %1), (%0, %1, 8))
1591
DEINT_CUBIC((%0, %1, 4) , (%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGc))
1592
DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc)    , (%%REGc, %1, 2))
1593 3b58b885 Michael Niedermayer
1594 7cebed70 Reimar Döffinger
        : : "r" (src), "r" ((x86_reg)stride)
1595 16e0bf73 Diego Biurrun
        : "%"REG_a, "%"REG_d, "%"REG_c
1596
    );
1597 ebc3209a Diego Biurrun
#else //HAVE_MMX2 || HAVE_AMD3DNOW
1598 16e0bf73 Diego Biurrun
    int x;
1599
    src+= stride*3;
1600
    for(x=0; x<8; x++){
1601
        src[stride*3] = CLIP((-src[0]        + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4);
1602
        src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4);
1603
        src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4);
1604
        src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
1605
        src++;
1606
    }
1607 ebc3209a Diego Biurrun
#endif //HAVE_MMX2 || HAVE_AMD3DNOW
1608 3b58b885 Michael Niedermayer
}
1609
1610
/**
1611 b304569a Michael Niedermayer
 * Deinterlaces the given block by filtering every second line with a (-1 4 2 4 -1) filter.
1612 7fb36f6c Michael Niedermayer
 * will be called for every 8x8 block and can read & write from line 4-15
1613 bd107136 Diego Biurrun
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1614 7fb36f6c Michael Niedermayer
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1615 9c9e467d Michael Niedermayer
 * this filter will read lines 4-13 and write 5-11
1616
 */
1617
static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
1618
{
1619 ebc3209a Diego Biurrun
#if HAVE_MMX2 || HAVE_AMD3DNOW
1620 16e0bf73 Diego Biurrun
    src+= stride*4;
1621 be449fca Diego Pettenò
    __asm__ volatile(
1622 16e0bf73 Diego Biurrun
        "lea (%0, %1), %%"REG_a"                \n\t"
1623
        "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1624
        "pxor %%mm7, %%mm7                      \n\t"
1625
        "movq (%2), %%mm0                       \n\t"
1626 bb270c08 Diego Biurrun
//      0       1       2       3       4       5       6       7       8       9       10
1627
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1 ecx
1628 9c9e467d Michael Niedermayer
1629 053dea12 Aurelien Jacobs
#define REAL_DEINT_FF(a,b,c,d)\
1630 16e0bf73 Diego Biurrun
        "movq " #a ", %%mm1                     \n\t"\
1631
        "movq " #b ", %%mm2                     \n\t"\
1632
        "movq " #c ", %%mm3                     \n\t"\
1633
        "movq " #d ", %%mm4                     \n\t"\
1634
        PAVGB(%%mm3, %%mm1)                          \
1635
        PAVGB(%%mm4, %%mm0)                          \
1636
        "movq %%mm0, %%mm3                      \n\t"\
1637
        "punpcklbw %%mm7, %%mm0                 \n\t"\
1638
        "punpckhbw %%mm7, %%mm3                 \n\t"\
1639
        "movq %%mm1, %%mm4                      \n\t"\
1640
        "punpcklbw %%mm7, %%mm1                 \n\t"\
1641
        "punpckhbw %%mm7, %%mm4                 \n\t"\
1642
        "psllw $2, %%mm1                        \n\t"\
1643
        "psllw $2, %%mm4                        \n\t"\
1644
        "psubw %%mm0, %%mm1                     \n\t"\
1645
        "psubw %%mm3, %%mm4                     \n\t"\
1646
        "movq %%mm2, %%mm5                      \n\t"\
1647
        "movq %%mm2, %%mm0                      \n\t"\
1648
        "punpcklbw %%mm7, %%mm2                 \n\t"\
1649
        "punpckhbw %%mm7, %%mm5                 \n\t"\
1650
        "paddw %%mm2, %%mm1                     \n\t"\
1651
        "paddw %%mm5, %%mm4                     \n\t"\
1652
        "psraw $2, %%mm1                        \n\t"\
1653
        "psraw $2, %%mm4                        \n\t"\
1654
        "packuswb %%mm4, %%mm1                  \n\t"\
1655
        "movq %%mm1, " #b "                     \n\t"\
1656 9c9e467d Michael Niedermayer
1657 053dea12 Aurelien Jacobs
#define DEINT_FF(a,b,c,d)  REAL_DEINT_FF(a,b,c,d)
1658
1659 bb270c08 Diego Biurrun
DEINT_FF((%0)        , (%%REGa)       , (%%REGa, %1), (%%REGa, %1, 2))
1660
DEINT_FF((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd)       )
1661
DEINT_FF((%0, %1, 4) , (%%REGd)       , (%%REGd, %1), (%%REGd, %1, 2))
1662
DEINT_FF((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))
1663 9c9e467d Michael Niedermayer
1664 16e0bf73 Diego Biurrun
        "movq %%mm0, (%2)                       \n\t"
1665 7cebed70 Reimar Döffinger
        : : "r" (src), "r" ((x86_reg)stride), "r"(tmp)
1666 16e0bf73 Diego Biurrun
        : "%"REG_a, "%"REG_d
1667
    );
1668 ebc3209a Diego Biurrun
#else //HAVE_MMX2 || HAVE_AMD3DNOW
1669 16e0bf73 Diego Biurrun
    int x;
1670
    src+= stride*4;
1671
    for(x=0; x<8; x++){
1672
        int t1= tmp[x];
1673
        int t2= src[stride*1];
1674
1675
        src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3);
1676
        t1= src[stride*4];
1677
        src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3);
1678
        t2= src[stride*6];
1679
        src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3);
1680
        t1= src[stride*8];
1681
        src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3);
1682
        tmp[x]= t1;
1683
1684
        src++;
1685
    }
1686 ebc3209a Diego Biurrun
#endif //HAVE_MMX2 || HAVE_AMD3DNOW
1687 9c9e467d Michael Niedermayer
}
1688
1689
/**
1690 134eb1e5 Michael Niedermayer
 * Deinterlaces the given block by filtering every line with a (-1 2 6 2 -1) filter.
1691
 * will be called for every 8x8 block and can read & write from line 4-15
1692 bd107136 Diego Biurrun
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1693 134eb1e5 Michael Niedermayer
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1694
 * this filter will read lines 4-13 and write 4-11
1695
 */
1696
static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
1697
{
1698 ebc3209a Diego Biurrun
#if HAVE_MMX2 || HAVE_AMD3DNOW
1699 16e0bf73 Diego Biurrun
    src+= stride*4;
1700 be449fca Diego Pettenò
    __asm__ volatile(
1701 16e0bf73 Diego Biurrun
        "lea (%0, %1), %%"REG_a"                \n\t"
1702
        "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1703
        "pxor %%mm7, %%mm7                      \n\t"
1704
        "movq (%2), %%mm0                       \n\t"
1705
        "movq (%3), %%mm1                       \n\t"
1706 bb270c08 Diego Biurrun
//      0       1       2       3       4       5       6       7       8       9       10
1707
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1 ecx
1708 134eb1e5 Michael Niedermayer
1709 053dea12 Aurelien Jacobs
#define REAL_DEINT_L5(t1,t2,a,b,c)\
1710 16e0bf73 Diego Biurrun
        "movq " #a ", %%mm2                     \n\t"\
1711
        "movq " #b ", %%mm3                     \n\t"\
1712
        "movq " #c ", %%mm4                     \n\t"\
1713
        PAVGB(t2, %%mm3)                             \
1714
        PAVGB(t1, %%mm4)                             \
1715
        "movq %%mm2, %%mm5                      \n\t"\
1716
        "movq %%mm2, " #t1 "                    \n\t"\
1717
        "punpcklbw %%mm7, %%mm2                 \n\t"\
1718
        "punpckhbw %%mm7, %%mm5                 \n\t"\
1719
        "movq %%mm2, %%mm6                      \n\t"\
1720
        "paddw %%mm2, %%mm2                     \n\t"\
1721
        "paddw %%mm6, %%mm2                     \n\t"\
1722
        "movq %%mm5, %%mm6                      \n\t"\
1723
        "paddw %%mm5, %%mm5                     \n\t"\
1724
        "paddw %%mm6, %%mm5                     \n\t"\
1725
        "movq %%mm3, %%mm6                      \n\t"\
1726
        "punpcklbw %%mm7, %%mm3                 \n\t"\
1727
        "punpckhbw %%mm7, %%mm6                 \n\t"\
1728
        "paddw %%mm3, %%mm3                     \n\t"\
1729
        "paddw %%mm6, %%mm6                     \n\t"\
1730
        "paddw %%mm3, %%mm2                     \n\t"\
1731
        "paddw %%mm6, %%mm5                     \n\t"\
1732
        "movq %%mm4, %%mm6                      \n\t"\
1733
        "punpcklbw %%mm7, %%mm4                 \n\t"\
1734
        "punpckhbw %%mm7, %%mm6                 \n\t"\
1735
        "psubw %%mm4, %%mm2                     \n\t"\
1736
        "psubw %%mm6, %%mm5                     \n\t"\
1737
        "psraw $2, %%mm2                        \n\t"\
1738
        "psraw $2, %%mm5                        \n\t"\
1739
        "packuswb %%mm5, %%mm2                  \n\t"\
1740
        "movq %%mm2, " #a "                     \n\t"\
1741 134eb1e5 Michael Niedermayer
1742 053dea12 Aurelien Jacobs
#define DEINT_L5(t1,t2,a,b,c)  REAL_DEINT_L5(t1,t2,a,b,c)
1743
1744
DEINT_L5(%%mm0, %%mm1, (%0)           , (%%REGa)       , (%%REGa, %1)   )
1745
DEINT_L5(%%mm1, %%mm0, (%%REGa)       , (%%REGa, %1)   , (%%REGa, %1, 2))
1746
DEINT_L5(%%mm0, %%mm1, (%%REGa, %1)   , (%%REGa, %1, 2), (%0, %1, 4)   )
1747
DEINT_L5(%%mm1, %%mm0, (%%REGa, %1, 2), (%0, %1, 4)    , (%%REGd)       )
1748 115329f1 Diego Biurrun
DEINT_L5(%%mm0, %%mm1, (%0, %1, 4)    , (%%REGd)       , (%%REGd, %1)   )
1749 053dea12 Aurelien Jacobs
DEINT_L5(%%mm1, %%mm0, (%%REGd)       , (%%REGd, %1)   , (%%REGd, %1, 2))
1750
DEINT_L5(%%mm0, %%mm1, (%%REGd, %1)   , (%%REGd, %1, 2), (%0, %1, 8)   )
1751
DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8)    , (%%REGd, %1, 4))
1752 134eb1e5 Michael Niedermayer
1753 16e0bf73 Diego Biurrun
        "movq %%mm0, (%2)                       \n\t"
1754
        "movq %%mm1, (%3)                       \n\t"
1755 7cebed70 Reimar Döffinger
        : : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2)
1756 16e0bf73 Diego Biurrun
        : "%"REG_a, "%"REG_d
1757
    );
1758 ebc3209a Diego Biurrun
#else //HAVE_MMX2 || HAVE_AMD3DNOW
1759 16e0bf73 Diego Biurrun
    int x;
1760
    src+= stride*4;
1761
    for(x=0; x<8; x++){
1762
        int t1= tmp[x];
1763
        int t2= tmp2[x];
1764
        int t3= src[0];
1765
1766
        src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3);
1767
        t1= src[stride*1];
1768
        src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3);
1769
        t2= src[stride*2];
1770
        src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3);
1771
        t3= src[stride*3];
1772
        src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3);
1773
        t1= src[stride*4];
1774
        src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3);
1775
        t2= src[stride*5];
1776
        src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3);
1777
        t3= src[stride*6];
1778
        src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3);
1779
        t1= src[stride*7];
1780
        src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3);
1781
1782
        tmp[x]= t3;
1783
        tmp2[x]= t1;
1784
1785
        src++;
1786
    }
1787 ebc3209a Diego Biurrun
#endif //HAVE_MMX2 || HAVE_AMD3DNOW
1788 134eb1e5 Michael Niedermayer
}
1789
1790
/**
1791 b304569a Michael Niedermayer
 * Deinterlaces the given block by filtering all lines with a (1 2 1) filter.
1792 9c9e467d Michael Niedermayer
 * will be called for every 8x8 block and can read & write from line 4-15
1793 bd107136 Diego Biurrun
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1794 9c9e467d Michael Niedermayer
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1795 7fb36f6c Michael Niedermayer
 * this filter will read lines 4-13 and write 4-11
1796 3b58b885 Michael Niedermayer
 */
1797 13ba9ae4 Michael Niedermayer
static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
1798 3b58b885 Michael Niedermayer
{
1799 ebc3209a Diego Biurrun
#if HAVE_MMX2 || HAVE_AMD3DNOW
1800 16e0bf73 Diego Biurrun
    src+= 4*stride;
1801 be449fca Diego Pettenò
    __asm__ volatile(
1802 16e0bf73 Diego Biurrun
        "lea (%0, %1), %%"REG_a"                \n\t"
1803
        "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1804 bb270c08 Diego Biurrun
//      0       1       2       3       4       5       6       7       8       9
1805
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
1806
1807 16e0bf73 Diego Biurrun
        "movq (%2), %%mm0                       \n\t" // L0
1808
        "movq (%%"REG_a"), %%mm1                \n\t" // L2
1809
        PAVGB(%%mm1, %%mm0)                           // L0+L2
1810
        "movq (%0), %%mm2                       \n\t" // L1
1811
        PAVGB(%%mm2, %%mm0)
1812
        "movq %%mm0, (%0)                       \n\t"
1813
        "movq (%%"REG_a", %1), %%mm0            \n\t" // L3
1814
        PAVGB(%%mm0, %%mm2)                           // L1+L3
1815
        PAVGB(%%mm1, %%mm2)                           // 2L2 + L1 + L3
1816
        "movq %%mm2, (%%"REG_a")                \n\t"
1817
        "movq (%%"REG_a", %1, 2), %%mm2         \n\t" // L4
1818
        PAVGB(%%mm2, %%mm1)                           // L2+L4
1819
        PAVGB(%%mm0, %%mm1)                           // 2L3 + L2 + L4
1820
        "movq %%mm1, (%%"REG_a", %1)            \n\t"
1821
        "movq (%0, %1, 4), %%mm1                \n\t" // L5
1822
        PAVGB(%%mm1, %%mm0)                           // L3+L5
1823
        PAVGB(%%mm2, %%mm0)                           // 2L4 + L3 + L5
1824
        "movq %%mm0, (%%"REG_a", %1, 2)         \n\t"
1825
        "movq (%%"REG_d"), %%mm0                \n\t" // L6
1826
        PAVGB(%%mm0, %%mm2)                           // L4+L6
1827
        PAVGB(%%mm1, %%mm2)                           // 2L5 + L4 + L6
1828
        "movq %%mm2, (%0, %1, 4)                \n\t"
1829
        "movq (%%"REG_d", %1), %%mm2            \n\t" // L7
1830
        PAVGB(%%mm2, %%mm1)                           // L5+L7
1831
        PAVGB(%%mm0, %%mm1)                           // 2L6 + L5 + L7
1832
        "movq %%mm1, (%%"REG_d")                \n\t"
1833
        "movq (%%"REG_d", %1, 2), %%mm1         \n\t" // L8
1834
        PAVGB(%%mm1, %%mm0)                           // L6+L8
1835
        PAVGB(%%mm2, %%mm0)                           // 2L7 + L6 + L8
1836
        "movq %%mm0, (%%"REG_d", %1)            \n\t"
1837
        "movq (%0, %1, 8), %%mm0                \n\t" // L9
1838
        PAVGB(%%mm0, %%mm2)                           // L7+L9
1839
        PAVGB(%%mm1, %%mm2)                           // 2L8 + L7 + L9
1840
        "movq %%mm2, (%%"REG_d", %1, 2)         \n\t"
1841
        "movq %%mm1, (%2)                       \n\t"
1842
1843 7cebed70 Reimar Döffinger
        : : "r" (src), "r" ((x86_reg)stride), "r" (tmp)
1844 16e0bf73 Diego Biurrun
        : "%"REG_a, "%"REG_d
1845
    );
1846 ebc3209a Diego Biurrun
#else //HAVE_MMX2 || HAVE_AMD3DNOW
1847 16e0bf73 Diego Biurrun
    int a, b, c, x;
1848
    src+= 4*stride;
1849
1850
    for(x=0; x<2; x++){
1851
        a= *(uint32_t*)&tmp[stride*0];
1852
        b= *(uint32_t*)&src[stride*0];
1853
        c= *(uint32_t*)&src[stride*1];
1854
        a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1855
        *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1856
1857
        a= *(uint32_t*)&src[stride*2];
1858
        b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1859
        *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1860
1861
        b= *(uint32_t*)&src[stride*3];
1862
        c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1863
        *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1864
1865
        c= *(uint32_t*)&src[stride*4];
1866
        a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1867
        *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1868
1869
        a= *(uint32_t*)&src[stride*5];
1870
        b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1871
        *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1872
1873
        b= *(uint32_t*)&src[stride*6];
1874
        c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1875
        *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1876
1877
        c= *(uint32_t*)&src[stride*7];
1878
        a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1879
        *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1880
1881
        a= *(uint32_t*)&src[stride*8];
1882
        b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1883
        *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1884
1885
        *(uint32_t*)&tmp[stride*0]= c;
1886
        src += 4;
1887
        tmp += 4;
1888
    }
1889 ebc3209a Diego Biurrun
#endif //HAVE_MMX2 || HAVE_AMD3DNOW
1890 3b58b885 Michael Niedermayer
}
1891
1892
/**
1893 b304569a Michael Niedermayer
 * Deinterlaces the given block by applying a median filter to every second line.
1894 7fb36f6c Michael Niedermayer
 * will be called for every 8x8 block and can read & write from line 4-15,
1895 bd107136 Diego Biurrun
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1896 7fb36f6c Michael Niedermayer
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1897 3b58b885 Michael Niedermayer
 */
1898 cc9b0679 Michael Niedermayer
static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
1899 3b58b885 Michael Niedermayer
{
1900 ff5d91d9 Carl Eugen Hoyos
#if HAVE_MMX
1901 16e0bf73 Diego Biurrun
    src+= 4*stride;
1902 b250f9c6 Aurelien Jacobs
#if HAVE_MMX2
1903 be449fca Diego Pettenò
    __asm__ volatile(
1904 16e0bf73 Diego Biurrun
        "lea (%0, %1), %%"REG_a"                \n\t"
1905
        "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1906 bb270c08 Diego Biurrun
//      0       1       2       3       4       5       6       7       8       9
1907
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
1908
1909 16e0bf73 Diego Biurrun
        "movq (%0), %%mm0                       \n\t" //
1910
        "movq (%%"REG_a", %1), %%mm2            \n\t" //
1911
        "movq (%%"REG_a"), %%mm1                \n\t" //
1912
        "movq %%mm0, %%mm3                      \n\t"
1913
        "pmaxub %%mm1, %%mm0                    \n\t" //
1914
        "pminub %%mm3, %%mm1                    \n\t" //
1915
        "pmaxub %%mm2, %%mm1                    \n\t" //
1916
        "pminub %%mm1, %%mm0                    \n\t"
1917
        "movq %%mm0, (%%"REG_a")                \n\t"
1918
1919
        "movq (%0, %1, 4), %%mm0                \n\t" //
1920
        "movq (%%"REG_a", %1, 2), %%mm1         \n\t" //
1921
        "movq %%mm2, %%mm3                      \n\t"
1922
        "pmaxub %%mm1, %%mm2                    \n\t" //
1923
        "pminub %%mm3, %%mm1                    \n\t" //
1924
        "pmaxub %%mm0, %%mm1                    \n\t" //
1925
        "pminub %%mm1, %%mm2                    \n\t"
1926
        "movq %%mm2, (%%"REG_a", %1, 2)         \n\t"
1927
1928
        "movq (%%"REG_d"), %%mm2                \n\t" //
1929
        "movq (%%"REG_d", %1), %%mm1            \n\t" //
1930
        "movq %%mm2, %%mm3                      \n\t"
1931
        "pmaxub %%mm0, %%mm2                    \n\t" //
1932
        "pminub %%mm3, %%mm0                    \n\t" //
1933
        "pmaxub %%mm1, %%mm0                    \n\t" //
1934
        "pminub %%mm0, %%mm2                    \n\t"
1935
        "movq %%mm2, (%%"REG_d")                \n\t"
1936
1937
        "movq (%%"REG_d", %1, 2), %%mm2         \n\t" //
1938
        "movq (%0, %1, 8), %%mm0                \n\t" //
1939
        "movq %%mm2, %%mm3                      \n\t"
1940
        "pmaxub %%mm0, %%mm2                    \n\t" //
1941
        "pminub %%mm3, %%mm0                    \n\t" //
1942
        "pmaxub %%mm1, %%mm0                    \n\t" //
1943
        "pminub %%mm0, %%mm2                    \n\t"
1944
        "movq %%mm2, (%%"REG_d", %1, 2)         \n\t"
1945
1946
1947 7cebed70 Reimar Döffinger
        : : "r" (src), "r" ((x86_reg)stride)
1948 16e0bf73 Diego Biurrun
        : "%"REG_a, "%"REG_d
1949
    );
1950 a6be8111 Michael Niedermayer
1951
#else // MMX without MMX2
1952 be449fca Diego Pettenò
    __asm__ volatile(
1953 16e0bf73 Diego Biurrun
        "lea (%0, %1), %%"REG_a"                \n\t"
1954
        "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1955 bb270c08 Diego Biurrun
//      0       1       2       3       4       5       6       7       8       9
1956
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
1957 16e0bf73 Diego Biurrun
        "pxor %%mm7, %%mm7                      \n\t"
1958 a6be8111 Michael Niedermayer
1959 053dea12 Aurelien Jacobs
#define REAL_MEDIAN(a,b,c)\
1960 16e0bf73 Diego Biurrun
        "movq " #a ", %%mm0                     \n\t"\
1961
        "movq " #b ", %%mm2                     \n\t"\
1962
        "movq " #c ", %%mm1                     \n\t"\
1963
        "movq %%mm0, %%mm3                      \n\t"\
1964
        "movq %%mm1, %%mm4                      \n\t"\
1965
        "movq %%mm2, %%mm5                      \n\t"\
1966
        "psubusb %%mm1, %%mm3                   \n\t"\
1967
        "psubusb %%mm2, %%mm4                   \n\t"\
1968
        "psubusb %%mm0, %%mm5                   \n\t"\
1969
        "pcmpeqb %%mm7, %%mm3                   \n\t"\
1970
        "pcmpeqb %%mm7, %%mm4                   \n\t"\
1971
        "pcmpeqb %%mm7, %%mm5                   \n\t"\
1972
        "movq %%mm3, %%mm6                      \n\t"\
1973
        "pxor %%mm4, %%mm3                      \n\t"\
1974
        "pxor %%mm5, %%mm4                      \n\t"\
1975
        "pxor %%mm6, %%mm5                      \n\t"\
1976
        "por %%mm3, %%mm1                       \n\t"\
1977
        "por %%mm4, %%mm2                       \n\t"\
1978
        "por %%mm5, %%mm0                       \n\t"\
1979
        "pand %%mm2, %%mm0                      \n\t"\
1980
        "pand %%mm1, %%mm0                      \n\t"\
1981
        "movq %%mm0, " #b "                     \n\t"
1982 053dea12 Aurelien Jacobs
#define MEDIAN(a,b,c)  REAL_MEDIAN(a,b,c)
1983 a6be8111 Michael Niedermayer
1984 bb270c08 Diego Biurrun
MEDIAN((%0)        , (%%REGa)       , (%%REGa, %1))
1985 053dea12 Aurelien Jacobs
MEDIAN((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4))
1986 bb270c08 Diego Biurrun
MEDIAN((%0, %1, 4) , (%%REGd)       , (%%REGd, %1))
1987 053dea12 Aurelien Jacobs
MEDIAN((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8))
1988 a6be8111 Michael Niedermayer
1989 7cebed70 Reimar Döffinger
        : : "r" (src), "r" ((x86_reg)stride)
1990 16e0bf73 Diego Biurrun
        : "%"REG_a, "%"REG_d
1991
    );
1992 64c968da Diego Biurrun
#endif //HAVE_MMX2
1993
#else //HAVE_MMX
1994 16e0bf73 Diego Biurrun
    int x, y;
1995
    src+= 4*stride;
1996
    // FIXME - there should be a way to do a few columns in parallel like w/mmx
1997
    for(x=0; x<8; x++){
1998
        uint8_t *colsrc = src;
1999
        for (y=0; y<4; y++){
2000
            int a, b, c, d, e, f;
2001
            a = colsrc[0       ];
2002
            b = colsrc[stride  ];
2003
            c = colsrc[stride*2];
2004
            d = (a-b)>>31;
2005
            e = (b-c)>>31;
2006
            f = (c-a)>>31;
2007
            colsrc[stride  ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
2008
            colsrc += stride*2;
2009 bb270c08 Diego Biurrun
        }
2010 16e0bf73 Diego Biurrun
        src++;
2011
    }
2012 64c968da Diego Biurrun
#endif //HAVE_MMX
2013 3b58b885 Michael Niedermayer
}
2014
2015 b250f9c6 Aurelien Jacobs
#if HAVE_MMX
2016 4e4dcbc5 Michael Niedermayer
/**
2017
 * transposes and shift the given 8x8 Block into dst1 and dst2
2018
 */
2019 cc9b0679 Michael Niedermayer
static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
2020 4e4dcbc5 Michael Niedermayer
{
2021 be449fca Diego Pettenò
    __asm__(
2022 16e0bf73 Diego Biurrun
        "lea (%0, %1), %%"REG_a"                \n\t"
2023 bb270c08 Diego Biurrun
//      0       1       2       3       4       5       6       7       8       9
2024
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
2025 16e0bf73 Diego Biurrun
        "movq (%0), %%mm0                       \n\t" // 12345678
2026
        "movq (%%"REG_a"), %%mm1                \n\t" // abcdefgh
2027
        "movq %%mm0, %%mm2                      \n\t" // 12345678
2028
        "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
2029
        "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
2030
2031
        "movq (%%"REG_a", %1), %%mm1            \n\t"
2032
        "movq (%%"REG_a", %1, 2), %%mm3         \n\t"
2033
        "movq %%mm1, %%mm4                      \n\t"
2034
        "punpcklbw %%mm3, %%mm1                 \n\t"
2035
        "punpckhbw %%mm3, %%mm4                 \n\t"
2036
2037
        "movq %%mm0, %%mm3                      \n\t"
2038
        "punpcklwd %%mm1, %%mm0                 \n\t"
2039
        "punpckhwd %%mm1, %%mm3                 \n\t"
2040
        "movq %%mm2, %%mm1                      \n\t"
2041
        "punpcklwd %%mm4, %%mm2                 \n\t"
2042
        "punpckhwd %%mm4, %%mm1                 \n\t"
2043
2044
        "movd %%mm0, 128(%2)                    \n\t"
2045
        "psrlq $32, %%mm0                       \n\t"
2046
        "movd %%mm0, 144(%2)                    \n\t"
2047
        "movd %%mm3, 160(%2)                    \n\t"
2048
        "psrlq $32, %%mm3                       \n\t"
2049
        "movd %%mm3, 176(%2)                    \n\t"
2050
        "movd %%mm3, 48(%3)                     \n\t"
2051
        "movd %%mm2, 192(%2)                    \n\t"
2052
        "movd %%mm2, 64(%3)                     \n\t"
2053
        "psrlq $32, %%mm2                       \n\t"
2054
        "movd %%mm2, 80(%3)                     \n\t"
2055
        "movd %%mm1, 96(%3)                     \n\t"
2056
        "psrlq $32, %%mm1                       \n\t"
2057
        "movd %%mm1, 112(%3)                    \n\t"
2058
2059
        "lea (%%"REG_a", %1, 4), %%"REG_a"      \n\t"
2060
2061
        "movq (%0, %1, 4), %%mm0                \n\t" // 12345678
2062
        "movq (%%"REG_a"), %%mm1                \n\t" // abcdefgh
2063
        "movq %%mm0, %%mm2                      \n\t" // 12345678
2064
        "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
2065
        "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
2066
2067
        "movq (%%"REG_a", %1), %%mm1            \n\t"
2068
        "movq (%%"REG_a", %1, 2), %%mm3         \n\t"
2069
        "movq %%mm1, %%mm4                      \n\t"
2070
        "punpcklbw %%mm3, %%mm1                 \n\t"
2071
        "punpckhbw %%mm3, %%mm4                 \n\t"
2072
2073
        "movq %%mm0, %%mm3                      \n\t"
2074
        "punpcklwd %%mm1, %%mm0                 \n\t"
2075
        "punpckhwd %%mm1, %%mm3                 \n\t"
2076
        "movq %%mm2, %%mm1                      \n\t"
2077
        "punpcklwd %%mm4, %%mm2                 \n\t"
2078
        "punpckhwd %%mm4, %%mm1                 \n\t"
2079
2080
        "movd %%mm0, 132(%2)                    \n\t"
2081
        "psrlq $32, %%mm0                       \n\t"
2082
        "movd %%mm0, 148(%2)                    \n\t"
2083
        "movd %%mm3, 164(%2)                    \n\t"
2084
        "psrlq $32, %%mm3                       \n\t"
2085
        "movd %%mm3, 180(%2)                    \n\t"
2086
        "movd %%mm3, 52(%3)                     \n\t"
2087
        "movd %%mm2, 196(%2)                    \n\t"
2088
        "movd %%mm2, 68(%3)                     \n\t"
2089
        "psrlq $32, %%mm2                       \n\t"
2090
        "movd %%mm2, 84(%3)                     \n\t"
2091
        "movd %%mm1, 100(%3)                    \n\t"
2092
        "psrlq $32, %%mm1                       \n\t"
2093
        "movd %%mm1, 116(%3)                    \n\t"
2094 bb270c08 Diego Biurrun
2095
2096 7cebed70 Reimar Döffinger
        :: "r" (src), "r" ((x86_reg)srcStride), "r" (dst1), "r" (dst2)
2097 bb270c08 Diego Biurrun
        : "%"REG_a
2098 16e0bf73 Diego Biurrun
    );
2099 4e4dcbc5 Michael Niedermayer
}
2100
2101
/**
2102
 * transposes the given 8x8 block
2103
 */
2104 cc9b0679 Michael Niedermayer
static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
2105 4e4dcbc5 Michael Niedermayer
{
2106 be449fca Diego Pettenò
    __asm__(
2107 16e0bf73 Diego Biurrun
        "lea (%0, %1), %%"REG_a"                \n\t"
2108
        "lea (%%"REG_a",%1,4), %%"REG_d"        \n\t"
2109 bb270c08 Diego Biurrun
//      0       1       2       3       4       5       6       7       8       9
2110
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
2111 16e0bf73 Diego Biurrun
        "movq (%2), %%mm0                       \n\t" // 12345678
2112
        "movq 16(%2), %%mm1                     \n\t" // abcdefgh
2113
        "movq %%mm0, %%mm2                      \n\t" // 12345678
2114
        "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
2115
        "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
2116
2117
        "movq 32(%2), %%mm1                     \n\t"
2118
        "movq 48(%2), %%mm3                     \n\t"
2119
        "movq %%mm1, %%mm4                      \n\t"
2120
        "punpcklbw %%mm3, %%mm1                 \n\t"
2121
        "punpckhbw %%mm3, %%mm4                 \n\t"
2122
2123
        "movq %%mm0, %%mm3                      \n\t"
2124
        "punpcklwd %%mm1, %%mm0                 \n\t"
2125
        "punpckhwd %%mm1, %%mm3                 \n\t"
2126
        "movq %%mm2, %%mm1                      \n\t"
2127
        "punpcklwd %%mm4, %%mm2                 \n\t"
2128
        "punpckhwd %%mm4, %%mm1                 \n\t"
2129
2130
        "movd %%mm0, (%0)                       \n\t"
2131
        "psrlq $32, %%mm0                       \n\t"
2132
        "movd %%mm0, (%%"REG_a")                \n\t"
2133
        "movd %%mm3, (%%"REG_a", %1)            \n\t"
2134
        "psrlq $32, %%mm3                       \n\t"
2135
        "movd %%mm3, (%%"REG_a", %1, 2)         \n\t"
2136
        "movd %%mm2, (%0, %1, 4)                \n\t"
2137
        "psrlq $32, %%mm2                       \n\t"
2138
        "movd %%mm2, (%%"REG_d")                \n\t"
2139
        "movd %%mm1, (%%"REG_d", %1)            \n\t"
2140
        "psrlq $32, %%mm1                       \n\t"
2141
        "movd %%mm1, (%%"REG_d", %1, 2)         \n\t"
2142
2143
2144
        "movq 64(%2), %%mm0                     \n\t" // 12345678
2145
        "movq 80(%2), %%mm1                     \n\t" // abcdefgh
2146
        "movq %%mm0, %%mm2                      \n\t" // 12345678
2147
        "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
2148
        "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
2149
2150
        "movq 96(%2), %%mm1                     \n\t"
2151
        "movq 112(%2), %%mm3                    \n\t"
2152
        "movq %%mm1, %%mm4                      \n\t"
2153
        "punpcklbw %%mm3, %%mm1                 \n\t"
2154
        "punpckhbw %%mm3, %%mm4                 \n\t"
2155
2156
        "movq %%mm0, %%mm3                      \n\t"
2157
        "punpcklwd %%mm1, %%mm0                 \n\t"
2158
        "punpckhwd %%mm1, %%mm3                 \n\t"
2159
        "movq %%mm2, %%mm1                      \n\t"
2160
        "punpcklwd %%mm4, %%mm2                 \n\t"
2161
        "punpckhwd %%mm4, %%mm1                 \n\t"
2162
2163
        "movd %%mm0, 4(%0)                      \n\t"
2164
        "psrlq $32, %%mm0                       \n\t"
2165
        "movd %%mm0, 4(%%"REG_a")               \n\t"
2166
        "movd %%mm3, 4(%%"REG_a", %1)           \n\t"
2167
        "psrlq $32, %%mm3                       \n\t"
2168
        "movd %%mm3, 4(%%"REG_a", %1, 2)        \n\t"
2169
        "movd %%mm2, 4(%0, %1, 4)               \n\t"
2170
        "psrlq $32, %%mm2                       \n\t"
2171
        "movd %%mm2, 4(%%"REG_d")               \n\t"
2172
        "movd %%mm1, 4(%%"REG_d", %1)           \n\t"
2173
        "psrlq $32, %%mm1                       \n\t"
2174
        "movd %%mm1, 4(%%"REG_d", %1, 2)        \n\t"
2175 bb270c08 Diego Biurrun
2176 7cebed70 Reimar Döffinger
        :: "r" (dst), "r" ((x86_reg)dstStride), "r" (src)
2177 bb270c08 Diego Biurrun
        : "%"REG_a, "%"REG_d
2178 16e0bf73 Diego Biurrun
    );
2179 4e4dcbc5 Michael Niedermayer
}
2180 64c968da Diego Biurrun
#endif //HAVE_MMX
2181 053dea12 Aurelien Jacobs
//static long test=0;
2182 4e4dcbc5 Michael Niedermayer
2183 b250f9c6 Aurelien Jacobs
#if !HAVE_ALTIVEC
2184 a2596758 Michael Niedermayer
static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
2185 aa089f6c Diego Biurrun
                                    uint8_t *tempBlurred, uint32_t *tempBlurredPast, int *maxNoise)
2186 117e45b0 Michael Niedermayer
{
2187 16e0bf73 Diego Biurrun
    // to save a register (FIXME do this outside of the loops)
2188 aa089f6c Diego Biurrun
    tempBlurredPast[127]= maxNoise[0];
2189
    tempBlurredPast[128]= maxNoise[1];
2190
    tempBlurredPast[129]= maxNoise[2];
2191 115329f1 Diego Biurrun
2192 be44a4d7 Michael Niedermayer
#define FAST_L2_DIFF
2193
//#define L1_DIFF //u should change the thresholds too if u try that one
2194 ebc3209a Diego Biurrun
#if HAVE_MMX2 || HAVE_AMD3DNOW
2195 be449fca Diego Pettenò
    __asm__ volatile(
2196 16e0bf73 Diego Biurrun
        "lea (%2, %2, 2), %%"REG_a"             \n\t" // 3*stride
2197
        "lea (%2, %2, 4), %%"REG_d"             \n\t" // 5*stride
2198
        "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
2199 bb270c08 Diego Biurrun
//      0       1       2       3       4       5       6       7       8       9
2200
//      %x      %x+%2   %x+2%2  %x+eax  %x+4%2  %x+edx  %x+2eax %x+ecx  %x+8%2
2201 be44a4d7 Michael Niedermayer
//FIXME reorder?
2202
#ifdef L1_DIFF //needs mmx2
2203 16e0bf73 Diego Biurrun
        "movq (%0), %%mm0                       \n\t" // L0
2204
        "psadbw (%1), %%mm0                     \n\t" // |L0-R0|
2205
        "movq (%0, %2), %%mm1                   \n\t" // L1
2206
        "psadbw (%1, %2), %%mm1                 \n\t" // |L1-R1|
2207
        "movq (%0, %2, 2), %%mm2                \n\t" // L2
2208
        "psadbw (%1, %2, 2), %%mm2              \n\t" // |L2-R2|
2209
        "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
2210
        "psadbw (%1, %%"REG_a"), %%mm3          \n\t" // |L3-R3|
2211
2212
        "movq (%0, %2, 4), %%mm4                \n\t" // L4
2213
        "paddw %%mm1, %%mm0                     \n\t"
2214
        "psadbw (%1, %2, 4), %%mm4              \n\t" // |L4-R4|
2215
        "movq (%0, %%"REG_d"), %%mm5            \n\t" // L5
2216
        "paddw %%mm2, %%mm0                     \n\t"
2217
        "psadbw (%1, %%"REG_d"), %%mm5          \n\t" // |L5-R5|
2218
        "movq (%0, %%"REG_a", 2), %%mm6         \n\t" // L6
2219
        "paddw %%mm3, %%mm0                     \n\t"
2220
        "psadbw (%1, %%"REG_a", 2), %%mm6       \n\t" // |L6-R6|
2221
        "movq (%0, %%"REG_c"), %%mm7            \n\t" // L7
2222
        "paddw %%mm4, %%mm0                     \n\t"
2223
        "psadbw (%1, %%"REG_c"), %%mm7          \n\t" // |L7-R7|
2224
        "paddw %%mm5, %%mm6                     \n\t"
2225
        "paddw %%mm7, %%mm6                     \n\t"
2226
        "paddw %%mm6, %%mm0                     \n\t"
2227 64c968da Diego Biurrun
#else //L1_DIFF
2228 053dea12 Aurelien Jacobs
#if defined (FAST_L2_DIFF)
2229 16e0bf73 Diego Biurrun
        "pcmpeqb %%mm7, %%mm7                   \n\t"
2230
        "movq "MANGLE(b80)", %%mm6              \n\t"
2231
        "pxor %%mm0, %%mm0                      \n\t"
2232 053dea12 Aurelien Jacobs
#define REAL_L2_DIFF_CORE(a, b)\
2233 16e0bf73 Diego Biurrun
        "movq " #a ", %%mm5                     \n\t"\
2234
        "movq " #b ", %%mm2                     \n\t"\
2235
        "pxor %%mm7, %%mm2                      \n\t"\
2236
        PAVGB(%%mm2, %%mm5)\
2237
        "paddb %%mm6, %%mm5                     \n\t"\
2238
        "movq %%mm5, %%mm2                      \n\t"\
2239
        "psllw $8, %%mm5                        \n\t"\
2240
        "pmaddwd %%mm5, %%mm5                   \n\t"\
2241
        "pmaddwd %%mm2, %%mm2                   \n\t"\
2242
        "paddd %%mm2, %%mm5                     \n\t"\
2243
        "psrld $14, %%mm5                       \n\t"\
2244
        "paddd %%mm5, %%mm0                     \n\t"
2245 be44a4d7 Michael Niedermayer
2246 64c968da Diego Biurrun
#else //defined (FAST_L2_DIFF)
2247 16e0bf73 Diego Biurrun
        "pxor %%mm7, %%mm7                      \n\t"
2248
        "pxor %%mm0, %%mm0                      \n\t"
2249 053dea12 Aurelien Jacobs
#define REAL_L2_DIFF_CORE(a, b)\
2250 16e0bf73 Diego Biurrun
        "movq " #a ", %%mm5                     \n\t"\
2251
        "movq " #b ", %%mm2                     \n\t"\
2252
        "movq %%mm5, %%mm1                      \n\t"\
2253
        "movq %%mm2, %%mm3                      \n\t"\
2254
        "punpcklbw %%mm7, %%mm5                 \n\t"\
2255
        "punpckhbw %%mm7, %%mm1                 \n\t"\
2256
        "punpcklbw %%mm7, %%mm2                 \n\t"\
2257
        "punpckhbw %%mm7, %%mm3                 \n\t"\
2258
        "psubw %%mm2, %%mm5                     \n\t"\
2259
        "psubw %%mm3, %%mm1                     \n\t"\
2260
        "pmaddwd %%mm5, %%mm5                   \n\t"\
2261
        "pmaddwd %%mm1, %%mm1                   \n\t"\
2262
        "paddd %%mm1, %%mm5                     \n\t"\
2263
        "paddd %%mm5, %%mm0                     \n\t"
2264 be44a4d7 Michael Niedermayer
2265 64c968da Diego Biurrun
#endif //defined (FAST_L2_DIFF)
2266 053dea12 Aurelien Jacobs
2267
#define L2_DIFF_CORE(a, b)  REAL_L2_DIFF_CORE(a, b)
2268
2269 bb270c08 Diego Biurrun
L2_DIFF_CORE((%0)          , (%1))
2270
L2_DIFF_CORE((%0, %2)      , (%1, %2))
2271
L2_DIFF_CORE((%0, %2, 2)   , (%1, %2, 2))
2272
L2_DIFF_CORE((%0, %%REGa)  , (%1, %%REGa))
2273
L2_DIFF_CORE((%0, %2, 4)   , (%1, %2, 4))
2274
L2_DIFF_CORE((%0, %%REGd)  , (%1, %%REGd))
2275 053dea12 Aurelien Jacobs
L2_DIFF_CORE((%0, %%REGa,2), (%1, %%REGa,2))
2276 bb270c08 Diego Biurrun
L2_DIFF_CORE((%0, %%REGc)  , (%1, %%REGc))
2277 be44a4d7 Michael Niedermayer
2278 64c968da Diego Biurrun
#endif //L1_DIFF
2279 be44a4d7 Michael Niedermayer
2280 16e0bf73 Diego Biurrun
        "movq %%mm0, %%mm4                      \n\t"
2281
        "psrlq $32, %%mm0                       \n\t"
2282
        "paddd %%mm0, %%mm4                     \n\t"
2283
        "movd %%mm4, %%ecx                      \n\t"
2284
        "shll $2, %%ecx                         \n\t"
2285
        "mov %3, %%"REG_d"                      \n\t"
2286
        "addl -4(%%"REG_d"), %%ecx              \n\t"
2287
        "addl 4(%%"REG_d"), %%ecx               \n\t"
2288
        "addl -1024(%%"REG_d"), %%ecx           \n\t"
2289
        "addl $4, %%ecx                         \n\t"
2290
        "addl 1024(%%"REG_d"), %%ecx            \n\t"
2291
        "shrl $3, %%ecx                         \n\t"
2292
        "movl %%ecx, (%%"REG_d")                \n\t"
2293
2294
//        "mov %3, %%"REG_c"                      \n\t"
2295
//        "mov %%"REG_c", test                    \n\t"
2296
//        "jmp 4f                                 \n\t"
2297
        "cmpl 512(%%"REG_d"), %%ecx             \n\t"
2298
        " jb 2f                                 \n\t"
2299
        "cmpl 516(%%"REG_d"), %%ecx             \n\t"
2300
        " jb 1f                                 \n\t"
2301
2302
        "lea (%%"REG_a", %2, 2), %%"REG_d"      \n\t" // 5*stride
2303
        "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
2304
        "movq (%0), %%mm0                       \n\t" // L0
2305
        "movq (%0, %2), %%mm1                   \n\t" // L1
2306
        "movq (%0, %2, 2), %%mm2                \n\t" // L2
2307
        "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
2308
        "movq (%0, %2, 4), %%mm4                \n\t" // L4
2309
        "movq (%0, %%"REG_d"), %%mm5            \n\t" // L5
2310
        "movq (%0, %%"REG_a", 2), %%mm6         \n\t" // L6
2311
        "movq (%0, %%"REG_c"), %%mm7            \n\t" // L7
2312
        "movq %%mm0, (%1)                       \n\t" // L0
2313
        "movq %%mm1, (%1, %2)                   \n\t" // L1
2314
        "movq %%mm2, (%1, %2, 2)                \n\t" // L2
2315
        "movq %%mm3, (%1, %%"REG_a")            \n\t&qu